Index: head/contrib/llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- head/contrib/llvm/include/llvm/CodeGen/TargetLowering.h (revision 344055) +++ head/contrib/llvm/include/llvm/CodeGen/TargetLowering.h (revision 344056) @@ -1,3700 +1,3704 @@ //===- llvm/CodeGen/TargetLowering.h - Target Lowering Info -----*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// /// /// \file /// This file describes how to lower LLVM code to machine code. This has two /// main components: /// /// 1. Which ValueTypes are natively supported by the target. /// 2. Which operations are supported for supported ValueTypes. /// 3. Cost thresholds for alternative implementations of certain operations. /// /// In addition it has a few other components, like information about FP /// immediates. /// //===----------------------------------------------------------------------===// #ifndef LLVM_CODEGEN_TARGETLOWERING_H #define LLVM_CODEGEN_TARGETLOWERING_H #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/DivergenceAnalysis.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Target/TargetMachine.h" #include #include #include #include #include #include #include #include #include namespace llvm { class BranchProbability; class CCState; class CCValAssign; class Constant; class FastISel; class FunctionLoweringInfo; class GlobalValue; class IntrinsicInst; struct KnownBits; class LLVMContext; class MachineBasicBlock; class MachineFunction; class MachineInstr; class MachineJumpTableInfo; class MachineLoop; class MachineRegisterInfo; class MCContext; class MCExpr; class Module; class TargetRegisterClass; class TargetLibraryInfo; class TargetRegisterInfo; class Value; namespace Sched { enum Preference { None, // No preference Source, // Follow source order. RegPressure, // Scheduling for lowest register pressure. Hybrid, // Scheduling for both latency and register pressure. ILP, // Scheduling for ILP in low register pressure mode. VLIW // Scheduling for VLIW targets. }; } // end namespace Sched /// This base class for TargetLowering contains the SelectionDAG-independent /// parts that can be used from the rest of CodeGen. class TargetLoweringBase { public: /// This enum indicates whether operations are valid for a target, and if not, /// what action should be used to make them valid. enum LegalizeAction : uint8_t { Legal, // The target natively supports this operation. Promote, // This operation should be executed in a larger type. Expand, // Try to expand this to other ops, otherwise use a libcall. LibCall, // Don't try to expand this to other ops, always use a libcall. Custom // Use the LowerOperation hook to implement custom lowering. }; /// This enum indicates whether a types are legal for a target, and if not, /// what action should be used to make them valid. enum LegalizeTypeAction : uint8_t { TypeLegal, // The target natively supports this type. TypePromoteInteger, // Replace this integer with a larger one. TypeExpandInteger, // Split this integer into two of half the size. TypeSoftenFloat, // Convert this float to a same size integer type, // if an operation is not supported in target HW. TypeExpandFloat, // Split this float into two of half the size. TypeScalarizeVector, // Replace this one-element vector with its element. TypeSplitVector, // Split this vector into two of half the size. TypeWidenVector, // This vector should be widened into a larger vector. TypePromoteFloat // Replace this float with a larger one. }; /// LegalizeKind holds the legalization kind that needs to happen to EVT /// in order to type-legalize it. using LegalizeKind = std::pair; /// Enum that describes how the target represents true/false values. enum BooleanContent { UndefinedBooleanContent, // Only bit 0 counts, the rest can hold garbage. ZeroOrOneBooleanContent, // All bits zero except for bit 0. ZeroOrNegativeOneBooleanContent // All bits equal to bit 0. }; /// Enum that describes what type of support for selects the target has. enum SelectSupportKind { ScalarValSelect, // The target supports scalar selects (ex: cmov). ScalarCondVectorVal, // The target supports selects with a scalar condition // and vector values (ex: cmov). VectorMaskSelect // The target supports vector selects with a vector // mask (ex: x86 blends). }; /// Enum that specifies what an atomic load/AtomicRMWInst is expanded /// to, if at all. Exists because different targets have different levels of /// support for these atomic instructions, and also have different options /// w.r.t. what they should expand to. enum class AtomicExpansionKind { None, // Don't expand the instruction. LLSC, // Expand the instruction into loadlinked/storeconditional; used // by ARM/AArch64. LLOnly, // Expand the (load) instruction into just a load-linked, which has // greater atomic guarantees than a normal load. CmpXChg, // Expand the instruction into cmpxchg; used by at least X86. }; /// Enum that specifies when a multiplication should be expanded. enum class MulExpansionKind { Always, // Always expand the instruction. OnlyLegalOrCustom, // Only expand when the resulting instructions are legal // or custom. }; class ArgListEntry { public: Value *Val = nullptr; SDValue Node = SDValue(); Type *Ty = nullptr; bool IsSExt : 1; bool IsZExt : 1; bool IsInReg : 1; bool IsSRet : 1; bool IsNest : 1; bool IsByVal : 1; bool IsInAlloca : 1; bool IsReturned : 1; bool IsSwiftSelf : 1; bool IsSwiftError : 1; uint16_t Alignment = 0; ArgListEntry() : IsSExt(false), IsZExt(false), IsInReg(false), IsSRet(false), IsNest(false), IsByVal(false), IsInAlloca(false), IsReturned(false), IsSwiftSelf(false), IsSwiftError(false) {} void setAttributes(ImmutableCallSite *CS, unsigned ArgIdx); }; using ArgListTy = std::vector; virtual void markLibCallAttributes(MachineFunction *MF, unsigned CC, ArgListTy &Args) const {}; static ISD::NodeType getExtendForContent(BooleanContent Content) { switch (Content) { case UndefinedBooleanContent: // Extend by adding rubbish bits. return ISD::ANY_EXTEND; case ZeroOrOneBooleanContent: // Extend by adding zero bits. return ISD::ZERO_EXTEND; case ZeroOrNegativeOneBooleanContent: // Extend by copying the sign bit. return ISD::SIGN_EXTEND; } llvm_unreachable("Invalid content kind"); } /// NOTE: The TargetMachine owns TLOF. explicit TargetLoweringBase(const TargetMachine &TM); TargetLoweringBase(const TargetLoweringBase &) = delete; TargetLoweringBase &operator=(const TargetLoweringBase &) = delete; virtual ~TargetLoweringBase() = default; protected: /// Initialize all of the actions to default values. void initActions(); public: const TargetMachine &getTargetMachine() const { return TM; } virtual bool useSoftFloat() const { return false; } /// Return the pointer type for the given address space, defaults to /// the pointer type from the data layout. /// FIXME: The default needs to be removed once all the code is updated. MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const { return MVT::getIntegerVT(DL.getPointerSizeInBits(AS)); } /// Return the type for frame index, which is determined by /// the alloca address space specified through the data layout. MVT getFrameIndexTy(const DataLayout &DL) const { return getPointerTy(DL, DL.getAllocaAddrSpace()); } /// Return the type for operands of fence. /// TODO: Let fence operands be of i32 type and remove this. virtual MVT getFenceOperandTy(const DataLayout &DL) const { return getPointerTy(DL); } /// EVT is not used in-tree, but is used by out-of-tree target. /// A documentation for this function would be nice... virtual MVT getScalarShiftAmountTy(const DataLayout &, EVT) const; EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL, bool LegalTypes = true) const; /// Returns the type to be used for the index operand of: /// ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT, /// ISD::INSERT_SUBVECTOR, and ISD::EXTRACT_SUBVECTOR virtual MVT getVectorIdxTy(const DataLayout &DL) const { return getPointerTy(DL); } virtual bool isSelectSupported(SelectSupportKind /*kind*/) const { return true; } /// Return true if multiple condition registers are available. bool hasMultipleConditionRegisters() const { return HasMultipleConditionRegisters; } /// Return true if the target has BitExtract instructions. bool hasExtractBitsInsn() const { return HasExtractBitsInsn; } /// Return the preferred vector type legalization action. virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const { // The default action for one element vectors is to scalarize if (VT.getVectorNumElements() == 1) return TypeScalarizeVector; // The default action for other vectors is to promote return TypePromoteInteger; } // There are two general methods for expanding a BUILD_VECTOR node: // 1. Use SCALAR_TO_VECTOR on the defined scalar values and then shuffle // them together. // 2. Build the vector on the stack and then load it. // If this function returns true, then method (1) will be used, subject to // the constraint that all of the necessary shuffles are legal (as determined // by isShuffleMaskLegal). If this function returns false, then method (2) is // always used. The vector type, and the number of defined values, are // provided. virtual bool shouldExpandBuildVectorWithShuffles(EVT /* VT */, unsigned DefinedValues) const { return DefinedValues < 3; } /// Return true if integer divide is usually cheaper than a sequence of /// several shifts, adds, and multiplies for this target. /// The definition of "cheaper" may depend on whether we're optimizing /// for speed or for size. virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const { return false; } /// Return true if the target can handle a standalone remainder operation. virtual bool hasStandaloneRem(EVT VT) const { return true; } /// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X). virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const { // Default behavior is to replace SQRT(X) with X*RSQRT(X). return false; } /// Reciprocal estimate status values used by the functions below. enum ReciprocalEstimate : int { Unspecified = -1, Disabled = 0, Enabled = 1 }; /// Return a ReciprocalEstimate enum value for a square root of the given type /// based on the function's attributes. If the operation is not overridden by /// the function's attributes, "Unspecified" is returned and target defaults /// are expected to be used for instruction selection. int getRecipEstimateSqrtEnabled(EVT VT, MachineFunction &MF) const; /// Return a ReciprocalEstimate enum value for a division of the given type /// based on the function's attributes. If the operation is not overridden by /// the function's attributes, "Unspecified" is returned and target defaults /// are expected to be used for instruction selection. int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF) const; /// Return the refinement step count for a square root of the given type based /// on the function's attributes. If the operation is not overridden by /// the function's attributes, "Unspecified" is returned and target defaults /// are expected to be used for instruction selection. int getSqrtRefinementSteps(EVT VT, MachineFunction &MF) const; /// Return the refinement step count for a division of the given type based /// on the function's attributes. If the operation is not overridden by /// the function's attributes, "Unspecified" is returned and target defaults /// are expected to be used for instruction selection. int getDivRefinementSteps(EVT VT, MachineFunction &MF) const; /// Returns true if target has indicated at least one type should be bypassed. bool isSlowDivBypassed() const { return !BypassSlowDivWidths.empty(); } /// Returns map of slow types for division or remainder with corresponding /// fast types const DenseMap &getBypassSlowDivWidths() const { return BypassSlowDivWidths; } /// Return true if Flow Control is an expensive operation that should be /// avoided. bool isJumpExpensive() const { return JumpIsExpensive; } /// Return true if selects are only cheaper than branches if the branch is /// unlikely to be predicted right. bool isPredictableSelectExpensive() const { return PredictableSelectIsExpensive; } /// If a branch or a select condition is skewed in one direction by more than /// this factor, it is very likely to be predicted correctly. virtual BranchProbability getPredictableBranchThreshold() const; /// Return true if the following transform is beneficial: /// fold (conv (load x)) -> (load (conv*)x) /// On architectures that don't natively support some vector loads /// efficiently, casting the load to a smaller vector of larger types and /// loading is more efficient, however, this can be undone by optimizations in /// dag combiner. virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const { // Don't do if we could do an indexed load on the original type, but not on // the new one. if (!LoadVT.isSimple() || !BitcastVT.isSimple()) return true; MVT LoadMVT = LoadVT.getSimpleVT(); // Don't bother doing this if it's just going to be promoted again later, as // doing so might interfere with other combines. if (getOperationAction(ISD::LOAD, LoadMVT) == Promote && getTypeToPromoteTo(ISD::LOAD, LoadMVT) == BitcastVT.getSimpleVT()) return false; return true; } /// Return true if the following transform is beneficial: /// (store (y (conv x)), y*)) -> (store x, (x*)) virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT) const { // Default to the same logic as loads. return isLoadBitCastBeneficial(StoreVT, BitcastVT); } /// Return true if it is expected to be cheaper to do a store of a non-zero /// vector constant with the given size and type for the address space than to /// store the individual scalar element constants. virtual bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, unsigned AddrSpace) const { return false; } /// Allow store merging after legalization in addition to before legalization. /// This may catch stores that do not exist earlier (eg, stores created from /// intrinsics). virtual bool mergeStoresAfterLegalization() const { return true; } /// Returns if it's reasonable to merge stores to MemVT size. virtual bool canMergeStoresTo(unsigned AS, EVT MemVT, const SelectionDAG &DAG) const { return true; } /// Return true if it is cheap to speculate a call to intrinsic cttz. virtual bool isCheapToSpeculateCttz() const { return false; } /// Return true if it is cheap to speculate a call to intrinsic ctlz. virtual bool isCheapToSpeculateCtlz() const { return false; } /// Return true if ctlz instruction is fast. virtual bool isCtlzFast() const { return false; } /// Return true if it is safe to transform an integer-domain bitwise operation /// into the equivalent floating-point operation. This should be set to true /// if the target has IEEE-754-compliant fabs/fneg operations for the input /// type. virtual bool hasBitPreservingFPLogic(EVT VT) const { return false; } /// Return true if it is cheaper to split the store of a merged int val /// from a pair of smaller values into multiple stores. virtual bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const { return false; } /// Return if the target supports combining a /// chain like: /// \code /// %andResult = and %val1, #mask /// %icmpResult = icmp %andResult, 0 /// \endcode /// into a single machine instruction of a form like: /// \code /// cc = test %register, #mask /// \endcode virtual bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const { return false; } /// Use bitwise logic to make pairs of compares more efficient. For example: /// and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0 /// This should be true when it takes more than one instruction to lower /// setcc (cmp+set on x86 scalar), when bitwise ops are faster than logic on /// condition bits (crand on PowerPC), and/or when reducing cmp+br is a win. virtual bool convertSetCCLogicToBitwiseLogic(EVT VT) const { return false; } /// Return the preferred operand type if the target has a quick way to compare /// integer values of the given size. Assume that any legal integer type can /// be compared efficiently. Targets may override this to allow illegal wide /// types to return a vector type if there is support to compare that type. virtual MVT hasFastEqualityCompare(unsigned NumBits) const { MVT VT = MVT::getIntegerVT(NumBits); return isTypeLegal(VT) ? VT : MVT::INVALID_SIMPLE_VALUE_TYPE; } /// Return true if the target should transform: /// (X & Y) == Y ---> (~X & Y) == 0 /// (X & Y) != Y ---> (~X & Y) != 0 /// /// This may be profitable if the target has a bitwise and-not operation that /// sets comparison flags. A target may want to limit the transformation based /// on the type of Y or if Y is a constant. /// /// Note that the transform will not occur if Y is known to be a power-of-2 /// because a mask and compare of a single bit can be handled by inverting the /// predicate, for example: /// (X & 8) == 8 ---> (X & 8) != 0 virtual bool hasAndNotCompare(SDValue Y) const { return false; } /// Return true if the target has a bitwise and-not operation: /// X = ~A & B /// This can be used to simplify select or other instructions. virtual bool hasAndNot(SDValue X) const { // If the target has the more complex version of this operation, assume that // it has this operation too. return hasAndNotCompare(X); } /// There are two ways to clear extreme bits (either low or high): /// Mask: x & (-1 << y) (the instcombine canonical form) /// Shifts: x >> y << y /// Return true if the variant with 2 shifts is preferred. /// Return false if there is no preference. virtual bool preferShiftsToClearExtremeBits(SDValue X) const { // By default, let's assume that no one prefers shifts. return false; } /// Should we tranform the IR-optimal check for whether given truncation /// down into KeptBits would be truncating or not: /// (add %x, (1 << (KeptBits-1))) srccond (1 << KeptBits) /// Into it's more traditional form: /// ((%x << C) a>> C) dstcond %x /// Return true if we should transform. /// Return false if there is no preference. virtual bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const { // By default, let's assume that no one prefers shifts. return false; } /// Return true if the target wants to use the optimization that /// turns ext(promotableInst1(...(promotableInstN(load)))) into /// promotedInst1(...(promotedInstN(ext(load)))). bool enableExtLdPromotion() const { return EnableExtLdPromotion; } /// Return true if the target can combine store(extractelement VectorTy, /// Idx). /// \p Cost[out] gives the cost of that transformation when this is true. virtual bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const { return false; } /// Return true if target supports floating point exceptions. bool hasFloatingPointExceptions() const { return HasFloatingPointExceptions; } /// Return true if target always beneficiates from combining into FMA for a /// given value type. This must typically return false on targets where FMA /// takes more cycles to execute than FADD. virtual bool enableAggressiveFMAFusion(EVT VT) const { return false; } /// Return the ValueType of the result of SETCC operations. virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const; /// Return the ValueType for comparison libcalls. Comparions libcalls include /// floating point comparion calls, and Ordered/Unordered check calls on /// floating point numbers. virtual MVT::SimpleValueType getCmpLibcallReturnType() const; /// For targets without i1 registers, this gives the nature of the high-bits /// of boolean values held in types wider than i1. /// /// "Boolean values" are special true/false values produced by nodes like /// SETCC and consumed (as the condition) by nodes like SELECT and BRCOND. /// Not to be confused with general values promoted from i1. Some cpus /// distinguish between vectors of boolean and scalars; the isVec parameter /// selects between the two kinds. For example on X86 a scalar boolean should /// be zero extended from i1, while the elements of a vector of booleans /// should be sign extended from i1. /// /// Some cpus also treat floating point types the same way as they treat /// vectors instead of the way they treat scalars. BooleanContent getBooleanContents(bool isVec, bool isFloat) const { if (isVec) return BooleanVectorContents; return isFloat ? BooleanFloatContents : BooleanContents; } BooleanContent getBooleanContents(EVT Type) const { return getBooleanContents(Type.isVector(), Type.isFloatingPoint()); } /// Return target scheduling preference. Sched::Preference getSchedulingPreference() const { return SchedPreferenceInfo; } /// Some scheduler, e.g. hybrid, can switch to different scheduling heuristics /// for different nodes. This function returns the preference (or none) for /// the given node. virtual Sched::Preference getSchedulingPreference(SDNode *) const { return Sched::None; } /// Return the register class that should be used for the specified value /// type. virtual const TargetRegisterClass *getRegClassFor(MVT VT) const { const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy]; assert(RC && "This value type is not natively supported!"); return RC; } /// Return the 'representative' register class for the specified value /// type. /// /// The 'representative' register class is the largest legal super-reg /// register class for the register class of the value type. For example, on /// i386 the rep register class for i8, i16, and i32 are GR32; while the rep /// register class is GR64 on x86_64. virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const { const TargetRegisterClass *RC = RepRegClassForVT[VT.SimpleTy]; return RC; } /// Return the cost of the 'representative' register class for the specified /// value type. virtual uint8_t getRepRegClassCostFor(MVT VT) const { return RepRegClassCostForVT[VT.SimpleTy]; } /// Return true if the target has native support for the specified value type. /// This means that it has a register that directly holds it without /// promotions or expansions. bool isTypeLegal(EVT VT) const { assert(!VT.isSimple() || (unsigned)VT.getSimpleVT().SimpleTy < array_lengthof(RegClassForVT)); return VT.isSimple() && RegClassForVT[VT.getSimpleVT().SimpleTy] != nullptr; } class ValueTypeActionImpl { /// ValueTypeActions - For each value type, keep a LegalizeTypeAction enum /// that indicates how instruction selection should deal with the type. LegalizeTypeAction ValueTypeActions[MVT::LAST_VALUETYPE]; public: ValueTypeActionImpl() { std::fill(std::begin(ValueTypeActions), std::end(ValueTypeActions), TypeLegal); } LegalizeTypeAction getTypeAction(MVT VT) const { return ValueTypeActions[VT.SimpleTy]; } void setTypeAction(MVT VT, LegalizeTypeAction Action) { ValueTypeActions[VT.SimpleTy] = Action; } }; const ValueTypeActionImpl &getValueTypeActions() const { return ValueTypeActions; } /// Return how we should legalize values of this type, either it is already /// legal (return 'Legal') or we need to promote it to a larger type (return /// 'Promote'), or we need to expand it into multiple registers of smaller /// integer type (return 'Expand'). 'Custom' is not an option. LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const { return getTypeConversion(Context, VT).first; } LegalizeTypeAction getTypeAction(MVT VT) const { return ValueTypeActions.getTypeAction(VT); } /// For types supported by the target, this is an identity function. For /// types that must be promoted to larger types, this returns the larger type /// to promote to. For integer types that are larger than the largest integer /// register, this contains one step in the expansion to get to the smaller /// register. For illegal floating point types, this returns the integer type /// to transform to. EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const { return getTypeConversion(Context, VT).second; } /// For types supported by the target, this is an identity function. For /// types that must be expanded (i.e. integer types that are larger than the /// largest integer register or illegal floating point types), this returns /// the largest legal type it will be expanded to. EVT getTypeToExpandTo(LLVMContext &Context, EVT VT) const { assert(!VT.isVector()); while (true) { switch (getTypeAction(Context, VT)) { case TypeLegal: return VT; case TypeExpandInteger: VT = getTypeToTransformTo(Context, VT); break; default: llvm_unreachable("Type is not legal nor is it to be expanded!"); } } } /// Vector types are broken down into some number of legal first class types. /// For example, EVT::v8f32 maps to 2 EVT::v4f32 with Altivec or SSE1, or 8 /// promoted EVT::f64 values with the X86 FP stack. Similarly, EVT::v2i64 /// turns into 4 EVT::i32 values with both PPC and X86. /// /// This method returns the number of registers needed, and the VT for each /// register. It also returns the VT and quantity of the intermediate values /// before they are promoted/expanded. unsigned getVectorTypeBreakdown(LLVMContext &Context, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const; /// Certain targets such as MIPS require that some types such as vectors are /// always broken down into scalars in some contexts. This occurs even if the /// vector type is legal. virtual unsigned getVectorTypeBreakdownForCallingConv( LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const { return getVectorTypeBreakdown(Context, VT, IntermediateVT, NumIntermediates, RegisterVT); } struct IntrinsicInfo { unsigned opc = 0; // target opcode EVT memVT; // memory VT // value representing memory location PointerUnion ptrVal; int offset = 0; // offset off of ptrVal unsigned size = 0; // the size of the memory location // (taken from memVT if zero) unsigned align = 1; // alignment MachineMemOperand::Flags flags = MachineMemOperand::MONone; IntrinsicInfo() = default; }; /// Given an intrinsic, checks if on the target the intrinsic will need to map /// to a MemIntrinsicNode (touches memory). If this is the case, it returns /// true and store the intrinsic information into the IntrinsicInfo that was /// passed to the function. virtual bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &, unsigned /*Intrinsic*/) const { return false; } /// Returns true if the target can instruction select the specified FP /// immediate natively. If false, the legalizer will materialize the FP /// immediate as a load from a constant pool. virtual bool isFPImmLegal(const APFloat &/*Imm*/, EVT /*VT*/) const { return false; } /// Targets can use this to indicate that they only support *some* /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to be /// legal. virtual bool isShuffleMaskLegal(ArrayRef /*Mask*/, EVT /*VT*/) const { return true; } /// Returns true if the operation can trap for the value type. /// /// VT must be a legal type. By default, we optimistically assume most /// operations don't trap except for integer divide and remainder. virtual bool canOpTrap(unsigned Op, EVT VT) const; /// Similar to isShuffleMaskLegal. Targets can use this to indicate if there /// is a suitable VECTOR_SHUFFLE that can be used to replace a VAND with a /// constant pool entry. virtual bool isVectorClearMaskLegal(ArrayRef /*Mask*/, EVT /*VT*/) const { return false; } /// Return how this operation should be treated: either it is legal, needs to /// be promoted to a larger size, needs to be expanded to some other code /// sequence, or the target has a custom expander for it. LegalizeAction getOperationAction(unsigned Op, EVT VT) const { if (VT.isExtended()) return Expand; // If a target-specific SDNode requires legalization, require the target // to provide custom legalization for it. if (Op >= array_lengthof(OpActions[0])) return Custom; return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op]; } LegalizeAction getStrictFPOperationAction(unsigned Op, EVT VT) const { unsigned EqOpc; switch (Op) { default: llvm_unreachable("Unexpected FP pseudo-opcode"); case ISD::STRICT_FADD: EqOpc = ISD::FADD; break; case ISD::STRICT_FSUB: EqOpc = ISD::FSUB; break; case ISD::STRICT_FMUL: EqOpc = ISD::FMUL; break; case ISD::STRICT_FDIV: EqOpc = ISD::FDIV; break; case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break; case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break; case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break; case ISD::STRICT_FMA: EqOpc = ISD::FMA; break; case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break; case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break; case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break; case ISD::STRICT_FEXP2: EqOpc = ISD::FEXP2; break; case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break; case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break; case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break; case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break; case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break; } auto Action = getOperationAction(EqOpc, VT); // We don't currently handle Custom or Promote for strict FP pseudo-ops. // For now, we just expand for those cases. if (Action != Legal) Action = Expand; return Action; } /// Return true if the specified operation is legal on this target or can be /// made legal with custom lowering. This is used to help guide high-level /// lowering decisions. bool isOperationLegalOrCustom(unsigned Op, EVT VT) const { return (VT == MVT::Other || isTypeLegal(VT)) && (getOperationAction(Op, VT) == Legal || getOperationAction(Op, VT) == Custom); } /// Return true if the specified operation is legal on this target or can be /// made legal using promotion. This is used to help guide high-level lowering /// decisions. bool isOperationLegalOrPromote(unsigned Op, EVT VT) const { return (VT == MVT::Other || isTypeLegal(VT)) && (getOperationAction(Op, VT) == Legal || getOperationAction(Op, VT) == Promote); } /// Return true if the specified operation is legal on this target or can be /// made legal with custom lowering or using promotion. This is used to help /// guide high-level lowering decisions. bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT) const { return (VT == MVT::Other || isTypeLegal(VT)) && (getOperationAction(Op, VT) == Legal || getOperationAction(Op, VT) == Custom || getOperationAction(Op, VT) == Promote); } /// Return true if the operation uses custom lowering, regardless of whether /// the type is legal or not. bool isOperationCustom(unsigned Op, EVT VT) const { return getOperationAction(Op, VT) == Custom; } /// Return true if lowering to a jump table is allowed. virtual bool areJTsAllowed(const Function *Fn) const { if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true") return false; return isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || isOperationLegalOrCustom(ISD::BRIND, MVT::Other); } /// Check whether the range [Low,High] fits in a machine word. bool rangeFitsInWord(const APInt &Low, const APInt &High, const DataLayout &DL) const { // FIXME: Using the pointer type doesn't seem ideal. uint64_t BW = DL.getIndexSizeInBits(0u); uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1; return Range <= BW; } /// Return true if lowering to a jump table is suitable for a set of case /// clusters which may contain \p NumCases cases, \p Range range of values. /// FIXME: This function check the maximum table size and density, but the /// minimum size is not checked. It would be nice if the minimum size is /// also combined within this function. Currently, the minimum size check is /// performed in findJumpTable() in SelectionDAGBuiler and /// getEstimatedNumberOfCaseClusters() in BasicTTIImpl. virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range) const { const bool OptForSize = SI->getParent()->getParent()->optForSize(); const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize); const unsigned MaxJumpTableSize = OptForSize || getMaximumJumpTableSize() == 0 ? UINT_MAX : getMaximumJumpTableSize(); // Check whether a range of clusters is dense enough for a jump table. if (Range <= MaxJumpTableSize && (NumCases * 100 >= Range * MinDensity)) { return true; } return false; } /// Return true if lowering to a bit test is suitable for a set of case /// clusters which contains \p NumDests unique destinations, \p Low and /// \p High as its lowest and highest case values, and expects \p NumCmps /// case value comparisons. Check if the number of destinations, comparison /// metric, and range are all suitable. bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const { // FIXME: I don't think NumCmps is the correct metric: a single case and a // range of cases both require only one branch to lower. Just looking at the // number of clusters and destinations should be enough to decide whether to // build bit tests. // To lower a range with bit tests, the range must fit the bitwidth of a // machine word. if (!rangeFitsInWord(Low, High, DL)) return false; // Decide whether it's profitable to lower this range with bit tests. Each // destination requires a bit test and branch, and there is an overall range // check branch. For a small number of clusters, separate comparisons might // be cheaper, and for many destinations, splitting the range might be // better. return (NumDests == 1 && NumCmps >= 3) || (NumDests == 2 && NumCmps >= 5) || (NumDests == 3 && NumCmps >= 6); } /// Return true if the specified operation is illegal on this target or /// unlikely to be made legal with custom lowering. This is used to help guide /// high-level lowering decisions. bool isOperationExpand(unsigned Op, EVT VT) const { return (!isTypeLegal(VT) || getOperationAction(Op, VT) == Expand); } /// Return true if the specified operation is legal on this target. bool isOperationLegal(unsigned Op, EVT VT) const { return (VT == MVT::Other || isTypeLegal(VT)) && getOperationAction(Op, VT) == Legal; } /// Return how this load with extension should be treated: either it is legal, /// needs to be promoted to a larger size, needs to be expanded to some other /// code sequence, or the target has a custom expander for it. LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const { if (ValVT.isExtended() || MemVT.isExtended()) return Expand; unsigned ValI = (unsigned) ValVT.getSimpleVT().SimpleTy; unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy; assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!"); unsigned Shift = 4 * ExtType; return (LegalizeAction)((LoadExtActions[ValI][MemI] >> Shift) & 0xf); } /// Return true if the specified load with extension is legal on this target. bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const { return getLoadExtAction(ExtType, ValVT, MemVT) == Legal; } /// Return true if the specified load with extension is legal or custom /// on this target. bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const { return getLoadExtAction(ExtType, ValVT, MemVT) == Legal || getLoadExtAction(ExtType, ValVT, MemVT) == Custom; } /// Return how this store with truncation should be treated: either it is /// legal, needs to be promoted to a larger size, needs to be expanded to some /// other code sequence, or the target has a custom expander for it. LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const { if (ValVT.isExtended() || MemVT.isExtended()) return Expand; unsigned ValI = (unsigned) ValVT.getSimpleVT().SimpleTy; unsigned MemI = (unsigned) MemVT.getSimpleVT().SimpleTy; assert(ValI < MVT::LAST_VALUETYPE && MemI < MVT::LAST_VALUETYPE && "Table isn't big enough!"); return TruncStoreActions[ValI][MemI]; } /// Return true if the specified store with truncation is legal on this /// target. bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const { return isTypeLegal(ValVT) && getTruncStoreAction(ValVT, MemVT) == Legal; } /// Return true if the specified store with truncation has solution on this /// target. bool isTruncStoreLegalOrCustom(EVT ValVT, EVT MemVT) const { return isTypeLegal(ValVT) && (getTruncStoreAction(ValVT, MemVT) == Legal || getTruncStoreAction(ValVT, MemVT) == Custom); } /// Return how the indexed load should be treated: either it is legal, needs /// to be promoted to a larger size, needs to be expanded to some other code /// sequence, or the target has a custom expander for it. LegalizeAction getIndexedLoadAction(unsigned IdxMode, MVT VT) const { assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() && "Table isn't big enough!"); unsigned Ty = (unsigned)VT.SimpleTy; return (LegalizeAction)((IndexedModeActions[Ty][IdxMode] & 0xf0) >> 4); } /// Return true if the specified indexed load is legal on this target. bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const { return VT.isSimple() && (getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Legal || getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Custom); } /// Return how the indexed store should be treated: either it is legal, needs /// to be promoted to a larger size, needs to be expanded to some other code /// sequence, or the target has a custom expander for it. LegalizeAction getIndexedStoreAction(unsigned IdxMode, MVT VT) const { assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() && "Table isn't big enough!"); unsigned Ty = (unsigned)VT.SimpleTy; return (LegalizeAction)(IndexedModeActions[Ty][IdxMode] & 0x0f); } /// Return true if the specified indexed load is legal on this target. bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const { return VT.isSimple() && (getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Legal || getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Custom); } /// Return how the condition code should be treated: either it is legal, needs /// to be expanded to some other code sequence, or the target has a custom /// expander for it. LegalizeAction getCondCodeAction(ISD::CondCode CC, MVT VT) const { assert((unsigned)CC < array_lengthof(CondCodeActions) && ((unsigned)VT.SimpleTy >> 3) < array_lengthof(CondCodeActions[0]) && "Table isn't big enough!"); // See setCondCodeAction for how this is encoded. uint32_t Shift = 4 * (VT.SimpleTy & 0x7); uint32_t Value = CondCodeActions[CC][VT.SimpleTy >> 3]; LegalizeAction Action = (LegalizeAction) ((Value >> Shift) & 0xF); assert(Action != Promote && "Can't promote condition code!"); return Action; } /// Return true if the specified condition code is legal on this target. bool isCondCodeLegal(ISD::CondCode CC, MVT VT) const { return getCondCodeAction(CC, VT) == Legal; } /// Return true if the specified condition code is legal or custom on this /// target. bool isCondCodeLegalOrCustom(ISD::CondCode CC, MVT VT) const { return getCondCodeAction(CC, VT) == Legal || getCondCodeAction(CC, VT) == Custom; } /// If the action for this operation is to promote, this method returns the /// ValueType to promote to. MVT getTypeToPromoteTo(unsigned Op, MVT VT) const { assert(getOperationAction(Op, VT) == Promote && "This operation isn't promoted!"); // See if this has an explicit type specified. std::map, MVT::SimpleValueType>::const_iterator PTTI = PromoteToType.find(std::make_pair(Op, VT.SimpleTy)); if (PTTI != PromoteToType.end()) return PTTI->second; assert((VT.isInteger() || VT.isFloatingPoint()) && "Cannot autopromote this type, add it with AddPromotedToType."); MVT NVT = VT; do { NVT = (MVT::SimpleValueType)(NVT.SimpleTy+1); assert(NVT.isInteger() == VT.isInteger() && NVT != MVT::isVoid && "Didn't find type to promote to!"); } while (!isTypeLegal(NVT) || getOperationAction(Op, NVT) == Promote); return NVT; } /// Return the EVT corresponding to this LLVM type. This is fixed by the LLVM /// operations except for the pointer size. If AllowUnknown is true, this /// will return MVT::Other for types with no EVT counterpart (e.g. structs), /// otherwise it will assert. EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown = false) const { // Lower scalar pointers to native pointer types. if (PointerType *PTy = dyn_cast(Ty)) return getPointerTy(DL, PTy->getAddressSpace()); if (Ty->isVectorTy()) { VectorType *VTy = cast(Ty); Type *Elm = VTy->getElementType(); // Lower vectors of pointers to native pointer types. if (PointerType *PT = dyn_cast(Elm)) { EVT PointerTy(getPointerTy(DL, PT->getAddressSpace())); Elm = PointerTy.getTypeForEVT(Ty->getContext()); } return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(Elm, false), VTy->getNumElements()); } return EVT::getEVT(Ty, AllowUnknown); } /// Return the MVT corresponding to this LLVM type. See getValueType. MVT getSimpleValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown = false) const { return getValueType(DL, Ty, AllowUnknown).getSimpleVT(); } /// Return the desired alignment for ByVal or InAlloca aggregate function /// arguments in the caller parameter area. This is the actual alignment, not /// its logarithm. virtual unsigned getByValTypeAlignment(Type *Ty, const DataLayout &DL) const; /// Return the type of registers that this ValueType will eventually require. MVT getRegisterType(MVT VT) const { assert((unsigned)VT.SimpleTy < array_lengthof(RegisterTypeForVT)); return RegisterTypeForVT[VT.SimpleTy]; } /// Return the type of registers that this ValueType will eventually require. MVT getRegisterType(LLVMContext &Context, EVT VT) const { if (VT.isSimple()) { assert((unsigned)VT.getSimpleVT().SimpleTy < array_lengthof(RegisterTypeForVT)); return RegisterTypeForVT[VT.getSimpleVT().SimpleTy]; } if (VT.isVector()) { EVT VT1; MVT RegisterVT; unsigned NumIntermediates; (void)getVectorTypeBreakdown(Context, VT, VT1, NumIntermediates, RegisterVT); return RegisterVT; } if (VT.isInteger()) { return getRegisterType(Context, getTypeToTransformTo(Context, VT)); } llvm_unreachable("Unsupported extended type!"); } /// Return the number of registers that this ValueType will eventually /// require. /// /// This is one for any types promoted to live in larger registers, but may be /// more than one for types (like i64) that are split into pieces. For types /// like i140, which are first promoted then expanded, it is the number of /// registers needed to hold all the bits of the original type. For an i140 /// on a 32 bit machine this means 5 registers. unsigned getNumRegisters(LLVMContext &Context, EVT VT) const { if (VT.isSimple()) { assert((unsigned)VT.getSimpleVT().SimpleTy < array_lengthof(NumRegistersForVT)); return NumRegistersForVT[VT.getSimpleVT().SimpleTy]; } if (VT.isVector()) { EVT VT1; MVT VT2; unsigned NumIntermediates; return getVectorTypeBreakdown(Context, VT, VT1, NumIntermediates, VT2); } if (VT.isInteger()) { unsigned BitWidth = VT.getSizeInBits(); unsigned RegWidth = getRegisterType(Context, VT).getSizeInBits(); return (BitWidth + RegWidth - 1) / RegWidth; } llvm_unreachable("Unsupported extended type!"); } /// Certain combinations of ABIs, Targets and features require that types /// are legal for some operations and not for other operations. /// For MIPS all vector types must be passed through the integer register set. virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { return getRegisterType(Context, VT); } /// Certain targets require unusual breakdowns of certain types. For MIPS, /// this occurs when a vector type is used, as vector are passed through the /// integer register set. virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { return getNumRegisters(Context, VT); } /// Certain targets have context senstive alignment requirements, where one /// type has the alignment requirement of another type. virtual unsigned getABIAlignmentForCallingConv(Type *ArgTy, DataLayout DL) const { return DL.getABITypeAlignment(ArgTy); } /// If true, then instruction selection should seek to shrink the FP constant /// of the specified type to a smaller type in order to save space and / or /// reduce runtime. virtual bool ShouldShrinkFPConstant(EVT) const { return true; } // Return true if it is profitable to reduce the given load node to a smaller // type. // // e.g. (i16 (trunc (i32 (load x))) -> i16 load x should be performed virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { return true; } /// When splitting a value of the specified type into parts, does the Lo /// or Hi part come first? This usually follows the endianness, except /// for ppcf128, where the Hi part always comes first. bool hasBigEndianPartOrdering(EVT VT, const DataLayout &DL) const { return DL.isBigEndian() || VT == MVT::ppcf128; } /// If true, the target has custom DAG combine transformations that it can /// perform for the specified node. bool hasTargetDAGCombine(ISD::NodeType NT) const { assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray)); return TargetDAGCombineArray[NT >> 3] & (1 << (NT&7)); } unsigned getGatherAllAliasesMaxDepth() const { return GatherAllAliasesMaxDepth; } /// Returns the size of the platform's va_list object. virtual unsigned getVaListSizeInBits(const DataLayout &DL) const { return getPointerTy(DL).getSizeInBits(); } /// Get maximum # of store operations permitted for llvm.memset /// /// This function returns the maximum number of store operations permitted /// to replace a call to llvm.memset. The value is set by the target at the /// performance threshold for such a replacement. If OptSize is true, /// return the limit for functions that have OptSize attribute. unsigned getMaxStoresPerMemset(bool OptSize) const { return OptSize ? MaxStoresPerMemsetOptSize : MaxStoresPerMemset; } /// Get maximum # of store operations permitted for llvm.memcpy /// /// This function returns the maximum number of store operations permitted /// to replace a call to llvm.memcpy. The value is set by the target at the /// performance threshold for such a replacement. If OptSize is true, /// return the limit for functions that have OptSize attribute. unsigned getMaxStoresPerMemcpy(bool OptSize) const { return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy; } /// \brief Get maximum # of store operations to be glued together /// /// This function returns the maximum number of store operations permitted /// to glue together during lowering of llvm.memcpy. The value is set by // the target at the performance threshold for such a replacement. virtual unsigned getMaxGluedStoresPerMemcpy() const { return MaxGluedStoresPerMemcpy; } /// Get maximum # of load operations permitted for memcmp /// /// This function returns the maximum number of load operations permitted /// to replace a call to memcmp. The value is set by the target at the /// performance threshold for such a replacement. If OptSize is true, /// return the limit for functions that have OptSize attribute. unsigned getMaxExpandSizeMemcmp(bool OptSize) const { return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp; } /// For memcmp expansion when the memcmp result is only compared equal or /// not-equal to 0, allow up to this number of load pairs per block. As an /// example, this may allow 'memcmp(a, b, 3) == 0' in a single block: /// a0 = load2bytes &a[0] /// b0 = load2bytes &b[0] /// a2 = load1byte &a[2] /// b2 = load1byte &b[2] /// r = cmp eq (a0 ^ b0 | a2 ^ b2), 0 virtual unsigned getMemcmpEqZeroLoadsPerBlock() const { return 1; } /// Get maximum # of store operations permitted for llvm.memmove /// /// This function returns the maximum number of store operations permitted /// to replace a call to llvm.memmove. The value is set by the target at the /// performance threshold for such a replacement. If OptSize is true, /// return the limit for functions that have OptSize attribute. unsigned getMaxStoresPerMemmove(bool OptSize) const { return OptSize ? MaxStoresPerMemmoveOptSize : MaxStoresPerMemmove; } /// Determine if the target supports unaligned memory accesses. /// /// This function returns true if the target allows unaligned memory accesses /// of the specified type in the given address space. If true, it also returns /// whether the unaligned memory access is "fast" in the last argument by /// reference. This is used, for example, in situations where an array /// copy/move/set is converted to a sequence of store operations. Its use /// helps to ensure that such replacements don't generate code that causes an /// alignment error (trap) on the target machine. virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace = 0, unsigned Align = 1, bool * /*Fast*/ = nullptr) const { return false; } /// Return true if the target supports a memory access of this type for the /// given address space and alignment. If the access is allowed, the optional /// final parameter returns if the access is also fast (as defined by the /// target). bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace = 0, unsigned Alignment = 1, bool *Fast = nullptr) const; /// Returns the target specific optimal type for load and store operations as /// a result of memset, memcpy, and memmove lowering. /// /// If DstAlign is zero that means it's safe to destination alignment can /// satisfy any constraint. Similarly if SrcAlign is zero it means there isn't /// a need to check it against alignment requirement, probably because the /// source does not need to be loaded. If 'IsMemset' is true, that means it's /// expanding a memset. If 'ZeroMemset' is true, that means it's a memset of /// zero. 'MemcpyStrSrc' indicates whether the memcpy source is constant so it /// does not need to be loaded. It returns EVT::Other if the type should be /// determined using generic target-independent logic. virtual EVT getOptimalMemOpType(uint64_t /*Size*/, unsigned /*DstAlign*/, unsigned /*SrcAlign*/, bool /*IsMemset*/, bool /*ZeroMemset*/, bool /*MemcpyStrSrc*/, MachineFunction &/*MF*/) const { return MVT::Other; } /// Returns true if it's safe to use load / store of the specified type to /// expand memcpy / memset inline. /// /// This is mostly true for all types except for some special cases. For /// example, on X86 targets without SSE2 f64 load / store are done with fldl / /// fstpl which also does type conversion. Note the specified type doesn't /// have to be legal as the hook is used before type legalization. virtual bool isSafeMemOpType(MVT /*VT*/) const { return true; } /// Determine if we should use _setjmp or setjmp to implement llvm.setjmp. bool usesUnderscoreSetJmp() const { return UseUnderscoreSetJmp; } /// Determine if we should use _longjmp or longjmp to implement llvm.longjmp. bool usesUnderscoreLongJmp() const { return UseUnderscoreLongJmp; } /// Return lower limit for number of blocks in a jump table. virtual unsigned getMinimumJumpTableEntries() const; /// Return lower limit of the density in a jump table. unsigned getMinimumJumpTableDensity(bool OptForSize) const; /// Return upper limit for number of entries in a jump table. /// Zero if no limit. unsigned getMaximumJumpTableSize() const; virtual bool isJumpTableRelative() const { return TM.isPositionIndependent(); } /// If a physical register, this specifies the register that /// llvm.savestack/llvm.restorestack should save and restore. unsigned getStackPointerRegisterToSaveRestore() const { return StackPointerRegisterToSaveRestore; } /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. virtual unsigned getExceptionPointerRegister(const Constant *PersonalityFn) const { // 0 is guaranteed to be the NoRegister value on all targets return 0; } /// If a physical register, this returns the register that receives the /// exception typeid on entry to a landing pad. virtual unsigned getExceptionSelectorRegister(const Constant *PersonalityFn) const { // 0 is guaranteed to be the NoRegister value on all targets return 0; } virtual bool needsFixedCatchObjects() const { report_fatal_error("Funclet EH is not implemented for this target"); } /// Returns the target's jmp_buf size in bytes (if never set, the default is /// 200) unsigned getJumpBufSize() const { return JumpBufSize; } /// Returns the target's jmp_buf alignment in bytes (if never set, the default /// is 0) unsigned getJumpBufAlignment() const { return JumpBufAlignment; } /// Return the minimum stack alignment of an argument. unsigned getMinStackArgumentAlignment() const { return MinStackArgumentAlignment; } /// Return the minimum function alignment. unsigned getMinFunctionAlignment() const { return MinFunctionAlignment; } /// Return the preferred function alignment. unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; } /// Return the preferred loop alignment. virtual unsigned getPrefLoopAlignment(MachineLoop *ML = nullptr) const { return PrefLoopAlignment; } /// If the target has a standard location for the stack protector guard, /// returns the address of that location. Otherwise, returns nullptr. /// DEPRECATED: please override useLoadStackGuardNode and customize /// LOAD_STACK_GUARD, or customize \@llvm.stackguard(). virtual Value *getIRStackGuard(IRBuilder<> &IRB) const; /// Inserts necessary declarations for SSP (stack protection) purpose. /// Should be used only when getIRStackGuard returns nullptr. virtual void insertSSPDeclarations(Module &M) const; /// Return the variable that's previously inserted by insertSSPDeclarations, /// if any, otherwise return nullptr. Should be used only when /// getIRStackGuard returns nullptr. virtual Value *getSDagStackGuard(const Module &M) const; /// If this function returns true, stack protection checks should XOR the /// frame pointer (or whichever pointer is used to address locals) into the /// stack guard value before checking it. getIRStackGuard must return nullptr /// if this returns true. virtual bool useStackGuardXorFP() const { return false; } /// If the target has a standard stack protection check function that /// performs validation and error handling, returns the function. Otherwise, /// returns nullptr. Must be previously inserted by insertSSPDeclarations. /// Should be used only when getIRStackGuard returns nullptr. virtual Value *getSSPStackGuardCheck(const Module &M) const; protected: Value *getDefaultSafeStackPointerLocation(IRBuilder<> &IRB, bool UseTLS) const; public: /// Returns the target-specific address of the unsafe stack pointer. virtual Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const; /// Returns the name of the symbol used to emit stack probes or the empty /// string if not applicable. virtual StringRef getStackProbeSymbolName(MachineFunction &MF) const { return ""; } /// Returns true if a cast between SrcAS and DestAS is a noop. virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { return false; } /// Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g. we /// are happy to sink it into basic blocks. virtual bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { return isNoopAddrSpaceCast(SrcAS, DestAS); } /// Return true if the pointer arguments to CI should be aligned by aligning /// the object whose address is being passed. If so then MinSize is set to the /// minimum size the object must be to be aligned and PrefAlign is set to the /// preferred alignment. virtual bool shouldAlignPointerArgs(CallInst * /*CI*/, unsigned & /*MinSize*/, unsigned & /*PrefAlign*/) const { return false; } //===--------------------------------------------------------------------===// /// \name Helpers for TargetTransformInfo implementations /// @{ /// Get the ISD node that corresponds to the Instruction class opcode. int InstructionOpcodeToISD(unsigned Opcode) const; /// Estimate the cost of type-legalization and the legalized type. std::pair getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const; /// @} //===--------------------------------------------------------------------===// /// \name Helpers for atomic expansion. /// @{ /// Returns the maximum atomic operation size (in bits) supported by /// the backend. Atomic operations greater than this size (as well /// as ones that are not naturally aligned), will be expanded by /// AtomicExpandPass into an __atomic_* library call. unsigned getMaxAtomicSizeInBitsSupported() const { return MaxAtomicSizeInBitsSupported; } /// Returns the size of the smallest cmpxchg or ll/sc instruction /// the backend supports. Any smaller operations are widened in /// AtomicExpandPass. /// /// Note that *unlike* operations above the maximum size, atomic ops /// are still natively supported below the minimum; they just /// require a more complex expansion. unsigned getMinCmpXchgSizeInBits() const { return MinCmpXchgSizeInBits; } /// Whether the target supports unaligned atomic operations. bool supportsUnalignedAtomics() const { return SupportsUnalignedAtomics; } /// Whether AtomicExpandPass should automatically insert fences and reduce /// ordering for this atomic. This should be true for most architectures with /// weak memory ordering. Defaults to false. virtual bool shouldInsertFencesForAtomic(const Instruction *I) const { return false; } /// Perform a load-linked operation on Addr, returning a "Value *" with the /// corresponding pointee type. This may entail some non-trivial operations to /// truncate or reconstruct types that will be illegal in the backend. See /// ARMISelLowering for an example implementation. virtual Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { llvm_unreachable("Load linked unimplemented on this target"); } /// Perform a store-conditional operation to Addr. Return the status of the /// store. This should be 0 if the store succeeded, non-zero otherwise. virtual Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { llvm_unreachable("Store conditional unimplemented on this target"); } /// Inserts in the IR a target-specific intrinsic specifying a fence. /// It is called by AtomicExpandPass before expanding an /// AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad /// if shouldInsertFencesForAtomic returns true. /// /// Inst is the original atomic instruction, prior to other expansions that /// may be performed. /// /// This function should either return a nullptr, or a pointer to an IR-level /// Instruction*. Even complex fence sequences can be represented by a /// single Instruction* through an intrinsic to be lowered later. /// Backends should override this method to produce target-specific intrinsic /// for their fences. /// FIXME: Please note that the default implementation here in terms of /// IR-level fences exists for historical/compatibility reasons and is /// *unsound* ! Fences cannot, in general, be used to restore sequential /// consistency. For example, consider the following example: /// atomic x = y = 0; /// int r1, r2, r3, r4; /// Thread 0: /// x.store(1); /// Thread 1: /// y.store(1); /// Thread 2: /// r1 = x.load(); /// r2 = y.load(); /// Thread 3: /// r3 = y.load(); /// r4 = x.load(); /// r1 = r3 = 1 and r2 = r4 = 0 is impossible as long as the accesses are all /// seq_cst. But if they are lowered to monotonic accesses, no amount of /// IR-level fences can prevent it. /// @{ virtual Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore()) return Builder.CreateFence(Ord); else return nullptr; } virtual Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { if (isAcquireOrStronger(Ord)) return Builder.CreateFence(Ord); else return nullptr; } /// @} // Emits code that executes when the comparison result in the ll/sc // expansion of a cmpxchg instruction is such that the store-conditional will // not execute. This makes it possible to balance out the load-linked with // a dedicated instruction, if desired. // E.g., on ARM, if ldrex isn't followed by strex, the exclusive monitor would // be unnecessarily held, except if clrex, inserted by this hook, is executed. virtual void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const {} /// Returns true if the given (atomic) store should be expanded by the /// IR-level AtomicExpand pass into an "atomic xchg" which ignores its input. virtual bool shouldExpandAtomicStoreInIR(StoreInst *SI) const { return false; } /// Returns true if arguments should be sign-extended in lib calls. virtual bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const { return IsSigned; } /// Returns how the given (atomic) load should be expanded by the /// IR-level AtomicExpand pass. virtual AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const { return AtomicExpansionKind::None; } /// Returns true if the given atomic cmpxchg should be expanded by the /// IR-level AtomicExpand pass into a load-linked/store-conditional sequence /// (through emitLoadLinked() and emitStoreConditional()). virtual bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const { return false; } /// Returns how the IR-level AtomicExpand pass should expand the given /// AtomicRMW, if at all. Default is to never expand. virtual AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const { return AtomicExpansionKind::None; } /// On some platforms, an AtomicRMW that never actually modifies the value /// (such as fetch_add of 0) can be turned into a fence followed by an /// atomic load. This may sound useless, but it makes it possible for the /// processor to keep the cacheline shared, dramatically improving /// performance. And such idempotent RMWs are useful for implementing some /// kinds of locks, see for example (justification + benchmarks): /// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf /// This method tries doing that transformation, returning the atomic load if /// it succeeds, and nullptr otherwise. /// If shouldExpandAtomicLoadInIR returns true on that load, it will undergo /// another round of expansion. virtual LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *RMWI) const { return nullptr; } /// Returns how the platform's atomic operations are extended (ZERO_EXTEND, /// SIGN_EXTEND, or ANY_EXTEND). virtual ISD::NodeType getExtendForAtomicOps() const { return ISD::ZERO_EXTEND; } /// @} /// Returns true if we should normalize /// select(N0&N1, X, Y) => select(N0, select(N1, X, Y), Y) and /// select(N0|N1, X, Y) => select(N0, select(N1, X, Y, Y)) if it is likely /// that it saves us from materializing N0 and N1 in an integer register. /// Targets that are able to perform and/or on flags should return false here. virtual bool shouldNormalizeToSelectSequence(LLVMContext &Context, EVT VT) const { // If a target has multiple condition registers, then it likely has logical // operations on those registers. if (hasMultipleConditionRegisters()) return false; // Only do the transform if the value won't be split into multiple // registers. LegalizeTypeAction Action = getTypeAction(Context, VT); return Action != TypeExpandInteger && Action != TypeExpandFloat && Action != TypeSplitVector; } /// Return true if a select of constants (select Cond, C1, C2) should be /// transformed into simple math ops with the condition value. For example: /// select Cond, C1, C1-1 --> add (zext Cond), C1-1 virtual bool convertSelectOfConstantsToMath(EVT VT) const { return false; } //===--------------------------------------------------------------------===// // TargetLowering Configuration Methods - These methods should be invoked by // the derived class constructor to configure this object for the target. // protected: /// Specify how the target extends the result of integer and floating point /// boolean values from i1 to a wider type. See getBooleanContents. void setBooleanContents(BooleanContent Ty) { BooleanContents = Ty; BooleanFloatContents = Ty; } /// Specify how the target extends the result of integer and floating point /// boolean values from i1 to a wider type. See getBooleanContents. void setBooleanContents(BooleanContent IntTy, BooleanContent FloatTy) { BooleanContents = IntTy; BooleanFloatContents = FloatTy; } /// Specify how the target extends the result of a vector boolean value from a /// vector of i1 to a wider type. See getBooleanContents. void setBooleanVectorContents(BooleanContent Ty) { BooleanVectorContents = Ty; } /// Specify the target scheduling preference. void setSchedulingPreference(Sched::Preference Pref) { SchedPreferenceInfo = Pref; } /// Indicate whether this target prefers to use _setjmp to implement /// llvm.setjmp or the version without _. Defaults to false. void setUseUnderscoreSetJmp(bool Val) { UseUnderscoreSetJmp = Val; } /// Indicate whether this target prefers to use _longjmp to implement /// llvm.longjmp or the version without _. Defaults to false. void setUseUnderscoreLongJmp(bool Val) { UseUnderscoreLongJmp = Val; } /// Indicate the minimum number of blocks to generate jump tables. void setMinimumJumpTableEntries(unsigned Val); /// Indicate the maximum number of entries in jump tables. /// Set to zero to generate unlimited jump tables. void setMaximumJumpTableSize(unsigned); /// If set to a physical register, this specifies the register that /// llvm.savestack/llvm.restorestack should save and restore. void setStackPointerRegisterToSaveRestore(unsigned R) { StackPointerRegisterToSaveRestore = R; } /// Tells the code generator that the target has multiple (allocatable) /// condition registers that can be used to store the results of comparisons /// for use by selects and conditional branches. With multiple condition /// registers, the code generator will not aggressively sink comparisons into /// the blocks of their users. void setHasMultipleConditionRegisters(bool hasManyRegs = true) { HasMultipleConditionRegisters = hasManyRegs; } /// Tells the code generator that the target has BitExtract instructions. /// The code generator will aggressively sink "shift"s into the blocks of /// their users if the users will generate "and" instructions which can be /// combined with "shift" to BitExtract instructions. void setHasExtractBitsInsn(bool hasExtractInsn = true) { HasExtractBitsInsn = hasExtractInsn; } /// Tells the code generator not to expand logic operations on comparison /// predicates into separate sequences that increase the amount of flow /// control. void setJumpIsExpensive(bool isExpensive = true); /// Tells the code generator that this target supports floating point /// exceptions and cares about preserving floating point exception behavior. void setHasFloatingPointExceptions(bool FPExceptions = true) { HasFloatingPointExceptions = FPExceptions; } /// Tells the code generator which bitwidths to bypass. void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth) { BypassSlowDivWidths[SlowBitWidth] = FastBitWidth; } /// Add the specified register class as an available regclass for the /// specified value type. This indicates the selector can handle values of /// that class natively. void addRegisterClass(MVT VT, const TargetRegisterClass *RC) { assert((unsigned)VT.SimpleTy < array_lengthof(RegClassForVT)); RegClassForVT[VT.SimpleTy] = RC; } /// Return the largest legal super-reg register class of the register class /// for the specified type and its associated "cost". virtual std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const; /// Once all of the register classes are added, this allows us to compute /// derived properties we expose. void computeRegisterProperties(const TargetRegisterInfo *TRI); /// Indicate that the specified operation does not work with the specified /// type and indicate what to do about it. Note that VT may refer to either /// the type of a result or that of an operand of Op. void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action) { assert(Op < array_lengthof(OpActions[0]) && "Table isn't big enough!"); OpActions[(unsigned)VT.SimpleTy][Op] = Action; } /// Indicate that the specified load with extension does not work with the /// specified type and indicate what to do about it. void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action) { assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!"); assert((unsigned)Action < 0x10 && "too many bits for bitfield array"); unsigned Shift = 4 * ExtType; LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy] &= ~((uint16_t)0xF << Shift); LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy] |= (uint16_t)Action << Shift; } /// Indicate that the specified truncating store does not work with the /// specified type and indicate what to do about it. void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action) { assert(ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!"); TruncStoreActions[(unsigned)ValVT.SimpleTy][MemVT.SimpleTy] = Action; } /// Indicate that the specified indexed load does or does not work with the /// specified type and indicate what to do abort it. /// /// NOTE: All indexed mode loads are initialized to Expand in /// TargetLowering.cpp void setIndexedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action) { assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE && (unsigned)Action < 0xf && "Table isn't big enough!"); // Load action are kept in the upper half. IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0xf0; IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] |= ((uint8_t)Action) <<4; } /// Indicate that the specified indexed store does or does not work with the /// specified type and indicate what to do about it. /// /// NOTE: All indexed mode stores are initialized to Expand in /// TargetLowering.cpp void setIndexedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action) { assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE && (unsigned)Action < 0xf && "Table isn't big enough!"); // Store action are kept in the lower half. IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] &= ~0x0f; IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode] |= ((uint8_t)Action); } /// Indicate that the specified condition code is or isn't supported on the /// target and indicate what to do about it. void setCondCodeAction(ISD::CondCode CC, MVT VT, LegalizeAction Action) { assert(VT.isValid() && (unsigned)CC < array_lengthof(CondCodeActions) && "Table isn't big enough!"); assert((unsigned)Action < 0x10 && "too many bits for bitfield array"); /// The lower 3 bits of the SimpleTy index into Nth 4bit set from the 32-bit /// value and the upper 29 bits index into the second dimension of the array /// to select what 32-bit value to use. uint32_t Shift = 4 * (VT.SimpleTy & 0x7); CondCodeActions[CC][VT.SimpleTy >> 3] &= ~((uint32_t)0xF << Shift); CondCodeActions[CC][VT.SimpleTy >> 3] |= (uint32_t)Action << Shift; } /// If Opc/OrigVT is specified as being promoted, the promotion code defaults /// to trying a larger integer/fp until it can find one that works. If that /// default is insufficient, this method can be used by the target to override /// the default. void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) { PromoteToType[std::make_pair(Opc, OrigVT.SimpleTy)] = DestVT.SimpleTy; } /// Convenience method to set an operation to Promote and specify the type /// in a single call. void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) { setOperationAction(Opc, OrigVT, Promote); AddPromotedToType(Opc, OrigVT, DestVT); } /// Targets should invoke this method for each target independent node that /// they want to provide a custom DAG combiner for by implementing the /// PerformDAGCombine virtual method. void setTargetDAGCombine(ISD::NodeType NT) { assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray)); TargetDAGCombineArray[NT >> 3] |= 1 << (NT&7); } /// Set the target's required jmp_buf buffer size (in bytes); default is 200 void setJumpBufSize(unsigned Size) { JumpBufSize = Size; } /// Set the target's required jmp_buf buffer alignment (in bytes); default is /// 0 void setJumpBufAlignment(unsigned Align) { JumpBufAlignment = Align; } /// Set the target's minimum function alignment (in log2(bytes)) void setMinFunctionAlignment(unsigned Align) { MinFunctionAlignment = Align; } /// Set the target's preferred function alignment. This should be set if /// there is a performance benefit to higher-than-minimum alignment (in /// log2(bytes)) void setPrefFunctionAlignment(unsigned Align) { PrefFunctionAlignment = Align; } /// Set the target's preferred loop alignment. Default alignment is zero, it /// means the target does not care about loop alignment. The alignment is /// specified in log2(bytes). The target may also override /// getPrefLoopAlignment to provide per-loop values. void setPrefLoopAlignment(unsigned Align) { PrefLoopAlignment = Align; } /// Set the minimum stack alignment of an argument (in log2(bytes)). void setMinStackArgumentAlignment(unsigned Align) { MinStackArgumentAlignment = Align; } /// Set the maximum atomic operation size supported by the /// backend. Atomic operations greater than this size (as well as /// ones that are not naturally aligned), will be expanded by /// AtomicExpandPass into an __atomic_* library call. void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits) { MaxAtomicSizeInBitsSupported = SizeInBits; } /// Sets the minimum cmpxchg or ll/sc size supported by the backend. void setMinCmpXchgSizeInBits(unsigned SizeInBits) { MinCmpXchgSizeInBits = SizeInBits; } /// Sets whether unaligned atomic operations are supported. void setSupportsUnalignedAtomics(bool UnalignedSupported) { SupportsUnalignedAtomics = UnalignedSupported; } public: //===--------------------------------------------------------------------===// // Addressing mode description hooks (used by LSR etc). // /// CodeGenPrepare sinks address calculations into the same BB as Load/Store /// instructions reading the address. This allows as much computation as /// possible to be done in the address mode for that operand. This hook lets /// targets also pass back when this should be done on intrinsics which /// load/store. virtual bool getAddrModeArguments(IntrinsicInst * /*I*/, SmallVectorImpl &/*Ops*/, Type *&/*AccessTy*/) const { return false; } /// This represents an addressing mode of: /// BaseGV + BaseOffs + BaseReg + Scale*ScaleReg /// If BaseGV is null, there is no BaseGV. /// If BaseOffs is zero, there is no base offset. /// If HasBaseReg is false, there is no base register. /// If Scale is zero, there is no ScaleReg. Scale of 1 indicates a reg with /// no scale. struct AddrMode { GlobalValue *BaseGV = nullptr; int64_t BaseOffs = 0; bool HasBaseReg = false; int64_t Scale = 0; AddrMode() = default; }; /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. /// /// The type may be VoidTy, in which case only return true if the addressing /// mode is legal for a load/store of any legal type. TODO: Handle /// pre/postinc as well. /// /// If the address space cannot be determined, it will be -1. /// /// TODO: Remove default argument virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I = nullptr) const; /// Return the cost of the scaling factor used in the addressing mode /// represented by AM for this target, for a load/store of the specified type. /// /// If the AM is supported, the return value must be >= 0. /// If the AM is not supported, it returns a negative value. /// TODO: Handle pre/postinc as well. /// TODO: Remove default argument virtual int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS = 0) const { // Default: assume that any scaling factor used in a legal AM is free. if (isLegalAddressingMode(DL, AM, Ty, AS)) return 0; return -1; } /// Return true if the specified immediate is legal icmp immediate, that is /// the target has icmp instructions which can compare a register against the /// immediate without having to materialize the immediate into a register. virtual bool isLegalICmpImmediate(int64_t) const { return true; } /// Return true if the specified immediate is legal add immediate, that is the /// target has add instructions which can add a register with the immediate /// without having to materialize the immediate into a register. virtual bool isLegalAddImmediate(int64_t) const { return true; } /// Return true if it's significantly cheaper to shift a vector by a uniform /// scalar than by an amount which will vary across each lane. On x86, for /// example, there is a "psllw" instruction for the former case, but no simple /// instruction for a general "a << b" operation on vectors. virtual bool isVectorShiftByScalarCheap(Type *Ty) const { return false; } /// Returns true if the opcode is a commutative binary operation. virtual bool isCommutativeBinOp(unsigned Opcode) const { // FIXME: This should get its info from the td file. switch (Opcode) { case ISD::ADD: case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: case ISD::MUL: case ISD::MULHU: case ISD::MULHS: case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: case ISD::FADD: case ISD::FMUL: case ISD::AND: case ISD::OR: case ISD::XOR: case ISD::SADDO: case ISD::UADDO: case ISD::ADDC: case ISD::ADDE: case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINNAN: case ISD::FMAXNAN: return true; default: return false; } } /// Return true if it's free to truncate a value of type FromTy to type /// ToTy. e.g. On x86 it's free to truncate a i32 value in register EAX to i16 /// by referencing its sub-register AX. /// Targets must return false when FromTy <= ToTy. virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const { return false; } /// Return true if a truncation from FromTy to ToTy is permitted when deciding /// whether a call is in tail position. Typically this means that both results /// would be assigned to the same register or stack slot, but it could mean /// the target performs adequate checks of its own before proceeding with the /// tail call. Targets must return false when FromTy <= ToTy. virtual bool allowTruncateForTailCall(Type *FromTy, Type *ToTy) const { return false; } virtual bool isTruncateFree(EVT FromVT, EVT ToVT) const { return false; } virtual bool isProfitableToHoist(Instruction *I) const { return true; } /// Return true if the extension represented by \p I is free. /// Unlikely the is[Z|FP]ExtFree family which is based on types, /// this method can use the context provided by \p I to decide /// whether or not \p I is free. /// This method extends the behavior of the is[Z|FP]ExtFree family. /// In other words, if is[Z|FP]Free returns true, then this method /// returns true as well. The converse is not true. /// The target can perform the adequate checks by overriding isExtFreeImpl. /// \pre \p I must be a sign, zero, or fp extension. bool isExtFree(const Instruction *I) const { switch (I->getOpcode()) { case Instruction::FPExt: if (isFPExtFree(EVT::getEVT(I->getType()), EVT::getEVT(I->getOperand(0)->getType()))) return true; break; case Instruction::ZExt: if (isZExtFree(I->getOperand(0)->getType(), I->getType())) return true; break; case Instruction::SExt: break; default: llvm_unreachable("Instruction is not an extension"); } return isExtFreeImpl(I); } /// Return true if \p Load and \p Ext can form an ExtLoad. /// For example, in AArch64 /// %L = load i8, i8* %ptr /// %E = zext i8 %L to i32 /// can be lowered into one load instruction /// ldrb w0, [x0] bool isExtLoad(const LoadInst *Load, const Instruction *Ext, const DataLayout &DL) const { EVT VT = getValueType(DL, Ext->getType()); EVT LoadVT = getValueType(DL, Load->getType()); // If the load has other users and the truncate is not free, the ext // probably isn't free. if (!Load->hasOneUse() && (isTypeLegal(LoadVT) || !isTypeLegal(VT)) && !isTruncateFree(Ext->getType(), Load->getType())) return false; // Check whether the target supports casts folded into loads. unsigned LType; if (isa(Ext)) LType = ISD::ZEXTLOAD; else { assert(isa(Ext) && "Unexpected ext type!"); LType = ISD::SEXTLOAD; } return isLoadExtLegal(LType, VT, LoadVT); } /// Return true if any actual instruction that defines a value of type FromTy /// implicitly zero-extends the value to ToTy in the result register. /// /// The function should return true when it is likely that the truncate can /// be freely folded with an instruction defining a value of FromTy. If /// the defining instruction is unknown (because you're looking at a /// function argument, PHI, etc.) then the target may require an /// explicit truncate, which is not necessarily free, but this function /// does not deal with those cases. /// Targets must return false when FromTy >= ToTy. virtual bool isZExtFree(Type *FromTy, Type *ToTy) const { return false; } virtual bool isZExtFree(EVT FromTy, EVT ToTy) const { return false; } /// Return true if the target supplies and combines to a paired load /// two loaded values of type LoadedType next to each other in memory. /// RequiredAlignment gives the minimal alignment constraints that must be met /// to be able to select this paired load. /// /// This information is *not* used to generate actual paired loads, but it is /// used to generate a sequence of loads that is easier to combine into a /// paired load. /// For instance, something like this: /// a = load i64* addr /// b = trunc i64 a to i32 /// c = lshr i64 a, 32 /// d = trunc i64 c to i32 /// will be optimized into: /// b = load i32* addr1 /// d = load i32* addr2 /// Where addr1 = addr2 +/- sizeof(i32). /// /// In other words, unless the target performs a post-isel load combining, /// this information should not be provided because it will generate more /// loads. virtual bool hasPairedLoad(EVT /*LoadedType*/, unsigned & /*RequiredAlignment*/) const { return false; } /// Return true if the target has a vector blend instruction. virtual bool hasVectorBlend() const { return false; } /// Get the maximum supported factor for interleaved memory accesses. /// Default to be the minimum interleave factor: 2. virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; } /// Lower an interleaved load to target specific intrinsics. Return /// true on success. /// /// \p LI is the vector load instruction. /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector. /// \p Indices is the corresponding indices for each shufflevector. /// \p Factor is the interleave factor. virtual bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { return false; } /// Lower an interleaved store to target specific intrinsics. Return /// true on success. /// /// \p SI is the vector store instruction. /// \p SVI is the shufflevector to RE-interleave the stored vector. /// \p Factor is the interleave factor. virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { return false; } /// Return true if zero-extending the specific node Val to type VT2 is free /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or /// because it's folded such as X86 zero-extending loads). virtual bool isZExtFree(SDValue Val, EVT VT2) const { return isZExtFree(Val.getValueType(), VT2); } /// Return true if an fpext operation is free (for instance, because /// single-precision floating-point numbers are implicitly extended to /// double-precision). virtual bool isFPExtFree(EVT DestVT, EVT SrcVT) const { assert(SrcVT.isFloatingPoint() && DestVT.isFloatingPoint() && "invalid fpext types"); return false; } /// Return true if an fpext operation input to an \p Opcode operation is free /// (for instance, because half-precision floating-point numbers are /// implicitly extended to float-precision) for an FMA instruction. virtual bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const { assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() && "invalid fpext types"); return isFPExtFree(DestVT, SrcVT); } /// Return true if folding a vector load into ExtVal (a sign, zero, or any /// extend node) is profitable. virtual bool isVectorLoadExtDesirable(SDValue ExtVal) const { return false; } /// Return true if an fneg operation is free to the point where it is never /// worthwhile to replace it with a bitwise operation. virtual bool isFNegFree(EVT VT) const { assert(VT.isFloatingPoint()); return false; } /// Return true if an fabs operation is free to the point where it is never /// worthwhile to replace it with a bitwise operation. virtual bool isFAbsFree(EVT VT) const { assert(VT.isFloatingPoint()); return false; } /// Return true if an FMA operation is faster than a pair of fmul and fadd /// instructions. fmuladd intrinsics will be expanded to FMAs when this method /// returns true, otherwise fmuladd is expanded to fmul + fadd. /// /// NOTE: This may be called before legalization on types for which FMAs are /// not legal, but should return true if those types will eventually legalize /// to types that support FMAs. After legalization, it will only be called on /// types that support FMAs (via Legal or Custom actions) virtual bool isFMAFasterThanFMulAndFAdd(EVT) const { return false; } /// Return true if it's profitable to narrow operations of type VT1 to /// VT2. e.g. on x86, it's profitable to narrow from i32 to i8 but not from /// i32 to i16. virtual bool isNarrowingProfitable(EVT /*VT1*/, EVT /*VT2*/) const { return false; } /// Return true if it is beneficial to convert a load of a constant to /// just the constant itself. /// On some targets it might be more efficient to use a combination of /// arithmetic instructions to materialize the constant instead of loading it /// from a constant pool. virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { return false; } /// Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type /// from this source type with this index. This is needed because /// EXTRACT_SUBVECTOR usually has custom lowering that depends on the index of /// the first element, and only the target knows which lowering is cheap. virtual bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { return false; } // Return true if it is profitable to use a scalar input to a BUILD_VECTOR // even if the vector itself has multiple uses. virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const { return false; } // Return true if CodeGenPrepare should consider splitting large offset of a // GEP to make the GEP fit into the addressing mode and can be sunk into the // same blocks of its users. virtual bool shouldConsiderGEPOffsetSplit() const { return false; } //===--------------------------------------------------------------------===// // Runtime Library hooks // /// Rename the default libcall routine name for the specified libcall. void setLibcallName(RTLIB::Libcall Call, const char *Name) { LibcallRoutineNames[Call] = Name; } /// Get the libcall routine name for the specified libcall. const char *getLibcallName(RTLIB::Libcall Call) const { return LibcallRoutineNames[Call]; } /// Override the default CondCode to be used to test the result of the /// comparison libcall against zero. void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) { CmpLibcallCCs[Call] = CC; } /// Get the CondCode that's to be used to test the result of the comparison /// libcall against zero. ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const { return CmpLibcallCCs[Call]; } /// Set the CallingConv that should be used for the specified libcall. void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) { LibcallCallingConvs[Call] = CC; } /// Get the CallingConv that should be used for the specified libcall. CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const { return LibcallCallingConvs[Call]; } /// Execute target specific actions to finalize target lowering. /// This is used to set extra flags in MachineFrameInformation and freezing /// the set of reserved registers. /// The default implementation just freezes the set of reserved registers. virtual void finalizeLowering(MachineFunction &MF) const; private: const TargetMachine &TM; /// Tells the code generator that the target has multiple (allocatable) /// condition registers that can be used to store the results of comparisons /// for use by selects and conditional branches. With multiple condition /// registers, the code generator will not aggressively sink comparisons into /// the blocks of their users. bool HasMultipleConditionRegisters; /// Tells the code generator that the target has BitExtract instructions. /// The code generator will aggressively sink "shift"s into the blocks of /// their users if the users will generate "and" instructions which can be /// combined with "shift" to BitExtract instructions. bool HasExtractBitsInsn; /// Tells the code generator to bypass slow divide or remainder /// instructions. For example, BypassSlowDivWidths[32,8] tells the code /// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer /// div/rem when the operands are positive and less than 256. DenseMap BypassSlowDivWidths; /// Tells the code generator that it shouldn't generate extra flow control /// instructions and should attempt to combine flow control instructions via /// predication. bool JumpIsExpensive; /// Whether the target supports or cares about preserving floating point /// exception behavior. bool HasFloatingPointExceptions; /// This target prefers to use _setjmp to implement llvm.setjmp. /// /// Defaults to false. bool UseUnderscoreSetJmp; /// This target prefers to use _longjmp to implement llvm.longjmp. /// /// Defaults to false. bool UseUnderscoreLongJmp; /// Information about the contents of the high-bits in boolean values held in /// a type wider than i1. See getBooleanContents. BooleanContent BooleanContents; /// Information about the contents of the high-bits in boolean values held in /// a type wider than i1. See getBooleanContents. BooleanContent BooleanFloatContents; /// Information about the contents of the high-bits in boolean vector values /// when the element type is wider than i1. See getBooleanContents. BooleanContent BooleanVectorContents; /// The target scheduling preference: shortest possible total cycles or lowest /// register usage. Sched::Preference SchedPreferenceInfo; /// The size, in bytes, of the target's jmp_buf buffers unsigned JumpBufSize; /// The alignment, in bytes, of the target's jmp_buf buffers unsigned JumpBufAlignment; /// The minimum alignment that any argument on the stack needs to have. unsigned MinStackArgumentAlignment; /// The minimum function alignment (used when optimizing for size, and to /// prevent explicitly provided alignment from leading to incorrect code). unsigned MinFunctionAlignment; /// The preferred function alignment (used when alignment unspecified and /// optimizing for speed). unsigned PrefFunctionAlignment; /// The preferred loop alignment. unsigned PrefLoopAlignment; /// Size in bits of the maximum atomics size the backend supports. /// Accesses larger than this will be expanded by AtomicExpandPass. unsigned MaxAtomicSizeInBitsSupported; /// Size in bits of the minimum cmpxchg or ll/sc operation the /// backend supports. unsigned MinCmpXchgSizeInBits; /// This indicates if the target supports unaligned atomic operations. bool SupportsUnalignedAtomics; /// If set to a physical register, this specifies the register that /// llvm.savestack/llvm.restorestack should save and restore. unsigned StackPointerRegisterToSaveRestore; /// This indicates the default register class to use for each ValueType the /// target supports natively. const TargetRegisterClass *RegClassForVT[MVT::LAST_VALUETYPE]; unsigned char NumRegistersForVT[MVT::LAST_VALUETYPE]; MVT RegisterTypeForVT[MVT::LAST_VALUETYPE]; /// This indicates the "representative" register class to use for each /// ValueType the target supports natively. This information is used by the /// scheduler to track register pressure. By default, the representative /// register class is the largest legal super-reg register class of the /// register class of the specified type. e.g. On x86, i8, i16, and i32's /// representative class would be GR32. const TargetRegisterClass *RepRegClassForVT[MVT::LAST_VALUETYPE]; /// This indicates the "cost" of the "representative" register class for each /// ValueType. The cost is used by the scheduler to approximate register /// pressure. uint8_t RepRegClassCostForVT[MVT::LAST_VALUETYPE]; /// For any value types we are promoting or expanding, this contains the value /// type that we are changing to. For Expanded types, this contains one step /// of the expand (e.g. i64 -> i32), even if there are multiple steps required /// (e.g. i64 -> i16). For types natively supported by the system, this holds /// the same type (e.g. i32 -> i32). MVT TransformToType[MVT::LAST_VALUETYPE]; /// For each operation and each value type, keep a LegalizeAction that /// indicates how instruction selection should deal with the operation. Most /// operations are Legal (aka, supported natively by the target), but /// operations that are not should be described. Note that operations on /// non-legal value types are not described here. LegalizeAction OpActions[MVT::LAST_VALUETYPE][ISD::BUILTIN_OP_END]; /// For each load extension type and each value type, keep a LegalizeAction /// that indicates how instruction selection should deal with a load of a /// specific value type and extension type. Uses 4-bits to store the action /// for each of the 4 load ext types. uint16_t LoadExtActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE]; /// For each value type pair keep a LegalizeAction that indicates whether a /// truncating store of a specific value type and truncating type is legal. LegalizeAction TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE]; /// For each indexed mode and each value type, keep a pair of LegalizeAction /// that indicates how instruction selection should deal with the load / /// store. /// /// The first dimension is the value_type for the reference. The second /// dimension represents the various modes for load store. uint8_t IndexedModeActions[MVT::LAST_VALUETYPE][ISD::LAST_INDEXED_MODE]; /// For each condition code (ISD::CondCode) keep a LegalizeAction that /// indicates how instruction selection should deal with the condition code. /// /// Because each CC action takes up 4 bits, we need to have the array size be /// large enough to fit all of the value types. This can be done by rounding /// up the MVT::LAST_VALUETYPE value to the next multiple of 8. uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE + 7) / 8]; protected: ValueTypeActionImpl ValueTypeActions; private: LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const; /// Targets can specify ISD nodes that they would like PerformDAGCombine /// callbacks for by calling setTargetDAGCombine(), which sets a bit in this /// array. unsigned char TargetDAGCombineArray[(ISD::BUILTIN_OP_END+CHAR_BIT-1)/CHAR_BIT]; /// For operations that must be promoted to a specific type, this holds the /// destination type. This map should be sparse, so don't hold it as an /// array. /// /// Targets add entries to this map with AddPromotedToType(..), clients access /// this with getTypeToPromoteTo(..). std::map, MVT::SimpleValueType> PromoteToType; /// Stores the name each libcall. const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1]; /// The ISD::CondCode that should be used to test the result of each of the /// comparison libcall against zero. ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL]; /// Stores the CallingConv that should be used for each libcall. CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL]; /// Set default libcall names and calling conventions. void InitLibcalls(const Triple &TT); protected: /// Return true if the extension represented by \p I is free. /// \pre \p I is a sign, zero, or fp extension and /// is[Z|FP]ExtFree of the related types is not true. virtual bool isExtFreeImpl(const Instruction *I) const { return false; } /// Depth that GatherAllAliases should should continue looking for chain /// dependencies when trying to find a more preferable chain. As an /// approximation, this should be more than the number of consecutive stores /// expected to be merged. unsigned GatherAllAliasesMaxDepth; /// Specify maximum number of store instructions per memset call. /// /// When lowering \@llvm.memset this field specifies the maximum number of /// store operations that may be substituted for the call to memset. Targets /// must set this value based on the cost threshold for that target. Targets /// should assume that the memset will be done using as many of the largest /// store operations first, followed by smaller ones, if necessary, per /// alignment restrictions. For example, storing 9 bytes on a 32-bit machine /// with 16-bit alignment would result in four 2-byte stores and one 1-byte /// store. This only applies to setting a constant array of a constant size. unsigned MaxStoresPerMemset; /// Maximum number of stores operations that may be substituted for the call /// to memset, used for functions with OptSize attribute. unsigned MaxStoresPerMemsetOptSize; /// Specify maximum bytes of store instructions per memcpy call. /// /// When lowering \@llvm.memcpy this field specifies the maximum number of /// store operations that may be substituted for a call to memcpy. Targets /// must set this value based on the cost threshold for that target. Targets /// should assume that the memcpy will be done using as many of the largest /// store operations first, followed by smaller ones, if necessary, per /// alignment restrictions. For example, storing 7 bytes on a 32-bit machine /// with 32-bit alignment would result in one 4-byte store, a one 2-byte store /// and one 1-byte store. This only applies to copying a constant array of /// constant size. unsigned MaxStoresPerMemcpy; /// \brief Specify max number of store instructions to glue in inlined memcpy. /// /// When memcpy is inlined based on MaxStoresPerMemcpy, specify maximum number /// of store instructions to keep together. This helps in pairing and // vectorization later on. unsigned MaxGluedStoresPerMemcpy = 0; /// Maximum number of store operations that may be substituted for a call to /// memcpy, used for functions with OptSize attribute. unsigned MaxStoresPerMemcpyOptSize; unsigned MaxLoadsPerMemcmp; unsigned MaxLoadsPerMemcmpOptSize; /// Specify maximum bytes of store instructions per memmove call. /// /// When lowering \@llvm.memmove this field specifies the maximum number of /// store instructions that may be substituted for a call to memmove. Targets /// must set this value based on the cost threshold for that target. Targets /// should assume that the memmove will be done using as many of the largest /// store operations first, followed by smaller ones, if necessary, per /// alignment restrictions. For example, moving 9 bytes on a 32-bit machine /// with 8-bit alignment would result in nine 1-byte stores. This only /// applies to copying a constant array of constant size. unsigned MaxStoresPerMemmove; /// Maximum number of store instructions that may be substituted for a call to /// memmove, used for functions with OptSize attribute. unsigned MaxStoresPerMemmoveOptSize; /// Tells the code generator that select is more expensive than a branch if /// the branch is usually predicted right. bool PredictableSelectIsExpensive; /// \see enableExtLdPromotion. bool EnableExtLdPromotion; /// Return true if the value types that can be represented by the specified /// register class are all legal. bool isLegalRC(const TargetRegisterInfo &TRI, const TargetRegisterClass &RC) const; /// Replace/modify any TargetFrameIndex operands with a targte-dependent /// sequence of memory operands that is recognized by PrologEpilogInserter. MachineBasicBlock *emitPatchPoint(MachineInstr &MI, MachineBasicBlock *MBB) const; /// Replace/modify the XRay custom event operands with target-dependent /// details. MachineBasicBlock *emitXRayCustomEvent(MachineInstr &MI, MachineBasicBlock *MBB) const; /// Replace/modify the XRay typed event operands with target-dependent /// details. MachineBasicBlock *emitXRayTypedEvent(MachineInstr &MI, MachineBasicBlock *MBB) const; }; /// This class defines information used to lower LLVM code to legal SelectionDAG /// operators that the target instruction selector can accept natively. /// /// This class also defines callbacks that targets must implement to lower /// target-specific constructs to SelectionDAG operators. class TargetLowering : public TargetLoweringBase { public: struct DAGCombinerInfo; TargetLowering(const TargetLowering &) = delete; TargetLowering &operator=(const TargetLowering &) = delete; /// NOTE: The TargetMachine owns TLOF. explicit TargetLowering(const TargetMachine &TM); bool isPositionIndependent() const; virtual bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, DivergenceAnalysis *DA) const { return false; } virtual bool isSDNodeAlwaysUniform(const SDNode * N) const { return false; } /// Returns true by value, base pointer and offset pointer and addressing mode /// by reference if the node's address can be legally represented as /// pre-indexed load / store address. virtual bool getPreIndexedAddressParts(SDNode * /*N*/, SDValue &/*Base*/, SDValue &/*Offset*/, ISD::MemIndexedMode &/*AM*/, SelectionDAG &/*DAG*/) const { return false; } /// Returns true by value, base pointer and offset pointer and addressing mode /// by reference if this node can be combined with a load / store to form a /// post-indexed load / store. virtual bool getPostIndexedAddressParts(SDNode * /*N*/, SDNode * /*Op*/, SDValue &/*Base*/, SDValue &/*Offset*/, ISD::MemIndexedMode &/*AM*/, SelectionDAG &/*DAG*/) const { return false; } /// Return the entry encoding for a jump table in the current function. The /// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum. virtual unsigned getJumpTableEncoding() const; virtual const MCExpr * LowerCustomJumpTableEntry(const MachineJumpTableInfo * /*MJTI*/, const MachineBasicBlock * /*MBB*/, unsigned /*uid*/, MCContext &/*Ctx*/) const { llvm_unreachable("Need to implement this hook if target has custom JTIs"); } /// Returns relocation base for the given PIC jumptable. virtual SDValue getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const; /// This returns the relocation base for the given PIC jumptable, the same as /// getPICJumpTableRelocBase, but as an MCExpr. virtual const MCExpr * getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const; /// Return true if folding a constant offset with the given GlobalAddress is /// legal. It is frequently not legal in PIC relocation models. virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; bool isInTailCallPosition(SelectionDAG &DAG, SDNode *Node, SDValue &Chain) const; void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL) const; /// Returns a pair of (return value, chain). /// It is an error to pass RTLIB::UNKNOWN_LIBCALL as \p LC. std::pair makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT, ArrayRef Ops, bool isSigned, const SDLoc &dl, bool doesNotReturn = false, bool isReturnValueUsed = true) const; /// Check whether parameters to a call that are passed in callee saved /// registers are the same as from the calling function. This needs to be /// checked for tail call eligibility. bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl &ArgLocs, const SmallVectorImpl &OutVals) const; //===--------------------------------------------------------------------===// // TargetLowering Optimization Methods // /// A convenience struct that encapsulates a DAG, and two SDValues for /// returning information from TargetLowering to its clients that want to /// combine. struct TargetLoweringOpt { SelectionDAG &DAG; bool LegalTys; bool LegalOps; SDValue Old; SDValue New; explicit TargetLoweringOpt(SelectionDAG &InDAG, bool LT, bool LO) : DAG(InDAG), LegalTys(LT), LegalOps(LO) {} bool LegalTypes() const { return LegalTys; } bool LegalOperations() const { return LegalOps; } bool CombineTo(SDValue O, SDValue N) { Old = O; New = N; return true; } }; /// Check to see if the specified operand of the specified instruction is a /// constant integer. If so, check to see if there are any bits set in the /// constant that are not demanded. If so, shrink the constant and return /// true. bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const; // Target hook to do target-specific const optimization, which is called by // ShrinkDemandedConstant. This function should return true if the target // doesn't want ShrinkDemandedConstant to further optimize the constant. virtual bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { return false; } /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free. This /// uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be /// generalized for targets with other types of implicit widening casts. bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded, TargetLoweringOpt &TLO) const; /// Helper for SimplifyDemandedBits that can simplify an operation with /// multiple uses. This function simplifies operand \p OpIdx of \p User and /// then updates \p User with the simplified version. No other uses of /// \p OpIdx are updated. If \p User is the only user of \p OpIdx, this /// function behaves exactly like function SimplifyDemandedBits declared /// below except that it also updates the DAG by calling /// DCI.CommitTargetLoweringOpt. bool SimplifyDemandedBits(SDNode *User, unsigned OpIdx, const APInt &Demanded, DAGCombinerInfo &DCI, TargetLoweringOpt &TLO) const; /// Look at Op. At this point, we know that only the DemandedMask bits of the /// result of Op are ever used downstream. If we can use this information to /// simplify Op, create a new simplified DAG node and return true, returning /// the original and new nodes in Old and New. Otherwise, analyze the /// expression and return a mask of KnownOne and KnownZero bits for the /// expression (used to simplify the caller). The KnownZero/One bits may only /// be accurate for those bits in the DemandedMask. /// \p AssumeSingleUse When this parameter is true, this function will /// attempt to simplify \p Op even if there are multiple uses. /// Callers are responsible for correctly updating the DAG based on the /// results of this function, because simply replacing replacing TLO.Old /// with TLO.New will be incorrect when this parameter is true and TLO.Old /// has multiple uses. bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth = 0, bool AssumeSingleUse = false) const; /// Helper wrapper around SimplifyDemandedBits bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask, DAGCombinerInfo &DCI) const; /// Look at Vector Op. At this point, we know that only the DemandedElts /// elements of the result of Op are ever used downstream. If we can use /// this information to simplify Op, create a new simplified DAG node and /// return true, storing the original and new nodes in TLO. /// Otherwise, analyze the expression and return a mask of KnownUndef and /// KnownZero elements for the expression (used to simplify the caller). /// The KnownUndef/Zero elements may only be accurate for those bits /// in the DemandedMask. /// \p AssumeSingleUse When this parameter is true, this function will /// attempt to simplify \p Op even if there are multiple uses. /// Callers are responsible for correctly updating the DAG based on the /// results of this function, because simply replacing replacing TLO.Old /// with TLO.New will be incorrect when this parameter is true and TLO.Old /// has multiple uses. bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0, bool AssumeSingleUse = false) const; /// Helper wrapper around SimplifyDemandedVectorElts bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, DAGCombinerInfo &DCI) const; /// Determine which of the bits specified in Mask are known to be either zero /// or one and return them in the KnownZero/KnownOne bitsets. The DemandedElts /// argument allows us to only collect the known bits that are shared by the /// requested vector elements. virtual void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const; /// Determine which of the bits of FrameIndex \p FIOp are known to be 0. /// Default implementation computes low bits based on alignment /// information. This should preserve known bits passed into it. virtual void computeKnownBitsForFrameIndex(const SDValue FIOp, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const; /// This method can be implemented by targets that want to expose additional /// information about sign bits to the DAG Combiner. The DemandedElts /// argument allows us to only collect the minimum sign bits that are shared /// by the requested vector elements. virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const; /// Attempt to simplify any target nodes based on the demanded vector /// elements, returning true on success. Otherwise, analyze the expression and /// return a mask of KnownUndef and KnownZero elements for the expression /// (used to simplify the caller). The KnownUndef/Zero elements may only be /// accurate for those bits in the DemandedMask virtual bool SimplifyDemandedVectorEltsForTargetNode( SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const; struct DAGCombinerInfo { void *DC; // The DAG Combiner object. CombineLevel Level; bool CalledByLegalizer; public: SelectionDAG &DAG; DAGCombinerInfo(SelectionDAG &dag, CombineLevel level, bool cl, void *dc) : DC(dc), Level(level), CalledByLegalizer(cl), DAG(dag) {} bool isBeforeLegalize() const { return Level == BeforeLegalizeTypes; } bool isBeforeLegalizeOps() const { return Level < AfterLegalizeVectorOps; } bool isAfterLegalizeDAG() const { return Level == AfterLegalizeDAG; } CombineLevel getDAGCombineLevel() { return Level; } bool isCalledByLegalizer() const { return CalledByLegalizer; } void AddToWorklist(SDNode *N); SDValue CombineTo(SDNode *N, ArrayRef To, bool AddTo = true); SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true); SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true); void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO); }; /// Return if the N is a constant or constant vector equal to the true value /// from getBooleanContents(). bool isConstTrueVal(const SDNode *N) const; /// Return if the N is a constant or constant vector equal to the false value /// from getBooleanContents(). bool isConstFalseVal(const SDNode *N) const; /// Return if \p N is a True value when extended to \p VT. bool isExtendedTrueVal(const ConstantSDNode *N, EVT VT, bool SExt) const; /// Try to simplify a setcc built with the specified operands and cc. If it is /// unable to simplify it, return a null SDValue. SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, bool foldBooleans, DAGCombinerInfo &DCI, const SDLoc &dl) const; // For targets which wrap address, unwrap for analysis. virtual SDValue unwrapAddress(SDValue N) const { return N; } /// Returns true (and the GlobalValue and the offset) if the node is a /// GlobalAddress + offset. virtual bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const; /// This method will be invoked for all target nodes and for any /// target-independent nodes that the target has registered with invoke it /// for. /// /// The semantics are as follows: /// Return Value: /// SDValue.Val == 0 - No change was made /// SDValue.Val == N - N was replaced, is dead, and is already handled. /// otherwise - N should be replaced by the returned Operand. /// /// In addition, methods provided by DAGCombinerInfo may be used to perform /// more complex transformations. /// virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; - /// Return true if it is profitable to move a following shift through this - // node, adjusting any immediate operands as necessary to preserve semantics. - // This transformation may not be desirable if it disrupts a particularly - // auspicious target-specific tree (e.g. bitfield extraction in AArch64). - // By default, it returns true. - virtual bool isDesirableToCommuteWithShift(const SDNode *N) const { + /// Return true if it is profitable to move this shift by a constant amount + /// though its operand, adjusting any immediate operands as necessary to + /// preserve semantics. This transformation may not be desirable if it + /// disrupts a particularly auspicious target-specific tree (e.g. bitfield + /// extraction in AArch64). By default, it returns true. + /// + /// @param N the shift node + /// @param Level the current DAGCombine legalization level. + virtual bool isDesirableToCommuteWithShift(const SDNode *N, + CombineLevel Level) const { return true; } // Return true if it is profitable to combine a BUILD_VECTOR with a stride-pattern // to a shuffle and a truncate. // Example of such a combine: // v4i32 build_vector((extract_elt V, 1), // (extract_elt V, 3), // (extract_elt V, 5), // (extract_elt V, 7)) // --> // v4i32 truncate (bitcast (shuffle<1,u,3,u,5,u,7,u> V, u) to v4i64) virtual bool isDesirableToCombineBuildVectorToShuffleTruncate( ArrayRef ShuffleMask, EVT SrcVT, EVT TruncVT) const { return false; } /// Return true if the target has native support for the specified value type /// and it is 'desirable' to use the type for the given node type. e.g. On x86 /// i16 is legal, but undesirable since i16 instruction encodings are longer /// and some i16 instructions are slow. virtual bool isTypeDesirableForOp(unsigned /*Opc*/, EVT VT) const { // By default, assume all legal types are desirable. return isTypeLegal(VT); } /// Return true if it is profitable for dag combiner to transform a floating /// point op of specified opcode to a equivalent op of an integer /// type. e.g. f32 load -> i32 load can be profitable on ARM. virtual bool isDesirableToTransformToIntegerOp(unsigned /*Opc*/, EVT /*VT*/) const { return false; } /// This method query the target whether it is beneficial for dag combiner to /// promote the specified node. If true, it should return the desired /// promotion type by reference. virtual bool IsDesirableToPromoteOp(SDValue /*Op*/, EVT &/*PVT*/) const { return false; } /// Return true if the target supports swifterror attribute. It optimizes /// loads and stores to reading and writing a specific register. virtual bool supportSwiftError() const { return false; } /// Return true if the target supports that a subset of CSRs for the given /// machine function is handled explicitly via copies. virtual bool supportSplitCSR(MachineFunction *MF) const { return false; } /// Perform necessary initialization to handle a subset of CSRs explicitly /// via copies. This function is called at the beginning of instruction /// selection. virtual void initializeSplitCSR(MachineBasicBlock *Entry) const { llvm_unreachable("Not Implemented"); } /// Insert explicit copies in entry and exit blocks. We copy a subset of /// CSRs to virtual registers in the entry block, and copy them back to /// physical registers in the exit blocks. This function is called at the end /// of instruction selection. virtual void insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { llvm_unreachable("Not Implemented"); } //===--------------------------------------------------------------------===// // Lowering methods - These methods must be implemented by targets so that // the SelectionDAGBuilder code knows how to lower these. // /// This hook must be implemented to lower the incoming (formal) arguments, /// described by the Ins array, into the specified DAG. The implementation /// should fill in the InVals array with legal-type argument values, and /// return the resulting token chain value. virtual SDValue LowerFormalArguments( SDValue /*Chain*/, CallingConv::ID /*CallConv*/, bool /*isVarArg*/, const SmallVectorImpl & /*Ins*/, const SDLoc & /*dl*/, SelectionDAG & /*DAG*/, SmallVectorImpl & /*InVals*/) const { llvm_unreachable("Not Implemented"); } /// This structure contains all information that is necessary for lowering /// calls. It is passed to TLI::LowerCallTo when the SelectionDAG builder /// needs to lower a call, and targets will see this struct in their LowerCall /// implementation. struct CallLoweringInfo { SDValue Chain; Type *RetTy = nullptr; bool RetSExt : 1; bool RetZExt : 1; bool IsVarArg : 1; bool IsInReg : 1; bool DoesNotReturn : 1; bool IsReturnValueUsed : 1; bool IsConvergent : 1; bool IsPatchPoint : 1; // IsTailCall should be modified by implementations of // TargetLowering::LowerCall that perform tail call conversions. bool IsTailCall = false; // Is Call lowering done post SelectionDAG type legalization. bool IsPostTypeLegalization = false; unsigned NumFixedArgs = -1; CallingConv::ID CallConv = CallingConv::C; SDValue Callee; ArgListTy Args; SelectionDAG &DAG; SDLoc DL; ImmutableCallSite CS; SmallVector Outs; SmallVector OutVals; SmallVector Ins; SmallVector InVals; CallLoweringInfo(SelectionDAG &DAG) : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true), IsConvergent(false), IsPatchPoint(false), DAG(DAG) {} CallLoweringInfo &setDebugLoc(const SDLoc &dl) { DL = dl; return *this; } CallLoweringInfo &setChain(SDValue InChain) { Chain = InChain; return *this; } // setCallee with target/module-specific attributes CallLoweringInfo &setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList) { RetTy = ResultType; Callee = Target; CallConv = CC; NumFixedArgs = ArgsList.size(); Args = std::move(ArgsList); DAG.getTargetLoweringInfo().markLibCallAttributes( &(DAG.getMachineFunction()), CC, Args); return *this; } CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList) { RetTy = ResultType; Callee = Target; CallConv = CC; NumFixedArgs = ArgsList.size(); Args = std::move(ArgsList); return *this; } CallLoweringInfo &setCallee(Type *ResultType, FunctionType *FTy, SDValue Target, ArgListTy &&ArgsList, ImmutableCallSite Call) { RetTy = ResultType; IsInReg = Call.hasRetAttr(Attribute::InReg); DoesNotReturn = Call.doesNotReturn() || (!Call.isInvoke() && isa(Call.getInstruction()->getNextNode())); IsVarArg = FTy->isVarArg(); IsReturnValueUsed = !Call.getInstruction()->use_empty(); RetSExt = Call.hasRetAttr(Attribute::SExt); RetZExt = Call.hasRetAttr(Attribute::ZExt); Callee = Target; CallConv = Call.getCallingConv(); NumFixedArgs = FTy->getNumParams(); Args = std::move(ArgsList); CS = Call; return *this; } CallLoweringInfo &setInRegister(bool Value = true) { IsInReg = Value; return *this; } CallLoweringInfo &setNoReturn(bool Value = true) { DoesNotReturn = Value; return *this; } CallLoweringInfo &setVarArg(bool Value = true) { IsVarArg = Value; return *this; } CallLoweringInfo &setTailCall(bool Value = true) { IsTailCall = Value; return *this; } CallLoweringInfo &setDiscardResult(bool Value = true) { IsReturnValueUsed = !Value; return *this; } CallLoweringInfo &setConvergent(bool Value = true) { IsConvergent = Value; return *this; } CallLoweringInfo &setSExtResult(bool Value = true) { RetSExt = Value; return *this; } CallLoweringInfo &setZExtResult(bool Value = true) { RetZExt = Value; return *this; } CallLoweringInfo &setIsPatchPoint(bool Value = true) { IsPatchPoint = Value; return *this; } CallLoweringInfo &setIsPostTypeLegalization(bool Value=true) { IsPostTypeLegalization = Value; return *this; } ArgListTy &getArgs() { return Args; } }; /// This function lowers an abstract call to a function into an actual call. /// This returns a pair of operands. The first element is the return value /// for the function (if RetTy is not VoidTy). The second element is the /// outgoing token chain. It calls LowerCall to do the actual lowering. std::pair LowerCallTo(CallLoweringInfo &CLI) const; /// This hook must be implemented to lower calls into the specified /// DAG. The outgoing arguments to the call are described by the Outs array, /// and the values to be returned by the call are described by the Ins /// array. The implementation should fill in the InVals array with legal-type /// return values from the call, and return the resulting token chain value. virtual SDValue LowerCall(CallLoweringInfo &/*CLI*/, SmallVectorImpl &/*InVals*/) const { llvm_unreachable("Not Implemented"); } /// Target-specific cleanup for formal ByVal parameters. virtual void HandleByVal(CCState *, unsigned &, unsigned) const {} /// This hook should be implemented to check whether the return values /// described by the Outs array can fit into the return registers. If false /// is returned, an sret-demotion is performed. virtual bool CanLowerReturn(CallingConv::ID /*CallConv*/, MachineFunction &/*MF*/, bool /*isVarArg*/, const SmallVectorImpl &/*Outs*/, LLVMContext &/*Context*/) const { // Return true by default to get preexisting behavior. return true; } /// This hook must be implemented to lower outgoing return values, described /// by the Outs array, into the specified DAG. The implementation should /// return the resulting token chain value. virtual SDValue LowerReturn(SDValue /*Chain*/, CallingConv::ID /*CallConv*/, bool /*isVarArg*/, const SmallVectorImpl & /*Outs*/, const SmallVectorImpl & /*OutVals*/, const SDLoc & /*dl*/, SelectionDAG & /*DAG*/) const { llvm_unreachable("Not Implemented"); } /// Return true if result of the specified node is used by a return node /// only. It also compute and return the input chain for the tail call. /// /// This is used to determine whether it is possible to codegen a libcall as /// tail call at legalization time. virtual bool isUsedByReturnOnly(SDNode *, SDValue &/*Chain*/) const { return false; } /// Return true if the target may be able emit the call instruction as a tail /// call. This is used by optimization passes to determine if it's profitable /// to duplicate return instructions to enable tailcall optimization. virtual bool mayBeEmittedAsTailCall(const CallInst *) const { return false; } /// Return the builtin name for the __builtin___clear_cache intrinsic /// Default is to invoke the clear cache library call virtual const char * getClearCacheBuiltinName() const { return "__clear_cache"; } /// Return the register ID of the name passed in. Used by named register /// global variables extension. There is no target-independent behaviour /// so the default action is to bail. virtual unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { report_fatal_error("Named registers not implemented for this target"); } /// Return the type that should be used to zero or sign extend a /// zeroext/signext integer return value. FIXME: Some C calling conventions /// require the return type to be promoted, but this is not true all the time, /// e.g. i1/i8/i16 on x86/x86_64. It is also not necessary for non-C calling /// conventions. The frontend should handle this and include all of the /// necessary information. virtual EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType /*ExtendKind*/) const { EVT MinVT = getRegisterType(Context, MVT::i32); return VT.bitsLT(MinVT) ? MinVT : VT; } /// For some targets, an LLVM struct type must be broken down into multiple /// simple types, but the calling convention specifies that the entire struct /// must be passed in a block of consecutive registers. virtual bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return false; } /// Returns a 0 terminated array of registers that can be safely used as /// scratch registers. virtual const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const { return nullptr; } /// This callback is used to prepare for a volatile or atomic load. /// It takes a chain node as input and returns the chain for the load itself. /// /// Having a callback like this is necessary for targets like SystemZ, /// which allows a CPU to reuse the result of a previous load indefinitely, /// even if a cache-coherent store is performed by another CPU. The default /// implementation does nothing. virtual SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL, SelectionDAG &DAG) const { return Chain; } /// This callback is used to inspect load/store instructions and add /// target-specific MachineMemOperand flags to them. The default /// implementation does nothing. virtual MachineMemOperand::Flags getMMOFlags(const Instruction &I) const { return MachineMemOperand::MONone; } /// This callback is invoked by the type legalizer to legalize nodes with an /// illegal operand type but legal result types. It replaces the /// LowerOperation callback in the type Legalizer. The reason we can not do /// away with LowerOperation entirely is that LegalizeDAG isn't yet ready to /// use this callback. /// /// TODO: Consider merging with ReplaceNodeResults. /// /// The target places new result values for the node in Results (their number /// and types must exactly match those of the original return values of /// the node), or leaves Results empty, which indicates that the node is not /// to be custom lowered after all. /// The default implementation calls LowerOperation. virtual void LowerOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const; /// This callback is invoked for operations that are unsupported by the /// target, which are registered to use 'custom' lowering, and whose defined /// values are all legal. If the target has no operations that require custom /// lowering, it need not implement this. The default implementation of this /// aborts. virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; /// This callback is invoked when a node result type is illegal for the /// target, and the operation was registered to use 'custom' lowering for that /// result type. The target places new result values for the node in Results /// (their number and types must exactly match those of the original return /// values of the node), or leaves Results empty, which indicates that the /// node is not to be custom lowered after all. /// /// If the target has no operations that require custom lowering, it need not /// implement this. The default implementation aborts. virtual void ReplaceNodeResults(SDNode * /*N*/, SmallVectorImpl &/*Results*/, SelectionDAG &/*DAG*/) const { llvm_unreachable("ReplaceNodeResults not implemented for this target!"); } /// This method returns the name of a target specific DAG node. virtual const char *getTargetNodeName(unsigned Opcode) const; /// This method returns a target specific FastISel object, or null if the /// target does not support "fast" ISel. virtual FastISel *createFastISel(FunctionLoweringInfo &, const TargetLibraryInfo *) const { return nullptr; } bool verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const; //===--------------------------------------------------------------------===// // Inline Asm Support hooks // /// This hook allows the target to expand an inline asm call to be explicit /// llvm code if it wants to. This is useful for turning simple inline asms /// into LLVM intrinsics, which gives the compiler more information about the /// behavior of the code. virtual bool ExpandInlineAsm(CallInst *) const { return false; } enum ConstraintType { C_Register, // Constraint represents specific register(s). C_RegisterClass, // Constraint represents any of register(s) in class. C_Memory, // Memory constraint. C_Other, // Something else. C_Unknown // Unsupported constraint. }; enum ConstraintWeight { // Generic weights. CW_Invalid = -1, // No match. CW_Okay = 0, // Acceptable. CW_Good = 1, // Good weight. CW_Better = 2, // Better weight. CW_Best = 3, // Best weight. // Well-known weights. CW_SpecificReg = CW_Okay, // Specific register operands. CW_Register = CW_Good, // Register operands. CW_Memory = CW_Better, // Memory operands. CW_Constant = CW_Best, // Constant operand. CW_Default = CW_Okay // Default or don't know type. }; /// This contains information for each constraint that we are lowering. struct AsmOperandInfo : public InlineAsm::ConstraintInfo { /// This contains the actual string for the code, like "m". TargetLowering /// picks the 'best' code from ConstraintInfo::Codes that most closely /// matches the operand. std::string ConstraintCode; /// Information about the constraint code, e.g. Register, RegisterClass, /// Memory, Other, Unknown. TargetLowering::ConstraintType ConstraintType = TargetLowering::C_Unknown; /// If this is the result output operand or a clobber, this is null, /// otherwise it is the incoming operand to the CallInst. This gets /// modified as the asm is processed. Value *CallOperandVal = nullptr; /// The ValueType for the operand value. MVT ConstraintVT = MVT::Other; /// Copy constructor for copying from a ConstraintInfo. AsmOperandInfo(InlineAsm::ConstraintInfo Info) : InlineAsm::ConstraintInfo(std::move(Info)) {} /// Return true of this is an input operand that is a matching constraint /// like "4". bool isMatchingInputConstraint() const; /// If this is an input matching constraint, this method returns the output /// operand it matches. unsigned getMatchedOperand() const; }; using AsmOperandInfoVector = std::vector; /// Split up the constraint string from the inline assembly value into the /// specific constraints and their prefixes, and also tie in the associated /// operand values. If this returns an empty vector, and if the constraint /// string itself isn't empty, there was an error parsing. virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, ImmutableCallSite CS) const; /// Examine constraint type and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. virtual ConstraintWeight getMultipleConstraintMatchWeight( AsmOperandInfo &info, int maIndex) const; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. virtual ConstraintWeight getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const; /// Determines the constraint code and constraint type to use for the specific /// AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType. /// If the actual operand being passed in is available, it can be passed in as /// Op, otherwise an empty SDValue can be passed. virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG = nullptr) const; /// Given a constraint, return the type of constraint it is for this target. virtual ConstraintType getConstraintType(StringRef Constraint) const; /// Given a physical register constraint (e.g. {edx}), return the register /// number and the register class for the register. /// /// Given a register class constraint, like 'r', if this corresponds directly /// to an LLVM register class, return a register of 0 and the register class /// pointer. /// /// This should only be used for C_Register constraints. On error, this /// returns a register number of 0 and a null register class pointer. virtual std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const; virtual unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const { if (ConstraintCode == "i") return InlineAsm::Constraint_i; else if (ConstraintCode == "m") return InlineAsm::Constraint_m; return InlineAsm::Constraint_Unknown; } /// Try to replace an X constraint, which matches anything, with another that /// has more specific requirements based on the type of the corresponding /// operand. This returns null if there is no replacement to make. virtual const char *LowerXConstraint(EVT ConstraintVT) const; /// Lower the specified operand into the Ops vector. If it is invalid, don't /// add anything to Ops. virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const; //===--------------------------------------------------------------------===// // Div utility functions // SDValue BuildSDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, bool IsAfterLegalization, SmallVectorImpl &Created) const; SDValue BuildUDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, bool IsAfterLegalization, SmallVectorImpl &Created) const; /// Targets may override this function to provide custom SDIV lowering for /// power-of-2 denominators. If the target returns an empty SDValue, LLVM /// assumes SDIV is expensive and replaces it with a series of other integer /// operations. virtual SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const; /// Indicate whether this target prefers to combine FDIVs with the same /// divisor. If the transform should never be done, return zero. If the /// transform should be done, return the minimum number of divisor uses /// that must exist. virtual unsigned combineRepeatedFPDivisors() const { return 0; } /// Hooks for building estimates in place of slower divisions and square /// roots. /// Return either a square root or its reciprocal estimate value for the input /// operand. /// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or /// 'Enabled' as set by a potential default override attribute. /// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson /// refinement iterations required to generate a sufficient (though not /// necessarily IEEE-754 compliant) estimate is returned in that parameter. /// The boolean UseOneConstNR output is used to select a Newton-Raphson /// algorithm implementation that uses either one or two constants. /// The boolean Reciprocal is used to select whether the estimate is for the /// square root of the input operand or the reciprocal of its square root. /// A target may choose to implement its own refinement within this function. /// If that's true, then return '0' as the number of RefinementSteps to avoid /// any further refinement of the estimate. /// An empty SDValue return means no estimate sequence can be created. virtual SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const { return SDValue(); } /// Return a reciprocal estimate value for the input operand. /// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or /// 'Enabled' as set by a potential default override attribute. /// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson /// refinement iterations required to generate a sufficient (though not /// necessarily IEEE-754 compliant) estimate is returned in that parameter. /// A target may choose to implement its own refinement within this function. /// If that's true, then return '0' as the number of RefinementSteps to avoid /// any further refinement of the estimate. /// An empty SDValue return means no estimate sequence can be created. virtual SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const { return SDValue(); } //===--------------------------------------------------------------------===// // Legalization utility functions // /// Expand a MUL or [US]MUL_LOHI of n-bit values into two or four nodes, /// respectively, each computing an n/2-bit part of the result. /// \param Result A vector that will be filled with the parts of the result /// in little-endian order. /// \param LL Low bits of the LHS of the MUL. You can use this parameter /// if you want to control how low bits are extracted from the LHS. /// \param LH High bits of the LHS of the MUL. See LL for meaning. /// \param RL Low bits of the RHS of the MUL. See LL for meaning /// \param RH High bits of the RHS of the MUL. See LL for meaning. /// \returns true if the node has been expanded, false if it has not bool expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, SDValue LHS, SDValue RHS, SmallVectorImpl &Result, EVT HiLoVT, SelectionDAG &DAG, MulExpansionKind Kind, SDValue LL = SDValue(), SDValue LH = SDValue(), SDValue RL = SDValue(), SDValue RH = SDValue()) const; /// Expand a MUL into two nodes. One that computes the high bits of /// the result and one that computes the low bits. /// \param HiLoVT The value type to use for the Lo and Hi nodes. /// \param LL Low bits of the LHS of the MUL. You can use this parameter /// if you want to control how low bits are extracted from the LHS. /// \param LH High bits of the LHS of the MUL. See LL for meaning. /// \param RL Low bits of the RHS of the MUL. See LL for meaning /// \param RH High bits of the RHS of the MUL. See LL for meaning. /// \returns true if the node has been expanded. false if it has not bool expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT, SelectionDAG &DAG, MulExpansionKind Kind, SDValue LL = SDValue(), SDValue LH = SDValue(), SDValue RL = SDValue(), SDValue RH = SDValue()) const; /// Expand float(f32) to SINT(i64) conversion /// \param N Node to expand /// \param Result output after conversion /// \returns True, if the expansion was successful, false otherwise bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const; /// Turn load of vector type into a load of the individual elements. /// \param LD load to expand /// \returns MERGE_VALUEs of the scalar loads with their chains. SDValue scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const; // Turn a store of a vector type into stores of the individual elements. /// \param ST Store with a vector value type /// \returns MERGE_VALUs of the individual store chains. SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const; /// Expands an unaligned load to 2 half-size loads for an integer, and /// possibly more for vectors. std::pair expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const; /// Expands an unaligned store to 2 half-size stores for integer values, and /// possibly more for vectors. SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const; /// Increments memory address \p Addr according to the type of the value /// \p DataVT that should be stored. If the data is stored in compressed /// form, the memory address should be incremented according to the number of /// the stored elements. This number is equal to the number of '1's bits /// in the \p Mask. /// \p DataVT is a vector type. \p Mask is a vector value. /// \p DataVT and \p Mask have the same number of vector elements. SDValue IncrementMemoryAddress(SDValue Addr, SDValue Mask, const SDLoc &DL, EVT DataVT, SelectionDAG &DAG, bool IsCompressedMemory) const; /// Get a pointer to vector element \p Idx located in memory for a vector of /// type \p VecVT starting at a base address of \p VecPtr. If \p Idx is out of /// bounds the returned pointer is unspecified, but will be within the vector /// bounds. SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT, SDValue Index) const; //===--------------------------------------------------------------------===// // Instruction Emitting Hooks // /// This method should be implemented by targets that mark instructions with /// the 'usesCustomInserter' flag. These instructions are special in various /// ways, which require special support to insert. The specified MachineInstr /// is created but not inserted into any basic blocks, and this method is /// called to expand it into a sequence of instructions, potentially also /// creating new basic blocks and control flow. /// As long as the returned basic block is different (i.e., we created a new /// one), the custom inserter is free to modify the rest of \p MBB. virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const; /// This method should be implemented by targets that mark instructions with /// the 'hasPostISelHook' flag. These instructions must be adjusted after /// instruction selection by target hooks. e.g. To fill in optional defs for /// ARM 's' setting instructions. virtual void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const; /// If this function returns true, SelectionDAGBuilder emits a /// LOAD_STACK_GUARD node when it is lowering Intrinsic::stackprotector. virtual bool useLoadStackGuardNode() const { return false; } virtual SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const { llvm_unreachable("not implemented for this target"); } /// Lower TLS global address SDNode for target independent emulated TLS model. virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const; /// Expands target specific indirect branch for the case of JumpTable /// expanasion. virtual SDValue expandIndirectJTBranch(const SDLoc& dl, SDValue Value, SDValue Addr, SelectionDAG &DAG) const { return DAG.getNode(ISD::BRIND, dl, MVT::Other, Value, Addr); } // seteq(x, 0) -> truncate(srl(ctlz(zext(x)), log2(#bits))) // If we're comparing for equality to zero and isCtlzFast is true, expose the // fact that this can be implemented as a ctlz/srl pair, so that the dag // combiner can fold the new nodes. SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const; private: SDValue simplifySetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, DAGCombinerInfo &DCI, const SDLoc &DL) const; SDValue optimizeSetCCOfSignedTruncationCheck(EVT SCCVT, SDValue N0, SDValue N1, ISD::CondCode Cond, DAGCombinerInfo &DCI, const SDLoc &DL) const; }; /// Given an LLVM IR type and return type attributes, compute the return value /// EVTs and flags, and optionally also the offsets, if the return value is /// being lowered to memory. void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr, SmallVectorImpl &Outs, const TargetLowering &TLI, const DataLayout &DL); } // end namespace llvm #endif // LLVM_CODEGEN_TARGETLOWERING_H Index: head/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- head/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (revision 344055) +++ head/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (revision 344056) @@ -1,18691 +1,18692 @@ //===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run // both before and after the DAG is legalized. // // This pass is not a substitute for the LLVM IR instcombine pass. This pass is // primarily intended to handle simplification opportunities that are implicit // in the LLVM IR and exposed by the various codegen lowering phases. // //===----------------------------------------------------------------------===// #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "dagcombine" STATISTIC(NodesCombined , "Number of dag nodes combined"); STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created"); STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created"); STATISTIC(OpsNarrowed , "Number of load/op/store narrowed"); STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int"); STATISTIC(SlicedLoads, "Number of load sliced"); static cl::opt CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden, cl::desc("Enable DAG combiner's use of IR alias analysis")); static cl::opt UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true), cl::desc("Enable DAG combiner's use of TBAA")); #ifndef NDEBUG static cl::opt CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden, cl::desc("Only use DAG-combiner alias analysis in this" " function")); #endif /// Hidden option to stress test load slicing, i.e., when this option /// is enabled, load slicing bypasses most of its profitability guards. static cl::opt StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden, cl::desc("Bypass the profitability model of load slicing"), cl::init(false)); static cl::opt MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true), cl::desc("DAG combiner may split indexing from loads")); namespace { class DAGCombiner { SelectionDAG &DAG; const TargetLowering &TLI; CombineLevel Level; CodeGenOpt::Level OptLevel; bool LegalOperations = false; bool LegalTypes = false; bool ForCodeSize; /// Worklist of all of the nodes that need to be simplified. /// /// This must behave as a stack -- new nodes to process are pushed onto the /// back and when processing we pop off of the back. /// /// The worklist will not contain duplicates but may contain null entries /// due to nodes being deleted from the underlying DAG. SmallVector Worklist; /// Mapping from an SDNode to its position on the worklist. /// /// This is used to find and remove nodes from the worklist (by nulling /// them) when they are deleted from the underlying DAG. It relies on /// stable indices of nodes within the worklist. DenseMap WorklistMap; /// Set of nodes which have been combined (at least once). /// /// This is used to allow us to reliably add any operands of a DAG node /// which have not yet been combined to the worklist. SmallPtrSet CombinedNodes; // AA - Used for DAG load/store alias analysis. AliasAnalysis *AA; /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. void AddUsersToWorklist(SDNode *N) { for (SDNode *Node : N->uses()) AddToWorklist(Node); } /// Call the node-specific routine that folds each particular type of node. SDValue visit(SDNode *N); public: DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), OptLevel(OL), AA(AA) { ForCodeSize = DAG.getMachineFunction().getFunction().optForSize(); MaximumLegalStoreInBits = 0; for (MVT VT : MVT::all_valuetypes()) if (EVT(VT).isSimple() && VT != MVT::Other && TLI.isTypeLegal(EVT(VT)) && VT.getSizeInBits() >= MaximumLegalStoreInBits) MaximumLegalStoreInBits = VT.getSizeInBits(); } /// Add to the worklist making sure its instance is at the back (next to be /// processed.) void AddToWorklist(SDNode *N) { assert(N->getOpcode() != ISD::DELETED_NODE && "Deleted Node added to Worklist"); // Skip handle nodes as they can't usefully be combined and confuse the // zero-use deletion strategy. if (N->getOpcode() == ISD::HANDLENODE) return; if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second) Worklist.push_back(N); } /// Remove all instances of N from the worklist. void removeFromWorklist(SDNode *N) { CombinedNodes.erase(N); auto It = WorklistMap.find(N); if (It == WorklistMap.end()) return; // Not in the worklist. // Null out the entry rather than erasing it to avoid a linear operation. Worklist[It->second] = nullptr; WorklistMap.erase(It); } void deleteAndRecombine(SDNode *N); bool recursivelyDeleteUnusedNodes(SDNode *N); /// Replaces all uses of the results of one DAG node with new values. SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo = true); /// Replaces all uses of the results of one DAG node with new values. SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) { return CombineTo(N, &Res, 1, AddTo); } /// Replaces all uses of the results of one DAG node with new values. SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true) { SDValue To[] = { Res0, Res1 }; return CombineTo(N, To, 2, AddTo); } void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO); private: unsigned MaximumLegalStoreInBits; /// Check the specified integer node value to see if it can be simplified or /// if things it uses can be simplified by bit propagation. /// If so, return true. bool SimplifyDemandedBits(SDValue Op) { unsigned BitWidth = Op.getScalarValueSizeInBits(); APInt Demanded = APInt::getAllOnesValue(BitWidth); return SimplifyDemandedBits(Op, Demanded); } /// Check the specified vector node value to see if it can be simplified or /// if things it uses can be simplified as it only uses some of the /// elements. If so, return true. bool SimplifyDemandedVectorElts(SDValue Op) { unsigned NumElts = Op.getValueType().getVectorNumElements(); APInt Demanded = APInt::getAllOnesValue(NumElts); return SimplifyDemandedVectorElts(Op, Demanded); } bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded); bool SimplifyDemandedVectorElts(SDValue Op, const APInt &Demanded, bool AssumeSingleUse = false); bool CombineToPreIndexedLoadStore(SDNode *N); bool CombineToPostIndexedLoadStore(SDNode *N); SDValue SplitIndexingFromLoad(LoadSDNode *LD); bool SliceUpLoad(SDNode *N); /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed /// load. /// /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced. /// \param InVecVT type of the input vector to EVE with bitcasts resolved. /// \param EltNo index of the vector element to load. /// \param OriginalLoad load that EVE came from to be replaced. /// \returns EVE on success SDValue() on failure. SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad( SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad); void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad); SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace); SDValue SExtPromoteOperand(SDValue Op, EVT PVT); SDValue ZExtPromoteOperand(SDValue Op, EVT PVT); SDValue PromoteIntBinOp(SDValue Op); SDValue PromoteIntShiftOp(SDValue Op); SDValue PromoteExtend(SDValue Op); bool PromoteLoad(SDValue Op); /// Call the node-specific routine that knows how to fold each /// particular type of node. If that doesn't do anything, try the /// target-specific DAG combines. SDValue combine(SDNode *N); // Visitation implementation - Implement dag node combining for different // node types. The semantics are as follows: // Return Value: // SDValue.getNode() == 0 - No change was made // SDValue.getNode() == N - N was replaced, is dead and has been handled. // otherwise - N should be replaced by the returned Operand. // SDValue visitTokenFactor(SDNode *N); SDValue visitMERGE_VALUES(SDNode *N); SDValue visitADD(SDNode *N); SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference); SDValue visitSUB(SDNode *N); SDValue visitADDC(SDNode *N); SDValue visitUADDO(SDNode *N); SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N); SDValue visitSUBC(SDNode *N); SDValue visitUSUBO(SDNode *N); SDValue visitADDE(SDNode *N); SDValue visitADDCARRY(SDNode *N); SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N); SDValue visitSUBE(SDNode *N); SDValue visitSUBCARRY(SDNode *N); SDValue visitMUL(SDNode *N); SDValue useDivRem(SDNode *N); SDValue visitSDIV(SDNode *N); SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N); SDValue visitUDIV(SDNode *N); SDValue visitUDIVLike(SDValue N0, SDValue N1, SDNode *N); SDValue visitREM(SDNode *N); SDValue visitMULHU(SDNode *N); SDValue visitMULHS(SDNode *N); SDValue visitSMUL_LOHI(SDNode *N); SDValue visitUMUL_LOHI(SDNode *N); SDValue visitSMULO(SDNode *N); SDValue visitUMULO(SDNode *N); SDValue visitIMINMAX(SDNode *N); SDValue visitAND(SDNode *N); SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N); SDValue visitOR(SDNode *N); SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N); SDValue visitXOR(SDNode *N); SDValue SimplifyVBinOp(SDNode *N); SDValue visitSHL(SDNode *N); SDValue visitSRA(SDNode *N); SDValue visitSRL(SDNode *N); SDValue visitRotate(SDNode *N); SDValue visitABS(SDNode *N); SDValue visitBSWAP(SDNode *N); SDValue visitBITREVERSE(SDNode *N); SDValue visitCTLZ(SDNode *N); SDValue visitCTLZ_ZERO_UNDEF(SDNode *N); SDValue visitCTTZ(SDNode *N); SDValue visitCTTZ_ZERO_UNDEF(SDNode *N); SDValue visitCTPOP(SDNode *N); SDValue visitSELECT(SDNode *N); SDValue visitVSELECT(SDNode *N); SDValue visitSELECT_CC(SDNode *N); SDValue visitSETCC(SDNode *N); SDValue visitSETCCCARRY(SDNode *N); SDValue visitSIGN_EXTEND(SDNode *N); SDValue visitZERO_EXTEND(SDNode *N); SDValue visitANY_EXTEND(SDNode *N); SDValue visitAssertExt(SDNode *N); SDValue visitSIGN_EXTEND_INREG(SDNode *N); SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N); SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N); SDValue visitTRUNCATE(SDNode *N); SDValue visitBITCAST(SDNode *N); SDValue visitBUILD_PAIR(SDNode *N); SDValue visitFADD(SDNode *N); SDValue visitFSUB(SDNode *N); SDValue visitFMUL(SDNode *N); SDValue visitFMA(SDNode *N); SDValue visitFDIV(SDNode *N); SDValue visitFREM(SDNode *N); SDValue visitFSQRT(SDNode *N); SDValue visitFCOPYSIGN(SDNode *N); SDValue visitSINT_TO_FP(SDNode *N); SDValue visitUINT_TO_FP(SDNode *N); SDValue visitFP_TO_SINT(SDNode *N); SDValue visitFP_TO_UINT(SDNode *N); SDValue visitFP_ROUND(SDNode *N); SDValue visitFP_ROUND_INREG(SDNode *N); SDValue visitFP_EXTEND(SDNode *N); SDValue visitFNEG(SDNode *N); SDValue visitFABS(SDNode *N); SDValue visitFCEIL(SDNode *N); SDValue visitFTRUNC(SDNode *N); SDValue visitFFLOOR(SDNode *N); SDValue visitFMINNUM(SDNode *N); SDValue visitFMAXNUM(SDNode *N); SDValue visitBRCOND(SDNode *N); SDValue visitBR_CC(SDNode *N); SDValue visitLOAD(SDNode *N); SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain); SDValue replaceStoreOfFPConstant(StoreSDNode *ST); SDValue visitSTORE(SDNode *N); SDValue visitINSERT_VECTOR_ELT(SDNode *N); SDValue visitEXTRACT_VECTOR_ELT(SDNode *N); SDValue visitBUILD_VECTOR(SDNode *N); SDValue visitCONCAT_VECTORS(SDNode *N); SDValue visitEXTRACT_SUBVECTOR(SDNode *N); SDValue visitVECTOR_SHUFFLE(SDNode *N); SDValue visitSCALAR_TO_VECTOR(SDNode *N); SDValue visitINSERT_SUBVECTOR(SDNode *N); SDValue visitMLOAD(SDNode *N); SDValue visitMSTORE(SDNode *N); SDValue visitMGATHER(SDNode *N); SDValue visitMSCATTER(SDNode *N); SDValue visitFP_TO_FP16(SDNode *N); SDValue visitFP16_TO_FP(SDNode *N); SDValue visitFADDForFMACombine(SDNode *N); SDValue visitFSUBForFMACombine(SDNode *N); SDValue visitFMULForFMADistributiveCombine(SDNode *N); SDValue XformToShuffleWithZero(SDNode *N); SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1); SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt); SDValue foldSelectOfConstants(SDNode *N); SDValue foldVSelectOfConstants(SDNode *N); SDValue foldBinOpIntoSelect(SDNode *BO); bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS); SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N); SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2); SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC, bool NotExtCompare = false); SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC); SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, const SDLoc &DL); SDValue unfoldMaskedMerge(SDNode *N); SDValue unfoldExtremeBitClearingToShifts(SDNode *N); SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, bool foldBooleans); SDValue rebuildSetCC(SDValue N); bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, SDValue &CC) const; bool isOneUseSetCC(SDValue N) const; SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp); SDValue CombineConsecutiveLoads(SDNode *N, EVT VT); SDValue CombineExtLoad(SDNode *N); SDValue CombineZExtLogicopShiftLoad(SDNode *N); SDValue combineRepeatedFPDivisors(SDNode *N); SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex); SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT); SDValue BuildSDIV(SDNode *N); SDValue BuildSDIVPow2(SDNode *N); SDValue BuildUDIV(SDNode *N); SDValue BuildLogBase2(SDValue V, const SDLoc &DL); SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags); SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags); SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags); SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip); SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations, SDNodeFlags Flags, bool Reciprocal); SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations, SDNodeFlags Flags, bool Reciprocal); SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits = true); SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1); SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, SDValue InnerPos, SDValue InnerNeg, unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL); SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL); SDValue MatchLoadCombine(SDNode *N); SDValue ReduceLoadWidth(SDNode *N); SDValue ReduceLoadOpStoreWidth(SDNode *N); SDValue splitMergedValStore(StoreSDNode *ST); SDValue TransformFPLoadStorePair(SDNode *N); SDValue convertBuildVecZextToZext(SDNode *N); SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N); SDValue reduceBuildVecToShuffle(SDNode *N); SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N, ArrayRef VectorMask, SDValue VecIn1, SDValue VecIn2, unsigned LeftIdx); SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast); /// Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. void GatherAllAliases(SDNode *N, SDValue OriginalChain, SmallVectorImpl &Aliases); /// Return true if there is any possibility that the two addresses overlap. bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const; /// Walk up chain skipping non-aliasing memory nodes, looking for a better /// chain (aliasing node.) SDValue FindBetterChain(SDNode *N, SDValue Chain); /// Try to replace a store and any possibly adjacent stores on /// consecutive chains with better chains. Return true only if St is /// replaced. /// /// Notice that other chains may still be replaced even if the function /// returns false. bool findBetterNeighborChains(StoreSDNode *St); /// Holds a pointer to an LSBaseSDNode as well as information on where it /// is located in a sequence of memory operations connected by a chain. struct MemOpLink { // Ptr to the mem node. LSBaseSDNode *MemNode; // Offset from the base ptr. int64_t OffsetFromBase; MemOpLink(LSBaseSDNode *N, int64_t Offset) : MemNode(N), OffsetFromBase(Offset) {} }; /// This is a helper function for visitMUL to check the profitability /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). /// MulNode is the original multiply, AddNode is (add x, c1), /// and ConstNode is c2. bool isMulAddWithConstProfitable(SDNode *MulNode, SDValue &AddNode, SDValue &ConstNode); /// This is a helper function for visitAND and visitZERO_EXTEND. Returns /// true if the (and (load x) c) pattern matches an extload. ExtVT returns /// the type of the loaded value to be extended. bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, EVT LoadResultTy, EVT &ExtVT); /// Helper function to calculate whether the given Load/Store can have its /// width reduced to ExtVT. bool isLegalNarrowLdSt(LSBaseSDNode *LDSTN, ISD::LoadExtType ExtType, EVT &MemVT, unsigned ShAmt = 0); /// Used by BackwardsPropagateMask to find suitable loads. bool SearchForAndLoads(SDNode *N, SmallPtrSetImpl &Loads, SmallPtrSetImpl &NodesWithConsts, ConstantSDNode *Mask, SDNode *&NodeToMask); /// Attempt to propagate a given AND node back to load leaves so that they /// can be combined into narrow loads. bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG); /// Helper function for MergeConsecutiveStores which merges the /// component store chains. SDValue getMergeStoreChains(SmallVectorImpl &StoreNodes, unsigned NumStores); /// This is a helper function for MergeConsecutiveStores. When the /// source elements of the consecutive stores are all constants or /// all extracted vector elements, try to merge them into one /// larger store introducing bitcasts if necessary. \return True /// if a merged store was created. bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl &StoreNodes, EVT MemVT, unsigned NumStores, bool IsConstantSrc, bool UseVector, bool UseTrunc); /// This is a helper function for MergeConsecutiveStores. Stores /// that potentially may be merged with St are placed in /// StoreNodes. RootNode is a chain predecessor to all store /// candidates. void getStoreMergeCandidates(StoreSDNode *St, SmallVectorImpl &StoreNodes, SDNode *&Root); /// Helper function for MergeConsecutiveStores. Checks if /// candidate stores have indirect dependency through their /// operands. RootNode is the predecessor to all stores calculated /// by getStoreMergeCandidates and is used to prune the dependency check. /// \return True if safe to merge. bool checkMergeStoreCandidatesForDependencies( SmallVectorImpl &StoreNodes, unsigned NumStores, SDNode *RootNode); /// Merge consecutive store operations into a wide store. /// This optimization uses wide integers or vectors when possible. /// \return number of stores that were merged into a merged store (the /// affected nodes are stored as a prefix in \p StoreNodes). bool MergeConsecutiveStores(StoreSDNode *St); /// Try to transform a truncation where C is a constant: /// (trunc (and X, C)) -> (and (trunc X), (trunc C)) /// /// \p N needs to be a truncation and its first operand an AND. Other /// requirements are checked by the function (e.g. that trunc is /// single-use) and if missed an empty SDValue is returned. SDValue distributeTruncateThroughAnd(SDNode *N); /// Helper function to determine whether the target supports operation /// given by \p Opcode for type \p VT, that is, whether the operation /// is legal or custom before legalizing operations, and whether is /// legal (but not custom) after legalization. bool hasOperation(unsigned Opcode, EVT VT) { if (LegalOperations) return TLI.isOperationLegal(Opcode, VT); return TLI.isOperationLegalOrCustom(Opcode, VT); } public: /// Runs the dag combiner on all nodes in the work list void Run(CombineLevel AtLevel); SelectionDAG &getDAG() const { return DAG; } /// Returns a type large enough to hold any valid shift amount - before type /// legalization these can be huge. EVT getShiftAmountTy(EVT LHSTy) { assert(LHSTy.isInteger() && "Shift amount is not an integer type!"); return TLI.getShiftAmountTy(LHSTy, DAG.getDataLayout(), LegalTypes); } /// This method returns true if we are running before type legalization or /// if the specified VT is legal. bool isTypeLegal(const EVT &VT) { if (!LegalTypes) return true; return TLI.isTypeLegal(VT); } /// Convenience wrapper around TargetLowering::getSetCCResultType EVT getSetCCResultType(EVT VT) const { return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); } void ExtendSetCCUses(const SmallVectorImpl &SetCCs, SDValue OrigLoad, SDValue ExtLoad, ISD::NodeType ExtType); }; /// This class is a DAGUpdateListener that removes any deleted /// nodes from the worklist. class WorklistRemover : public SelectionDAG::DAGUpdateListener { DAGCombiner &DC; public: explicit WorklistRemover(DAGCombiner &dc) : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {} void NodeDeleted(SDNode *N, SDNode *E) override { DC.removeFromWorklist(N); } }; } // end anonymous namespace //===----------------------------------------------------------------------===// // TargetLowering::DAGCombinerInfo implementation //===----------------------------------------------------------------------===// void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) { ((DAGCombiner*)DC)->AddToWorklist(N); } SDValue TargetLowering::DAGCombinerInfo:: CombineTo(SDNode *N, ArrayRef To, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo); } SDValue TargetLowering::DAGCombinerInfo:: CombineTo(SDNode *N, SDValue Res, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo); } SDValue TargetLowering::DAGCombinerInfo:: CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) { return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo); } void TargetLowering::DAGCombinerInfo:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO); } //===----------------------------------------------------------------------===// // Helper Functions //===----------------------------------------------------------------------===// void DAGCombiner::deleteAndRecombine(SDNode *N) { removeFromWorklist(N); // If the operands of this node are only used by the node, they will now be // dead. Make sure to re-visit them and recursively delete dead nodes. for (const SDValue &Op : N->ops()) // For an operand generating multiple values, one of the values may // become dead allowing further simplification (e.g. split index // arithmetic from an indexed load). if (Op->hasOneUse() || Op->getNumValues() > 1) AddToWorklist(Op.getNode()); DAG.DeleteNode(N); } /// Return 1 if we can compute the negated form of the specified expression for /// the same cost as the expression itself, or 2 if we can compute the negated /// form more cheaply than the expression itself. static char isNegatibleForFree(SDValue Op, bool LegalOperations, const TargetLowering &TLI, const TargetOptions *Options, unsigned Depth = 0) { // fneg is removable even if it has multiple uses. if (Op.getOpcode() == ISD::FNEG) return 2; // Don't allow anything with multiple uses unless we know it is free. EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); if (!Op.hasOneUse()) if (!(Op.getOpcode() == ISD::FP_EXTEND && TLI.isFPExtFree(VT, Op.getOperand(0).getValueType()))) return 0; // Don't recurse exponentially. if (Depth > 6) return 0; switch (Op.getOpcode()) { default: return false; case ISD::ConstantFP: { if (!LegalOperations) return 1; // Don't invert constant FP values after legalization unless the target says // the negated constant is legal. return TLI.isOperationLegal(ISD::ConstantFP, VT) || TLI.isFPImmLegal(neg(cast(Op)->getValueAPF()), VT); } case ISD::FADD: if (!Options->UnsafeFPMath && !Flags.hasNoSignedZeros()) return 0; // After operation legalization, it might not be legal to create new FSUBs. if (LegalOperations && !TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) return 0; // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, Depth + 1)) return V; // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, Depth + 1); case ISD::FSUB: // We can't turn -(A-B) into B-A when we honor signed zeros. if (!Options->NoSignedZerosFPMath && !Flags.hasNoSignedZeros()) return 0; // fold (fneg (fsub A, B)) -> (fsub B, A) return 1; case ISD::FMUL: case ISD::FDIV: // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y)) if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, Depth + 1)) return V; return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options, Depth + 1); case ISD::FP_EXTEND: case ISD::FP_ROUND: case ISD::FSIN: return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options, Depth + 1); } } /// If isNegatibleForFree returns true, return the newly negated expression. static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, unsigned Depth = 0) { const TargetOptions &Options = DAG.getTarget().Options; // fneg is removable even if it has multiple uses. if (Op.getOpcode() == ISD::FNEG) return Op.getOperand(0); assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree"); const SDNodeFlags Flags = Op.getNode()->getFlags(); switch (Op.getOpcode()) { default: llvm_unreachable("Unknown code"); case ISD::ConstantFP: { APFloat V = cast(Op)->getValueAPF(); V.changeSign(); return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType()); } case ISD::FADD: assert(Options.UnsafeFPMath || Flags.hasNoSignedZeros()); // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) if (isNegatibleForFree(Op.getOperand(0), LegalOperations, DAG.getTargetLoweringInfo(), &Options, Depth+1)) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), Op.getOperand(1), Flags); // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(1), DAG, LegalOperations, Depth+1), Op.getOperand(0), Flags); case ISD::FSUB: // fold (fneg (fsub 0, B)) -> B if (ConstantFPSDNode *N0CFP = dyn_cast(Op.getOperand(0))) if (N0CFP->isZero()) return Op.getOperand(1); // fold (fneg (fsub A, B)) -> (fsub B, A) return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(0), Flags); case ISD::FMUL: case ISD::FDIV: // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) if (isNegatibleForFree(Op.getOperand(0), LegalOperations, DAG.getTargetLoweringInfo(), &Options, Depth+1)) return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), Op.getOperand(1), Flags); // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y)) return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Op.getOperand(0), GetNegatedExpression(Op.getOperand(1), DAG, LegalOperations, Depth+1), Flags); case ISD::FP_EXTEND: case ISD::FSIN: return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1)); case ISD::FP_ROUND: return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(), GetNegatedExpression(Op.getOperand(0), DAG, LegalOperations, Depth+1), Op.getOperand(1)); } } // APInts must be the same size for most operations, this helper // function zero extends the shorter of the pair so that they match. // We provide an Offset so that we can create bitwidths that won't overflow. static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) { unsigned Bits = Offset + std::max(LHS.getBitWidth(), RHS.getBitWidth()); LHS = LHS.zextOrSelf(Bits); RHS = RHS.zextOrSelf(Bits); } // Return true if this node is a setcc, or is a select_cc // that selects between the target values used for true and false, making it // equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to // the appropriate nodes based on the type of node we are checking. This // simplifies life a bit for the callers. bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS, SDValue &CC) const { if (N.getOpcode() == ISD::SETCC) { LHS = N.getOperand(0); RHS = N.getOperand(1); CC = N.getOperand(2); return true; } if (N.getOpcode() != ISD::SELECT_CC || !TLI.isConstTrueVal(N.getOperand(2).getNode()) || !TLI.isConstFalseVal(N.getOperand(3).getNode())) return false; if (TLI.getBooleanContents(N.getValueType()) == TargetLowering::UndefinedBooleanContent) return false; LHS = N.getOperand(0); RHS = N.getOperand(1); CC = N.getOperand(4); return true; } /// Return true if this is a SetCC-equivalent operation with only one use. /// If this is true, it allows the users to invert the operation for free when /// it is profitable to do so. bool DAGCombiner::isOneUseSetCC(SDValue N) const { SDValue N0, N1, N2; if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse()) return true; return false; } static SDValue peekThroughBitcast(SDValue V) { while (V.getOpcode() == ISD::BITCAST) V = V.getOperand(0); return V; } // Returns the SDNode if it is a constant float BuildVector // or constant float. static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) { if (isa(N)) return N.getNode(); if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode())) return N.getNode(); return nullptr; } // Determines if it is a constant integer or a build vector of constant // integers (and undefs). // Do not permit build vector implicit truncation. static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) { if (ConstantSDNode *Const = dyn_cast(N)) return !(Const->isOpaque() && NoOpaques); if (N.getOpcode() != ISD::BUILD_VECTOR) return false; unsigned BitWidth = N.getScalarValueSizeInBits(); for (const SDValue &Op : N->op_values()) { if (Op.isUndef()) continue; ConstantSDNode *Const = dyn_cast(Op); if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth || (Const->isOpaque() && NoOpaques)) return false; } return true; } // Determines if it is a constant null integer or a splatted vector of a // constant null integer (with no undefs). // Build vector implicit truncation is not an issue for null values. static bool isNullConstantOrNullSplatConstant(SDValue N) { // TODO: may want to use peekThroughBitcast() here. if (ConstantSDNode *Splat = isConstOrConstSplat(N)) return Splat->isNullValue(); return false; } // Determines if it is a constant integer of one or a splatted vector of a // constant integer of one (with no undefs). // Do not permit build vector implicit truncation. static bool isOneConstantOrOneSplatConstant(SDValue N) { // TODO: may want to use peekThroughBitcast() here. unsigned BitWidth = N.getScalarValueSizeInBits(); if (ConstantSDNode *Splat = isConstOrConstSplat(N)) return Splat->isOne() && Splat->getAPIntValue().getBitWidth() == BitWidth; return false; } // Determines if it is a constant integer of all ones or a splatted vector of a // constant integer of all ones (with no undefs). // Do not permit build vector implicit truncation. static bool isAllOnesConstantOrAllOnesSplatConstant(SDValue N) { N = peekThroughBitcast(N); unsigned BitWidth = N.getScalarValueSizeInBits(); if (ConstantSDNode *Splat = isConstOrConstSplat(N)) return Splat->isAllOnesValue() && Splat->getAPIntValue().getBitWidth() == BitWidth; return false; } // Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with // undef's. static bool isAnyConstantBuildVector(const SDNode *N) { return ISD::isBuildVectorOfConstantSDNodes(N) || ISD::isBuildVectorOfConstantFPSDNodes(N); } SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0, SDValue N1) { EVT VT = N0.getValueType(); if (N0.getOpcode() == Opc) { if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) { if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2)) if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, L, R)) return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode); return SDValue(); } if (N0.hasOneUse()) { // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one // use SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1); if (!OpNode.getNode()) return SDValue(); AddToWorklist(OpNode.getNode()); return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1)); } } } if (N1.getOpcode() == Opc) { if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1.getOperand(1))) { if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0)) { // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2)) if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, R, L)) return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode); return SDValue(); } if (N1.hasOneUse()) { // reassoc. (op x, (op y, c1)) -> (op (op x, y), c1) iff x+c1 has one // use SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0, N1.getOperand(0)); if (!OpNode.getNode()) return SDValue(); AddToWorklist(OpNode.getNode()); return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1)); } } } return SDValue(); } SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo, bool AddTo) { assert(N->getNumValues() == NumTo && "Broken CombineTo call!"); ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: "; To[0].getNode()->dump(&DAG); dbgs() << " and " << NumTo - 1 << " other values\n"); for (unsigned i = 0, e = NumTo; i != e; ++i) assert((!To[i].getNode() || N->getValueType(i) == To[i].getValueType()) && "Cannot combine value to value of different type!"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesWith(N, To); if (AddTo) { // Push the new nodes and any users onto the worklist for (unsigned i = 0, e = NumTo; i != e; ++i) { if (To[i].getNode()) { AddToWorklist(To[i].getNode()); AddUsersToWorklist(To[i].getNode()); } } } // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. if (N->use_empty()) deleteAndRecombine(N); return SDValue(N, 0); } void DAGCombiner:: CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) { // Replace all uses. If any nodes become isomorphic to other nodes and // are deleted, make sure to remove them from our worklist. WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New); // Push the new node and any (possibly new) users onto the worklist. AddToWorklist(TLO.New.getNode()); AddUsersToWorklist(TLO.New.getNode()); // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. if (TLO.Old.getNode()->use_empty()) deleteAndRecombine(TLO.Old.getNode()); } /// Check the specified integer node value to see if it can be simplified or if /// things it uses can be simplified by bit propagation. If so, return true. bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) { TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); KnownBits Known; if (!TLI.SimplifyDemandedBits(Op, Demanded, Known, TLO)) return false; // Revisit the node. AddToWorklist(Op.getNode()); // Replace the old value with the new one. ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG); dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); dbgs() << '\n'); CommitTargetLoweringOpt(TLO); return true; } /// Check the specified vector node value to see if it can be simplified or /// if things it uses can be simplified as it only uses some of the elements. /// If so, return true. bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op, const APInt &Demanded, bool AssumeSingleUse) { TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations); APInt KnownUndef, KnownZero; if (!TLI.SimplifyDemandedVectorElts(Op, Demanded, KnownUndef, KnownZero, TLO, 0, AssumeSingleUse)) return false; // Revisit the node. AddToWorklist(Op.getNode()); // Replace the old value with the new one. ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG); dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG); dbgs() << '\n'); CommitTargetLoweringOpt(TLO); return true; } void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) { SDLoc DL(Load); EVT VT = Load->getValueType(0); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0)); LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: "; Trunc.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1)); deleteAndRecombine(Load); AddToWorklist(Trunc.getNode()); } SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) { Replace = false; SDLoc DL(Op); if (ISD::isUNINDEXEDLoad(Op.getNode())) { LoadSDNode *LD = cast(Op); EVT MemVT = LD->getMemoryVT(); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD : LD->getExtensionType(); Replace = true; return DAG.getExtLoad(ExtType, DL, PVT, LD->getChain(), LD->getBasePtr(), MemVT, LD->getMemOperand()); } unsigned Opc = Op.getOpcode(); switch (Opc) { default: break; case ISD::AssertSext: if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT)) return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1)); break; case ISD::AssertZext: if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT)) return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1)); break; case ISD::Constant: { unsigned ExtOpc = Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; return DAG.getNode(ExtOpc, DL, PVT, Op); } } if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT)) return SDValue(); return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op); } SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) { if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT)) return SDValue(); EVT OldVT = Op.getValueType(); SDLoc DL(Op); bool Replace = false; SDValue NewOp = PromoteOperand(Op, PVT, Replace); if (!NewOp.getNode()) return SDValue(); AddToWorklist(NewOp.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, NewOp.getValueType(), NewOp, DAG.getValueType(OldVT)); } SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) { EVT OldVT = Op.getValueType(); SDLoc DL(Op); bool Replace = false; SDValue NewOp = PromoteOperand(Op, PVT, Replace); if (!NewOp.getNode()) return SDValue(); AddToWorklist(NewOp.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode()); return DAG.getZeroExtendInReg(NewOp, DL, OldVT); } /// Promote the specified integer binary operation if the target indicates it is /// beneficial. e.g. On x86, it's usually better to promote i16 operations to /// i32 since i16 instructions are longer. SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) { if (!LegalOperations) return SDValue(); EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return SDValue(); // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return SDValue(); EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); bool Replace0 = false; SDValue N0 = Op.getOperand(0); SDValue NN0 = PromoteOperand(N0, PVT, Replace0); bool Replace1 = false; SDValue N1 = Op.getOperand(1); SDValue NN1 = PromoteOperand(N1, PVT, Replace1); SDLoc DL(Op); SDValue RV = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1)); // We are always replacing N0/N1's use in N and only need // additional replacements if there are additional uses. Replace0 &= !N0->hasOneUse(); Replace1 &= (N0 != N1) && !N1->hasOneUse(); // Combine Op here so it is preserved past replacements. CombineTo(Op.getNode(), RV); // If operands have a use ordering, make sure we deal with // predecessor first. if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) { std::swap(N0, N1); std::swap(NN0, NN1); } if (Replace0) { AddToWorklist(NN0.getNode()); ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode()); } if (Replace1) { AddToWorklist(NN1.getNode()); ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode()); } return Op; } return SDValue(); } /// Promote the specified integer shift operation if the target indicates it is /// beneficial. e.g. On x86, it's usually better to promote i16 operations to /// i32 since i16 instructions are longer. SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) { if (!LegalOperations) return SDValue(); EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return SDValue(); // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return SDValue(); EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); bool Replace = false; SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); if (Opc == ISD::SRA) N0 = SExtPromoteOperand(N0, PVT); else if (Opc == ISD::SRL) N0 = ZExtPromoteOperand(N0, PVT); else N0 = PromoteOperand(N0, PVT, Replace); if (!N0.getNode()) return SDValue(); SDLoc DL(Op); SDValue RV = DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1)); AddToWorklist(N0.getNode()); if (Replace) ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode()); // Deal with Op being deleted. if (Op && Op.getOpcode() != ISD::DELETED_NODE) return RV; } return SDValue(); } SDValue DAGCombiner::PromoteExtend(SDValue Op) { if (!LegalOperations) return SDValue(); EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return SDValue(); // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return SDValue(); EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); // fold (aext (aext x)) -> (aext x) // fold (aext (zext x)) -> (zext x) // fold (aext (sext x)) -> (sext x) LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG)); return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0)); } return SDValue(); } bool DAGCombiner::PromoteLoad(SDValue Op) { if (!LegalOperations) return false; if (!ISD::isUNINDEXEDLoad(Op.getNode())) return false; EVT VT = Op.getValueType(); if (VT.isVector() || !VT.isInteger()) return false; // If operation type is 'undesirable', e.g. i16 on x86, consider // promoting it. unsigned Opc = Op.getOpcode(); if (TLI.isTypeDesirableForOp(Opc, VT)) return false; EVT PVT = VT; // Consult target whether it is a good idea to promote this operation and // what's the right type to promote it to. if (TLI.IsDesirableToPromoteOp(Op, PVT)) { assert(PVT != VT && "Don't know what type to promote to!"); SDLoc DL(Op); SDNode *N = Op.getNode(); LoadSDNode *LD = cast(N); EVT MemVT = LD->getMemoryVT(); ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD : LD->getExtensionType(); SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT, LD->getChain(), LD->getBasePtr(), MemVT, LD->getMemOperand()); SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD); LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1)); deleteAndRecombine(N); AddToWorklist(Result.getNode()); return true; } return false; } /// Recursively delete a node which has no uses and any operands for /// which it is the only use. /// /// Note that this both deletes the nodes and removes them from the worklist. /// It also adds any nodes who have had a user deleted to the worklist as they /// may now have only one use and subject to other combines. bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) { if (!N->use_empty()) return false; SmallSetVector Nodes; Nodes.insert(N); do { N = Nodes.pop_back_val(); if (!N) continue; if (N->use_empty()) { for (const SDValue &ChildN : N->op_values()) Nodes.insert(ChildN.getNode()); removeFromWorklist(N); DAG.DeleteNode(N); } else { AddToWorklist(N); } } while (!Nodes.empty()); return true; } //===----------------------------------------------------------------------===// // Main DAG Combiner implementation //===----------------------------------------------------------------------===// void DAGCombiner::Run(CombineLevel AtLevel) { // set the instance variables, so that the various visit routines may use it. Level = AtLevel; LegalOperations = Level >= AfterLegalizeVectorOps; LegalTypes = Level >= AfterLegalizeTypes; // Add all the dag nodes to the worklist. for (SDNode &Node : DAG.allnodes()) AddToWorklist(&Node); // Create a dummy node (which is not added to allnodes), that adds a reference // to the root node, preventing it from being deleted, and tracking any // changes of the root. HandleSDNode Dummy(DAG.getRoot()); // While the worklist isn't empty, find a node and try to combine it. while (!WorklistMap.empty()) { SDNode *N; // The Worklist holds the SDNodes in order, but it may contain null entries. do { N = Worklist.pop_back_val(); } while (!N); bool GoodWorklistEntry = WorklistMap.erase(N); (void)GoodWorklistEntry; assert(GoodWorklistEntry && "Found a worklist entry without a corresponding map entry!"); // If N has no uses, it is dead. Make sure to revisit all N's operands once // N is deleted from the DAG, since they too may now be dead or may have a // reduced number of uses, allowing other xforms. if (recursivelyDeleteUnusedNodes(N)) continue; WorklistRemover DeadNodes(*this); // If this combine is running after legalizing the DAG, re-legalize any // nodes pulled off the worklist. if (Level == AfterLegalizeDAG) { SmallSetVector UpdatedNodes; bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes); for (SDNode *LN : UpdatedNodes) { AddToWorklist(LN); AddUsersToWorklist(LN); } if (!NIsValid) continue; } LLVM_DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG)); // Add any operands of the new node which have not yet been combined to the // worklist as well. Because the worklist uniques things already, this // won't repeatedly process the same operand. CombinedNodes.insert(N); for (const SDValue &ChildN : N->op_values()) if (!CombinedNodes.count(ChildN.getNode())) AddToWorklist(ChildN.getNode()); SDValue RV = combine(N); if (!RV.getNode()) continue; ++NodesCombined; // If we get back the same node we passed in, rather than a new node or // zero, we know that the node must have defined multiple values and // CombineTo was used. Since CombineTo takes care of the worklist // mechanics for us, we have no work to do in this case. if (RV.getNode() == N) continue; assert(N->getOpcode() != ISD::DELETED_NODE && RV.getOpcode() != ISD::DELETED_NODE && "Node was deleted but visit returned new node!"); LLVM_DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG)); if (N->getNumValues() == RV.getNode()->getNumValues()) DAG.ReplaceAllUsesWith(N, RV.getNode()); else { assert(N->getValueType(0) == RV.getValueType() && N->getNumValues() == 1 && "Type mismatch"); DAG.ReplaceAllUsesWith(N, &RV); } // Push the new node and any users onto the worklist AddToWorklist(RV.getNode()); AddUsersToWorklist(RV.getNode()); // Finally, if the node is now dead, remove it from the graph. The node // may not be dead if the replacement process recursively simplified to // something else needing this node. This will also take care of adding any // operands which have lost a user to the worklist. recursivelyDeleteUnusedNodes(N); } // If the root changed (e.g. it was a dead load, update the root). DAG.setRoot(Dummy.getValue()); DAG.RemoveDeadNodes(); } SDValue DAGCombiner::visit(SDNode *N) { switch (N->getOpcode()) { default: break; case ISD::TokenFactor: return visitTokenFactor(N); case ISD::MERGE_VALUES: return visitMERGE_VALUES(N); case ISD::ADD: return visitADD(N); case ISD::SUB: return visitSUB(N); case ISD::ADDC: return visitADDC(N); case ISD::UADDO: return visitUADDO(N); case ISD::SUBC: return visitSUBC(N); case ISD::USUBO: return visitUSUBO(N); case ISD::ADDE: return visitADDE(N); case ISD::ADDCARRY: return visitADDCARRY(N); case ISD::SUBE: return visitSUBE(N); case ISD::SUBCARRY: return visitSUBCARRY(N); case ISD::MUL: return visitMUL(N); case ISD::SDIV: return visitSDIV(N); case ISD::UDIV: return visitUDIV(N); case ISD::SREM: case ISD::UREM: return visitREM(N); case ISD::MULHU: return visitMULHU(N); case ISD::MULHS: return visitMULHS(N); case ISD::SMUL_LOHI: return visitSMUL_LOHI(N); case ISD::UMUL_LOHI: return visitUMUL_LOHI(N); case ISD::SMULO: return visitSMULO(N); case ISD::UMULO: return visitUMULO(N); case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: case ISD::UMAX: return visitIMINMAX(N); case ISD::AND: return visitAND(N); case ISD::OR: return visitOR(N); case ISD::XOR: return visitXOR(N); case ISD::SHL: return visitSHL(N); case ISD::SRA: return visitSRA(N); case ISD::SRL: return visitSRL(N); case ISD::ROTR: case ISD::ROTL: return visitRotate(N); case ISD::ABS: return visitABS(N); case ISD::BSWAP: return visitBSWAP(N); case ISD::BITREVERSE: return visitBITREVERSE(N); case ISD::CTLZ: return visitCTLZ(N); case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N); case ISD::CTTZ: return visitCTTZ(N); case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N); case ISD::CTPOP: return visitCTPOP(N); case ISD::SELECT: return visitSELECT(N); case ISD::VSELECT: return visitVSELECT(N); case ISD::SELECT_CC: return visitSELECT_CC(N); case ISD::SETCC: return visitSETCC(N); case ISD::SETCCCARRY: return visitSETCCCARRY(N); case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N); case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N); case ISD::ANY_EXTEND: return visitANY_EXTEND(N); case ISD::AssertSext: case ISD::AssertZext: return visitAssertExt(N); case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N); case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N); case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N); case ISD::TRUNCATE: return visitTRUNCATE(N); case ISD::BITCAST: return visitBITCAST(N); case ISD::BUILD_PAIR: return visitBUILD_PAIR(N); case ISD::FADD: return visitFADD(N); case ISD::FSUB: return visitFSUB(N); case ISD::FMUL: return visitFMUL(N); case ISD::FMA: return visitFMA(N); case ISD::FDIV: return visitFDIV(N); case ISD::FREM: return visitFREM(N); case ISD::FSQRT: return visitFSQRT(N); case ISD::FCOPYSIGN: return visitFCOPYSIGN(N); case ISD::SINT_TO_FP: return visitSINT_TO_FP(N); case ISD::UINT_TO_FP: return visitUINT_TO_FP(N); case ISD::FP_TO_SINT: return visitFP_TO_SINT(N); case ISD::FP_TO_UINT: return visitFP_TO_UINT(N); case ISD::FP_ROUND: return visitFP_ROUND(N); case ISD::FP_ROUND_INREG: return visitFP_ROUND_INREG(N); case ISD::FP_EXTEND: return visitFP_EXTEND(N); case ISD::FNEG: return visitFNEG(N); case ISD::FABS: return visitFABS(N); case ISD::FFLOOR: return visitFFLOOR(N); case ISD::FMINNUM: return visitFMINNUM(N); case ISD::FMAXNUM: return visitFMAXNUM(N); case ISD::FCEIL: return visitFCEIL(N); case ISD::FTRUNC: return visitFTRUNC(N); case ISD::BRCOND: return visitBRCOND(N); case ISD::BR_CC: return visitBR_CC(N); case ISD::LOAD: return visitLOAD(N); case ISD::STORE: return visitSTORE(N); case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N); case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N); case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N); case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N); case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N); case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N); case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N); case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N); case ISD::MGATHER: return visitMGATHER(N); case ISD::MLOAD: return visitMLOAD(N); case ISD::MSCATTER: return visitMSCATTER(N); case ISD::MSTORE: return visitMSTORE(N); case ISD::FP_TO_FP16: return visitFP_TO_FP16(N); case ISD::FP16_TO_FP: return visitFP16_TO_FP(N); } return SDValue(); } SDValue DAGCombiner::combine(SDNode *N) { SDValue RV = visit(N); // If nothing happened, try a target-specific DAG combine. if (!RV.getNode()) { assert(N->getOpcode() != ISD::DELETED_NODE && "Node was deleted but visit returned NULL!"); if (N->getOpcode() >= ISD::BUILTIN_OP_END || TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) { // Expose the DAG combiner to the target combiner impls. TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this); RV = TLI.PerformDAGCombine(N, DagCombineInfo); } } // If nothing happened still, try promoting the operation. if (!RV.getNode()) { switch (N->getOpcode()) { default: break; case ISD::ADD: case ISD::SUB: case ISD::MUL: case ISD::AND: case ISD::OR: case ISD::XOR: RV = PromoteIntBinOp(SDValue(N, 0)); break; case ISD::SHL: case ISD::SRA: case ISD::SRL: RV = PromoteIntShiftOp(SDValue(N, 0)); break; case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: RV = PromoteExtend(SDValue(N, 0)); break; case ISD::LOAD: if (PromoteLoad(SDValue(N, 0))) RV = SDValue(N, 0); break; } } // If N is a commutative binary node, try eliminate it if the commuted // version is already present in the DAG. if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode()) && N->getNumValues() == 1) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Constant operands are canonicalized to RHS. if (N0 != N1 && (isa(N0) || !isa(N1))) { SDValue Ops[] = {N1, N0}; SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops, N->getFlags()); if (CSENode) return SDValue(CSENode, 0); } } return RV; } /// Given a node, return its input chain if it has one, otherwise return a null /// sd operand. static SDValue getInputChainForNode(SDNode *N) { if (unsigned NumOps = N->getNumOperands()) { if (N->getOperand(0).getValueType() == MVT::Other) return N->getOperand(0); if (N->getOperand(NumOps-1).getValueType() == MVT::Other) return N->getOperand(NumOps-1); for (unsigned i = 1; i < NumOps-1; ++i) if (N->getOperand(i).getValueType() == MVT::Other) return N->getOperand(i); } return SDValue(); } SDValue DAGCombiner::visitTokenFactor(SDNode *N) { // If N has two operands, where one has an input chain equal to the other, // the 'other' chain is redundant. if (N->getNumOperands() == 2) { if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1)) return N->getOperand(0); if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0)) return N->getOperand(1); } // Don't simplify token factors if optnone. if (OptLevel == CodeGenOpt::None) return SDValue(); SmallVector TFs; // List of token factors to visit. SmallVector Ops; // Ops for replacing token factor. SmallPtrSet SeenOps; bool Changed = false; // If we should replace this token factor. // Start out with this token factor. TFs.push_back(N); // Iterate through token factors. The TFs grows when new token factors are // encountered. for (unsigned i = 0; i < TFs.size(); ++i) { SDNode *TF = TFs[i]; // Check each of the operands. for (const SDValue &Op : TF->op_values()) { switch (Op.getOpcode()) { case ISD::EntryToken: // Entry tokens don't need to be added to the list. They are // redundant. Changed = true; break; case ISD::TokenFactor: if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) { // Queue up for processing. TFs.push_back(Op.getNode()); // Clean up in case the token factor is removed. AddToWorklist(Op.getNode()); Changed = true; break; } LLVM_FALLTHROUGH; default: // Only add if it isn't already in the list. if (SeenOps.insert(Op.getNode()).second) Ops.push_back(Op); else Changed = true; break; } } } // Remove Nodes that are chained to another node in the list. Do so // by walking up chains breath-first stopping when we've seen // another operand. In general we must climb to the EntryNode, but we can exit // early if we find all remaining work is associated with just one operand as // no further pruning is possible. // List of nodes to search through and original Ops from which they originate. SmallVector, 8> Worklist; SmallVector OpWorkCount; // Count of work for each Op. SmallPtrSet SeenChains; bool DidPruneOps = false; unsigned NumLeftToConsider = 0; for (const SDValue &Op : Ops) { Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++)); OpWorkCount.push_back(1); } auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) { // If this is an Op, we can remove the op from the list. Remark any // search associated with it as from the current OpNumber. if (SeenOps.count(Op) != 0) { Changed = true; DidPruneOps = true; unsigned OrigOpNumber = 0; while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op) OrigOpNumber++; assert((OrigOpNumber != Ops.size()) && "expected to find TokenFactor Operand"); // Re-mark worklist from OrigOpNumber to OpNumber for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) { if (Worklist[i].second == OrigOpNumber) { Worklist[i].second = OpNumber; } } OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber]; OpWorkCount[OrigOpNumber] = 0; NumLeftToConsider--; } // Add if it's a new chain if (SeenChains.insert(Op).second) { OpWorkCount[OpNumber]++; Worklist.push_back(std::make_pair(Op, OpNumber)); } }; for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) { // We need at least be consider at least 2 Ops to prune. if (NumLeftToConsider <= 1) break; auto CurNode = Worklist[i].first; auto CurOpNumber = Worklist[i].second; assert((OpWorkCount[CurOpNumber] > 0) && "Node should not appear in worklist"); switch (CurNode->getOpcode()) { case ISD::EntryToken: // Hitting EntryToken is the only way for the search to terminate without // hitting // another operand's search. Prevent us from marking this operand // considered. NumLeftToConsider++; break; case ISD::TokenFactor: for (const SDValue &Op : CurNode->op_values()) AddToWorklist(i, Op.getNode(), CurOpNumber); break; case ISD::CopyFromReg: case ISD::CopyToReg: AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber); break; default: if (auto *MemNode = dyn_cast(CurNode)) AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber); break; } OpWorkCount[CurOpNumber]--; if (OpWorkCount[CurOpNumber] == 0) NumLeftToConsider--; } // If we've changed things around then replace token factor. if (Changed) { SDValue Result; if (Ops.empty()) { // The entry token is the only possible outcome. Result = DAG.getEntryNode(); } else { if (DidPruneOps) { SmallVector PrunedOps; // for (const SDValue &Op : Ops) { if (SeenChains.count(Op.getNode()) == 0) PrunedOps.push_back(Op); } Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps); } else { Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); } } return Result; } return SDValue(); } /// MERGE_VALUES can always be eliminated. SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) { WorklistRemover DeadNodes(*this); // Replacing results may cause a different MERGE_VALUES to suddenly // be CSE'd with N, and carry its uses with it. Iterate until no // uses remain, to ensure that the node can be safely deleted. // First add the users of this node to the work list so that they // can be tried again once they have new operands. AddUsersToWorklist(N); do { for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i)); } while (!N->use_empty()); deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } /// If \p N is a ConstantSDNode with isOpaque() == false return it casted to a /// ConstantSDNode pointer else nullptr. static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) { ConstantSDNode *Const = dyn_cast(N); return Const != nullptr && !Const->isOpaque() ? Const : nullptr; } SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) { auto BinOpcode = BO->getOpcode(); assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB || BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV || BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM || BinOpcode == ISD::UREM || BinOpcode == ISD::AND || BinOpcode == ISD::OR || BinOpcode == ISD::XOR || BinOpcode == ISD::SHL || BinOpcode == ISD::SRL || BinOpcode == ISD::SRA || BinOpcode == ISD::FADD || BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL || BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) && "Unexpected binary operator"); // Don't do this unless the old select is going away. We want to eliminate the // binary operator, not replace a binop with a select. // TODO: Handle ISD::SELECT_CC. unsigned SelOpNo = 0; SDValue Sel = BO->getOperand(0); if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) { SelOpNo = 1; Sel = BO->getOperand(1); } if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) return SDValue(); SDValue CT = Sel.getOperand(1); if (!isConstantOrConstantVector(CT, true) && !isConstantFPBuildVectorOrConstantFP(CT)) return SDValue(); SDValue CF = Sel.getOperand(2); if (!isConstantOrConstantVector(CF, true) && !isConstantFPBuildVectorOrConstantFP(CF)) return SDValue(); // Bail out if any constants are opaque because we can't constant fold those. // The exception is "and" and "or" with either 0 or -1 in which case we can // propagate non constant operands into select. I.e.: // and (select Cond, 0, -1), X --> select Cond, 0, X // or X, (select Cond, -1, 0) --> select Cond, -1, X bool CanFoldNonConst = (BinOpcode == ISD::AND || BinOpcode == ISD::OR) && (isNullConstantOrNullSplatConstant(CT) || isAllOnesConstantOrAllOnesSplatConstant(CT)) && (isNullConstantOrNullSplatConstant(CF) || isAllOnesConstantOrAllOnesSplatConstant(CF)); SDValue CBO = BO->getOperand(SelOpNo ^ 1); if (!CanFoldNonConst && !isConstantOrConstantVector(CBO, true) && !isConstantFPBuildVectorOrConstantFP(CBO)) return SDValue(); EVT VT = Sel.getValueType(); // In case of shift value and shift amount may have different VT. For instance // on x86 shift amount is i8 regardles of LHS type. Bail out if we have // swapped operands and value types do not match. NB: x86 is fine if operands // are not swapped with shift amount VT being not bigger than shifted value. // TODO: that is possible to check for a shift operation, correct VTs and // still perform optimization on x86 if needed. if (SelOpNo && VT != CBO.getValueType()) return SDValue(); // We have a select-of-constants followed by a binary operator with a // constant. Eliminate the binop by pulling the constant math into the select. // Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO SDLoc DL(Sel); SDValue NewCT = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CT) : DAG.getNode(BinOpcode, DL, VT, CT, CBO); if (!CanFoldNonConst && !NewCT.isUndef() && !isConstantOrConstantVector(NewCT, true) && !isConstantFPBuildVectorOrConstantFP(NewCT)) return SDValue(); SDValue NewCF = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CF) : DAG.getNode(BinOpcode, DL, VT, CF, CBO); if (!CanFoldNonConst && !NewCF.isUndef() && !isConstantOrConstantVector(NewCF, true) && !isConstantFPBuildVectorOrConstantFP(NewCF)) return SDValue(); return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF); } static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) { assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && "Expecting add or sub"); // Match a constant operand and a zext operand for the math instruction: // add Z, C // sub C, Z bool IsAdd = N->getOpcode() == ISD::ADD; SDValue C = IsAdd ? N->getOperand(1) : N->getOperand(0); SDValue Z = IsAdd ? N->getOperand(0) : N->getOperand(1); auto *CN = dyn_cast(C); if (!CN || Z.getOpcode() != ISD::ZERO_EXTEND) return SDValue(); // Match the zext operand as a setcc of a boolean. if (Z.getOperand(0).getOpcode() != ISD::SETCC || Z.getOperand(0).getValueType() != MVT::i1) return SDValue(); // Match the compare as: setcc (X & 1), 0, eq. SDValue SetCC = Z.getOperand(0); ISD::CondCode CC = cast(SetCC->getOperand(2))->get(); if (CC != ISD::SETEQ || !isNullConstant(SetCC.getOperand(1)) || SetCC.getOperand(0).getOpcode() != ISD::AND || !isOneConstant(SetCC.getOperand(0).getOperand(1))) return SDValue(); // We are adding/subtracting a constant and an inverted low bit. Turn that // into a subtract/add of the low bit with incremented/decremented constant: // add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1)) // sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1)) EVT VT = C.getValueType(); SDLoc DL(N); SDValue LowBit = DAG.getZExtOrTrunc(SetCC.getOperand(0), DL, VT); SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) : DAG.getConstant(CN->getAPIntValue() - 1, DL, VT); return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit); } /// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into /// a shift and add with a different constant. static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) { assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && "Expecting add or sub"); // We need a constant operand for the add/sub, and the other operand is a // logical shift right: add (srl), C or sub C, (srl). bool IsAdd = N->getOpcode() == ISD::ADD; SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0); SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1); ConstantSDNode *C = isConstOrConstSplat(ConstantOp); if (!C || ShiftOp.getOpcode() != ISD::SRL) return SDValue(); // The shift must be of a 'not' value. // TODO: Use isBitwiseNot() if it works with vectors. SDValue Not = ShiftOp.getOperand(0); if (!Not.hasOneUse() || Not.getOpcode() != ISD::XOR || !isAllOnesConstantOrAllOnesSplatConstant(Not.getOperand(1))) return SDValue(); // The shift must be moving the sign bit to the least-significant-bit. EVT VT = ShiftOp.getValueType(); SDValue ShAmt = ShiftOp.getOperand(1); ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt); if (!ShAmtC || ShAmtC->getZExtValue() != VT.getScalarSizeInBits() - 1) return SDValue(); // Eliminate the 'not' by adjusting the shift and add/sub constant: // add (srl (not X), 31), C --> add (sra X, 31), (C + 1) // sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1) SDLoc DL(N); auto ShOpcode = IsAdd ? ISD::SRA : ISD::SRL; SDValue NewShift = DAG.getNode(ShOpcode, DL, VT, Not.getOperand(0), ShAmt); APInt NewC = IsAdd ? C->getAPIntValue() + 1 : C->getAPIntValue() - 1; return DAG.getNode(ISD::ADD, DL, VT, NewShift, DAG.getConstant(NewC, DL, VT)); } SDValue DAGCombiner::visitADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); SDLoc DL(N); // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (add x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; if (ISD::isBuildVectorAllZeros(N0.getNode())) return N1; } // fold (add x, undef) -> undef if (N0.isUndef()) return N0; if (N1.isUndef()) return N1; if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { // canonicalize constant to RHS if (!DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::ADD, DL, VT, N1, N0); // fold (add c1, c2) -> c1+c2 return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, N0.getNode(), N1.getNode()); } // fold (add x, 0) -> x if (isNullConstant(N1)) return N0; if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) { // fold ((c1-A)+c2) -> (c1+c2)-A if (N0.getOpcode() == ISD::SUB && isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) { // FIXME: Adding 2 constants should be handled by FoldConstantArithmetic. return DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)), N0.getOperand(1)); } // add (sext i1 X), 1 -> zext (not i1 X) // We don't transform this pattern: // add (zext i1 X), -1 -> sext (not i1 X) // because most (?) targets generate better code for the zext form. if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() && isOneConstantOrOneSplatConstant(N1)) { SDValue X = N0.getOperand(0); if ((!LegalOperations || (TLI.isOperationLegal(ISD::XOR, X.getValueType()) && TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) && X.getScalarValueSizeInBits() == 1) { SDValue Not = DAG.getNOT(DL, X, X.getValueType()); return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not); } } // Undo the add -> or combine to merge constant offsets from a frame index. if (N0.getOpcode() == ISD::OR && isa(N0.getOperand(0)) && isa(N0.getOperand(1)) && DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) { SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1)); return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0); } } if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // reassociate add if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1)) return RADD; // fold ((0-A) + B) -> B-A if (N0.getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N0.getOperand(0))) return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1)); // fold (A + (0-B)) -> A-B if (N1.getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N1.getOperand(0))) return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1)); // fold (A+(B-A)) -> B if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1)) return N1.getOperand(0); // fold ((B-A)+A) -> B if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1)) return N0.getOperand(0); // fold (A+(B-(A+C))) to (B-C) if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD && N0 == N1.getOperand(1).getOperand(0)) return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0), N1.getOperand(1).getOperand(1)); // fold (A+(B-(C+A))) to (B-C) if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD && N0 == N1.getOperand(1).getOperand(1)) return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0), N1.getOperand(1).getOperand(0)); // fold (A+((B-A)+or-C)) to (B+or-C) if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) && N1.getOperand(0).getOpcode() == ISD::SUB && N0 == N1.getOperand(0).getOperand(1)) return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0), N1.getOperand(1)); // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDValue N10 = N1.getOperand(0); SDValue N11 = N1.getOperand(1); if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10)) return DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10), DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11)); } if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG)) return V; if (SDValue V = foldAddSubOfSignBit(N, DAG)) return V; if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (a+b) -> (a|b) iff a and b share no bits. if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) && DAG.haveNoCommonBitsSet(N0, N1)) return DAG.getNode(ISD::OR, DL, VT, N0, N1); // fold (add (xor a, -1), 1) -> (sub 0, a) if (isBitwiseNot(N0) && isOneConstantOrOneSplatConstant(N1)) return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0.getOperand(0)); if (SDValue Combined = visitADDLike(N0, N1, N)) return Combined; if (SDValue Combined = visitADDLike(N1, N0, N)) return Combined; return SDValue(); } static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) { bool Masked = false; // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization. while (true) { if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) { V = V.getOperand(0); continue; } if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) { Masked = true; V = V.getOperand(0); continue; } break; } // If this is not a carry, return. if (V.getResNo() != 1) return SDValue(); if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY && V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO) return SDValue(); // If the result is masked, then no matter what kind of bool it is we can // return. If it isn't, then we need to make sure the bool type is either 0 or // 1 and not other values. if (Masked || TLI.getBooleanContents(V.getValueType()) == TargetLoweringBase::ZeroOrOneBooleanContent) return V; return SDValue(); } SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) { EVT VT = N0.getValueType(); SDLoc DL(LocReference); // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n)) if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) return DAG.getNode(ISD::SUB, DL, VT, N0, DAG.getNode(ISD::SHL, DL, VT, N1.getOperand(0).getOperand(1), N1.getOperand(1))); if (N1.getOpcode() == ISD::AND) { SDValue AndOp0 = N1.getOperand(0); unsigned NumSignBits = DAG.ComputeNumSignBits(AndOp0); unsigned DestBits = VT.getScalarSizeInBits(); // (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x)) // and similar xforms where the inner op is either ~0 or 0. if (NumSignBits == DestBits && isOneConstantOrOneSplatConstant(N1->getOperand(1))) return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0); } // add (sext i1), X -> sub X, (zext i1) if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.getOperand(0).getValueType() == MVT::i1 && !TLI.isOperationLegal(ISD::SIGN_EXTEND, MVT::i1)) { SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)); return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt); } // add X, (sextinreg Y i1) -> sub X, (and Y 1) if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) { VTSDNode *TN = cast(N1.getOperand(1)); if (TN->getVT() == MVT::i1) { SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), DAG.getConstant(1, DL, VT)); return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt); } } // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) && N1.getResNo() == 0) return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(), N0, N1.getOperand(0), N1.getOperand(2)); // (add X, Carry) -> (addcarry X, 0, Carry) if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) if (SDValue Carry = getAsCarry(TLI, N1)) return DAG.getNode(ISD::ADDCARRY, DL, DAG.getVTList(VT, Carry.getValueType()), N0, DAG.getConstant(0, DL, VT), Carry); return SDValue(); } SDValue DAGCombiner::visitADDC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); SDLoc DL(N); // If the flag result is dead, turn this into an ADD. if (!N->hasAnyUseOfValue(1)) return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // canonicalize constant to RHS. ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && !N1C) return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0); // fold (addc x, 0) -> x + no carry out if (isNullConstant(N1)) return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // If it cannot overflow, transform into an add. if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); return SDValue(); } static SDValue flipBoolean(SDValue V, const SDLoc &DL, EVT VT, SelectionDAG &DAG, const TargetLowering &TLI) { SDValue Cst; switch (TLI.getBooleanContents(VT)) { case TargetLowering::ZeroOrOneBooleanContent: case TargetLowering::UndefinedBooleanContent: Cst = DAG.getConstant(1, DL, VT); break; case TargetLowering::ZeroOrNegativeOneBooleanContent: Cst = DAG.getConstant(-1, DL, VT); break; } return DAG.getNode(ISD::XOR, DL, VT, V, Cst); } static bool isBooleanFlip(SDValue V, EVT VT, const TargetLowering &TLI) { if (V.getOpcode() != ISD::XOR) return false; ConstantSDNode *Const = dyn_cast(V.getOperand(1)); if (!Const) return false; switch(TLI.getBooleanContents(VT)) { case TargetLowering::ZeroOrOneBooleanContent: return Const->isOne(); case TargetLowering::ZeroOrNegativeOneBooleanContent: return Const->isAllOnesValue(); case TargetLowering::UndefinedBooleanContent: return (Const->getAPIntValue() & 0x01) == 1; } llvm_unreachable("Unsupported boolean content"); } SDValue DAGCombiner::visitUADDO(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); if (VT.isVector()) return SDValue(); EVT CarryVT = N->getValueType(1); SDLoc DL(N); // If the flag result is dead, turn this into an ADD. if (!N->hasAnyUseOfValue(1)) return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), DAG.getUNDEF(CarryVT)); // canonicalize constant to RHS. ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && !N1C) return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0); // fold (uaddo x, 0) -> x + no carry out if (isNullConstant(N1)) return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); // If it cannot overflow, transform into an add. if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never) return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1), DAG.getConstant(0, DL, CarryVT)); // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry. if (isBitwiseNot(N0) && isOneConstantOrOneSplatConstant(N1)) { SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(), DAG.getConstant(0, DL, VT), N0.getOperand(0)); return CombineTo(N, Sub, flipBoolean(Sub.getValue(1), DL, CarryVT, DAG, TLI)); } if (SDValue Combined = visitUADDOLike(N0, N1, N)) return Combined; if (SDValue Combined = visitUADDOLike(N1, N0, N)) return Combined; return SDValue(); } SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) { auto VT = N0.getValueType(); // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry) // If Y + 1 cannot overflow. if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) { SDValue Y = N1.getOperand(0); SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType()); if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never) return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y, N1.getOperand(2)); } // (uaddo X, Carry) -> (addcarry X, 0, Carry) if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) if (SDValue Carry = getAsCarry(TLI, N1)) return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, DAG.getConstant(0, SDLoc(N), VT), Carry); return SDValue(); } SDValue DAGCombiner::visitADDE(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); // canonicalize constant to RHS ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && !N1C) return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(), N1, N0, CarryIn); // fold (adde x, y, false) -> (addc x, y) if (CarryIn.getOpcode() == ISD::CARRY_FALSE) return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1); return SDValue(); } SDValue DAGCombiner::visitADDCARRY(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); SDLoc DL(N); // canonicalize constant to RHS ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && !N1C) return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn); // fold (addcarry x, y, false) -> (uaddo x, y) if (isNullConstant(CarryIn)) { if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0))) return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1); } EVT CarryVT = CarryIn.getValueType(); // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry. if (isNullConstant(N0) && isNullConstant(N1)) { EVT VT = N0.getValueType(); SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT); AddToWorklist(CarryExt.getNode()); return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt, DAG.getConstant(1, DL, VT)), DAG.getConstant(0, DL, CarryVT)); } // fold (addcarry (xor a, -1), 0, !b) -> (subcarry 0, a, b) and flip carry. if (isBitwiseNot(N0) && isNullConstant(N1) && isBooleanFlip(CarryIn, CarryVT, TLI)) { SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), DAG.getConstant(0, DL, N0.getValueType()), N0.getOperand(0), CarryIn.getOperand(0)); return CombineTo(N, Sub, flipBoolean(Sub.getValue(1), DL, CarryVT, DAG, TLI)); } if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N)) return Combined; if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N)) return Combined; return SDValue(); } SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N) { // Iff the flag result is dead: // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry) if ((N0.getOpcode() == ISD::ADD || (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0)) && isNullConstant(N1) && !N->hasAnyUseOfValue(1)) return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0.getOperand(0), N0.getOperand(1), CarryIn); /** * When one of the addcarry argument is itself a carry, we may be facing * a diamond carry propagation. In which case we try to transform the DAG * to ensure linear carry propagation if that is possible. * * We are trying to get: * (addcarry X, 0, (addcarry A, B, Z):Carry) */ if (auto Y = getAsCarry(TLI, N1)) { /** * (uaddo A, B) * / \ * Carry Sum * | \ * | (addcarry *, 0, Z) * | / * \ Carry * | / * (addcarry X, *, *) */ if (Y.getOpcode() == ISD::UADDO && CarryIn.getResNo() == 1 && CarryIn.getOpcode() == ISD::ADDCARRY && isNullConstant(CarryIn.getOperand(1)) && CarryIn.getOperand(0) == Y.getValue(0)) { auto NewY = DAG.getNode(ISD::ADDCARRY, SDLoc(N), Y->getVTList(), Y.getOperand(0), Y.getOperand(1), CarryIn.getOperand(2)); AddToWorklist(NewY.getNode()); return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, DAG.getConstant(0, SDLoc(N), N0.getValueType()), NewY.getValue(1)); } } return SDValue(); } // Since it may not be valid to emit a fold to zero for vector initializers // check if we can before folding. static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT, SelectionDAG &DAG, bool LegalOperations, bool LegalTypes) { if (!VT.isVector()) return DAG.getConstant(0, DL, VT); if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) return DAG.getConstant(0, DL, VT); return SDValue(); } SDValue DAGCombiner::visitSUB(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); SDLoc DL(N); // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (sub x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; } // fold (sub x, x) -> 0 // FIXME: Refactor this and xor and other similar operations together. if (N0 == N1) return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations, LegalTypes); if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && DAG.isConstantIntBuildVectorOrConstantInt(N1)) { // fold (sub c1, c2) -> c1-c2 return DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N0.getNode(), N1.getNode()); } if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); // fold (sub x, c) -> (add x, -c) if (N1C) { return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getConstant(-N1C->getAPIntValue(), DL, VT)); } if (isNullConstantOrNullSplatConstant(N0)) { unsigned BitWidth = VT.getScalarSizeInBits(); // Right-shifting everything out but the sign bit followed by negation is // the same as flipping arithmetic/logical shift type without the negation: // -(X >>u 31) -> (X >>s 31) // -(X >>s 31) -> (X >>u 31) if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) { ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1)); if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1) { auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA; if (!LegalOperations || TLI.isOperationLegal(NewSh, VT)) return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1)); } } // 0 - X --> 0 if the sub is NUW. if (N->getFlags().hasNoUnsignedWrap()) return N0; if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) { // N1 is either 0 or the minimum signed value. If the sub is NSW, then // N1 must be 0 because negating the minimum signed value is undefined. if (N->getFlags().hasNoSignedWrap()) return N0; // 0 - X --> X if X is 0 or the minimum signed value. return N1; } } // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) if (isAllOnesConstantOrAllOnesSplatConstant(N0)) return DAG.getNode(ISD::XOR, DL, VT, N1, N0); // fold (A - (0-B)) -> A+B if (N1.getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N1.getOperand(0))) return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1)); // fold A-(A-B) -> B if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0)) return N1.getOperand(1); // fold (A+B)-A -> B if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1) return N0.getOperand(1); // fold (A+B)-B -> A if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1) return N0.getOperand(0); // fold C2-(A+C1) -> (C2-C1)-A if (N1.getOpcode() == ISD::ADD) { SDValue N11 = N1.getOperand(1); if (isConstantOrConstantVector(N0, /* NoOpaques */ true) && isConstantOrConstantVector(N11, /* NoOpaques */ true)) { SDValue NewC = DAG.getNode(ISD::SUB, DL, VT, N0, N11); return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0)); } } // fold ((A+(B+or-C))-B) -> A+or-C if (N0.getOpcode() == ISD::ADD && (N0.getOperand(1).getOpcode() == ISD::SUB || N0.getOperand(1).getOpcode() == ISD::ADD) && N0.getOperand(1).getOperand(0) == N1) return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0), N0.getOperand(1).getOperand(1)); // fold ((A+(C+B))-B) -> A+C if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD && N0.getOperand(1).getOperand(1) == N1) return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N0.getOperand(1).getOperand(0)); // fold ((A-(B-C))-C) -> A-B if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB && N0.getOperand(1).getOperand(1) == N1) return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N0.getOperand(1).getOperand(0)); // fold (A-(B-C)) -> A+(C-B) if (N1.getOpcode() == ISD::SUB && N1.hasOneUse()) return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1), N1.getOperand(0))); // fold (X - (-Y * Z)) -> (X + (Y * Z)) if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) { if (N1.getOperand(0).getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) { SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, N1.getOperand(0).getOperand(1), N1.getOperand(1)); return DAG.getNode(ISD::ADD, DL, VT, N0, Mul); } if (N1.getOperand(1).getOpcode() == ISD::SUB && isNullConstantOrNullSplatConstant(N1.getOperand(1).getOperand(0))) { SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, N1.getOperand(0), N1.getOperand(1).getOperand(1)); return DAG.getNode(ISD::ADD, DL, VT, N0, Mul); } } // If either operand of a sub is undef, the result is undef if (N0.isUndef()) return N0; if (N1.isUndef()) return N1; if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG)) return V; if (SDValue V = foldAddSubOfSignBit(N, DAG)) return V; // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X) if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) { SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1); SDValue S0 = N1.getOperand(0); if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) { unsigned OpSizeInBits = VT.getScalarSizeInBits(); if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) if (C->getAPIntValue() == (OpSizeInBits - 1)) return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0); } } } // If the relocation model supports it, consider symbol offsets. if (GlobalAddressSDNode *GA = dyn_cast(N0)) if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) { // fold (sub Sym, c) -> Sym-c if (N1C && GA->getOpcode() == ISD::GlobalAddress) return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT, GA->getOffset() - (uint64_t)N1C->getSExtValue()); // fold (sub Sym+c1, Sym+c2) -> c1-c2 if (GlobalAddressSDNode *GB = dyn_cast(N1)) if (GA->getGlobal() == GB->getGlobal()) return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(), DL, VT); } // sub X, (sextinreg Y i1) -> add X, (and Y 1) if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) { VTSDNode *TN = cast(N1.getOperand(1)); if (TN->getVT() == MVT::i1) { SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0), DAG.getConstant(1, DL, VT)); return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt); } } // Prefer an add for more folding potential and possibly better codegen: // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1) if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) { SDValue ShAmt = N1.getOperand(1); ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt); if (ShAmtC && ShAmtC->getZExtValue() == N1.getScalarValueSizeInBits() - 1) { SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt); return DAG.getNode(ISD::ADD, DL, VT, N0, SRA); } } return SDValue(); } SDValue DAGCombiner::visitSUBC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); SDLoc DL(N); // If the flag result is dead, turn this into an SUB. if (!N->hasAnyUseOfValue(1)) return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // fold (subc x, x) -> 0 + no borrow if (N0 == N1) return CombineTo(N, DAG.getConstant(0, DL, VT), DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // fold (subc x, 0) -> x + no borrow if (isNullConstant(N1)) return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow if (isAllOnesConstant(N0)) return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue)); return SDValue(); } SDValue DAGCombiner::visitUSUBO(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); if (VT.isVector()) return SDValue(); EVT CarryVT = N->getValueType(1); SDLoc DL(N); // If the flag result is dead, turn this into an SUB. if (!N->hasAnyUseOfValue(1)) return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1), DAG.getUNDEF(CarryVT)); // fold (usubo x, x) -> 0 + no borrow if (N0 == N1) return CombineTo(N, DAG.getConstant(0, DL, VT), DAG.getConstant(0, DL, CarryVT)); // fold (usubo x, 0) -> x + no borrow if (isNullConstant(N1)) return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT)); // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow if (isAllOnesConstant(N0)) return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0), DAG.getConstant(0, DL, CarryVT)); return SDValue(); } SDValue DAGCombiner::visitSUBE(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); // fold (sube x, y, false) -> (subc x, y) if (CarryIn.getOpcode() == ISD::CARRY_FALSE) return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1); return SDValue(); } SDValue DAGCombiner::visitSUBCARRY(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); // fold (subcarry x, y, false) -> (usubo x, y) if (isNullConstant(CarryIn)) { if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0))) return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1); } return SDValue(); } SDValue DAGCombiner::visitMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); // fold (mul x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); bool N0IsConst = false; bool N1IsConst = false; bool N1IsOpaqueConst = false; bool N0IsOpaqueConst = false; APInt ConstValue0, ConstValue1; // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0); N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1); assert((!N0IsConst || ConstValue0.getBitWidth() == VT.getScalarSizeInBits()) && "Splat APInt should be element width"); assert((!N1IsConst || ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) && "Splat APInt should be element width"); } else { N0IsConst = isa(N0); if (N0IsConst) { ConstValue0 = cast(N0)->getAPIntValue(); N0IsOpaqueConst = cast(N0)->isOpaque(); } N1IsConst = isa(N1); if (N1IsConst) { ConstValue1 = cast(N1)->getAPIntValue(); N1IsOpaqueConst = cast(N1)->isOpaque(); } } // fold (mul c1, c2) -> c1*c2 if (N0IsConst && N1IsConst && !N0IsOpaqueConst && !N1IsOpaqueConst) return DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, N0.getNode(), N1.getNode()); // canonicalize constant to RHS (vector doesn't have to splat) if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0); // fold (mul x, 0) -> 0 if (N1IsConst && ConstValue1.isNullValue()) return N1; // fold (mul x, 1) -> x if (N1IsConst && ConstValue1.isOneValue()) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (mul x, -1) -> 0-x if (N1IsConst && ConstValue1.isAllOnesValue()) { SDLoc DL(N); return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); } // fold (mul x, (1 << c)) -> x << c if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && DAG.isKnownToBeAPowerOfTwo(N1) && (!VT.isVector() || Level <= AfterLegalizeVectorOps)) { SDLoc DL(N); SDValue LogBase2 = BuildLogBase2(N1, DL); EVT ShiftVT = getShiftAmountTy(N0.getValueType()); SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc); } // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) { unsigned Log2Val = (-ConstValue1).logBase2(); SDLoc DL(N); // FIXME: If the input is something that is easily negated (e.g. a // single-use add), we should put the negate there. return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(Log2Val, DL, getShiftAmountTy(N0.getValueType())))); } // (mul (shl X, c1), c2) -> (mul X, c2 << c1) if (N0.getOpcode() == ISD::SHL && isConstantOrConstantVector(N1, /* NoOpaques */ true) && isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) { SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1)); if (isConstantOrConstantVector(C3)) return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3); } // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one // use. { SDValue Sh(nullptr, 0), Y(nullptr, 0); // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)). if (N0.getOpcode() == ISD::SHL && isConstantOrConstantVector(N0.getOperand(1)) && N0.getNode()->hasOneUse()) { Sh = N0; Y = N1; } else if (N1.getOpcode() == ISD::SHL && isConstantOrConstantVector(N1.getOperand(1)) && N1.getNode()->hasOneUse()) { Sh = N1; Y = N0; } if (Sh.getNode()) { SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y); return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1)); } } // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2) if (DAG.isConstantIntBuildVectorOrConstantInt(N1) && N0.getOpcode() == ISD::ADD && DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) && isMulAddWithConstProfitable(N, N0, N1)) return DAG.getNode(ISD::ADD, SDLoc(N), VT, DAG.getNode(ISD::MUL, SDLoc(N0), VT, N0.getOperand(0), N1), DAG.getNode(ISD::MUL, SDLoc(N1), VT, N0.getOperand(1), N1)); // reassociate mul if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1)) return RMUL; return SDValue(); } /// Return true if divmod libcall is available. static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned, const TargetLowering &TLI) { RTLIB::Libcall LC; EVT NodeType = Node->getValueType(0); if (!NodeType.isSimple()) return false; switch (NodeType.getSimpleVT().SimpleTy) { default: return false; // No libcall for vector types. case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break; } return TLI.getLibcallName(LC) != nullptr; } /// Issue divrem if both quotient and remainder are needed. SDValue DAGCombiner::useDivRem(SDNode *Node) { if (Node->use_empty()) return SDValue(); // This is a dead node, leave it alone. unsigned Opcode = Node->getOpcode(); bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM); unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM; // DivMod lib calls can still work on non-legal types if using lib-calls. EVT VT = Node->getValueType(0); if (VT.isVector() || !VT.isInteger()) return SDValue(); if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT)) return SDValue(); // If DIVREM is going to get expanded into a libcall, // but there is no libcall available, then don't combine. if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) && !isDivRemLibcallAvailable(Node, isSigned, TLI)) return SDValue(); // If div is legal, it's better to do the normal expansion unsigned OtherOpcode = 0; if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) { OtherOpcode = isSigned ? ISD::SREM : ISD::UREM; if (TLI.isOperationLegalOrCustom(Opcode, VT)) return SDValue(); } else { OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV; if (TLI.isOperationLegalOrCustom(OtherOpcode, VT)) return SDValue(); } SDValue Op0 = Node->getOperand(0); SDValue Op1 = Node->getOperand(1); SDValue combined; for (SDNode::use_iterator UI = Op0.getNode()->use_begin(), UE = Op0.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User == Node || User->getOpcode() == ISD::DELETED_NODE || User->use_empty()) continue; // Convert the other matching node(s), too; // otherwise, the DIVREM may get target-legalized into something // target-specific that we won't be able to recognize. unsigned UserOpc = User->getOpcode(); if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) && User->getOperand(0) == Op0 && User->getOperand(1) == Op1) { if (!combined) { if (UserOpc == OtherOpcode) { SDVTList VTs = DAG.getVTList(VT, VT); combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1); } else if (UserOpc == DivRemOpc) { combined = SDValue(User, 0); } else { assert(UserOpc == Opcode); continue; } } if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV) CombineTo(User, combined); else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM) CombineTo(User, combined.getValue(1)); } } return combined; } static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); if (DAG.isUndef(N->getOpcode(), {N0, N1})) return DAG.getUNDEF(VT); // undef / X -> 0 // undef % X -> 0 if (N0.isUndef()) return DAG.getConstant(0, DL, VT); return SDValue(); } SDValue DAGCombiner::visitSDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); EVT CCVT = getSetCCResultType(VT); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; SDLoc DL(N); // fold (sdiv c1, c2) -> c1/c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C && !N0C->isOpaque() && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, N0C, N1C); // fold (sdiv X, 1) -> X if (N1C && N1C->isOne()) return N0; // fold (sdiv X, -1) -> 0-X if (N1C && N1C->isAllOnesValue()) return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0); // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0) if (N1C && N1C->getAPIntValue().isMinSignedValue()) return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT)); if (SDValue V = simplifyDivRem(N, DAG)) return V; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // If we know the sign bits of both operands are zero, strength reduce to a // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2 if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1); if (SDValue V = visitSDIVLike(N0, N1, N)) return V; // sdiv, srem -> sdivrem // If the divisor is constant, then return DIVREM only if isIntDivCheap() is // true. Otherwise, we break the simplification logic in visitREM(). AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue DivRem = useDivRem(N)) return DivRem; return SDValue(); } SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) { SDLoc DL(N); EVT VT = N->getValueType(0); EVT CCVT = getSetCCResultType(VT); unsigned BitWidth = VT.getScalarSizeInBits(); ConstantSDNode *N1C = isConstOrConstSplat(N1); // Helper for determining whether a value is a power-2 constant scalar or a // vector of such elements. auto IsPowerOfTwo = [](ConstantSDNode *C) { if (C->isNullValue() || C->isOpaque()) return false; if (C->getAPIntValue().isPowerOf2()) return true; if ((-C->getAPIntValue()).isPowerOf2()) return true; return false; }; // fold (sdiv X, pow2) -> simple ops after legalize // FIXME: We check for the exact bit here because the generic lowering gives // better results in that case. The target-specific lowering should learn how // to handle exact sdivs efficiently. if (!N->getFlags().hasExact() && ISD::matchUnaryPredicate(N1C ? SDValue(N1C, 0) : N1, IsPowerOfTwo)) { // Target-specific implementation of sdiv x, pow2. if (SDValue Res = BuildSDIVPow2(N)) return Res; // Create constants that are functions of the shift amount value. EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType()); SDValue Bits = DAG.getConstant(BitWidth, DL, ShiftAmtTy); SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT, N1); C1 = DAG.getZExtOrTrunc(C1, DL, ShiftAmtTy); SDValue Inexact = DAG.getNode(ISD::SUB, DL, ShiftAmtTy, Bits, C1); if (!isConstantOrConstantVector(Inexact)) return SDValue(); // Splat the sign bit into the register SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(BitWidth - 1, DL, ShiftAmtTy)); AddToWorklist(Sign.getNode()); // Add (N0 < 0) ? abs2 - 1 : 0; SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, Sign, Inexact); AddToWorklist(Srl.getNode()); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Srl); AddToWorklist(Add.getNode()); SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Add, C1); AddToWorklist(Sra.getNode()); // Special case: (sdiv X, 1) -> X // Special Case: (sdiv X, -1) -> 0-X SDValue One = DAG.getConstant(1, DL, VT); SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); SDValue IsOne = DAG.getSetCC(DL, CCVT, N1, One, ISD::SETEQ); SDValue IsAllOnes = DAG.getSetCC(DL, CCVT, N1, AllOnes, ISD::SETEQ); SDValue IsOneOrAllOnes = DAG.getNode(ISD::OR, DL, CCVT, IsOne, IsAllOnes); Sra = DAG.getSelect(DL, VT, IsOneOrAllOnes, N0, Sra); // If dividing by a positive value, we're done. Otherwise, the result must // be negated. SDValue Zero = DAG.getConstant(0, DL, VT); SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, Zero, Sra); // FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding. SDValue IsNeg = DAG.getSetCC(DL, CCVT, N1, Zero, ISD::SETLT); SDValue Res = DAG.getSelect(DL, VT, IsNeg, Sub, Sra); return Res; } // If integer divide is expensive and we satisfy the requirements, emit an // alternate sequence. Targets may check function attributes for size/speed // trade-offs. AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildSDIV(N)) return Op; return SDValue(); } SDValue DAGCombiner::visitUDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); EVT CCVT = getSetCCResultType(VT); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; SDLoc DL(N); // fold (udiv c1, c2) -> c1/c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C) if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, N0C, N1C)) return Folded; // fold (udiv X, 1) -> X if (N1C && N1C->isOne()) return N0; // fold (udiv X, -1) -> select(X == -1, 1, 0) if (N1C && N1C->getAPIntValue().isAllOnesValue()) return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT)); if (SDValue V = simplifyDivRem(N, DAG)) return V; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; if (SDValue V = visitUDIVLike(N0, N1, N)) return V; // sdiv, srem -> sdivrem // If the divisor is constant, then return DIVREM only if isIntDivCheap() is // true. Otherwise, we break the simplification logic in visitREM(). AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue DivRem = useDivRem(N)) return DivRem; return SDValue(); } SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) { SDLoc DL(N); EVT VT = N->getValueType(0); ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (udiv x, (1 << c)) -> x >>u c if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) && DAG.isKnownToBeAPowerOfTwo(N1)) { SDValue LogBase2 = BuildLogBase2(N1, DL); AddToWorklist(LogBase2.getNode()); EVT ShiftVT = getShiftAmountTy(N0.getValueType()); SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT); AddToWorklist(Trunc.getNode()); return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc); } // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2 if (N1.getOpcode() == ISD::SHL) { SDValue N10 = N1.getOperand(0); if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) && DAG.isKnownToBeAPowerOfTwo(N10)) { SDValue LogBase2 = BuildLogBase2(N10, DL); AddToWorklist(LogBase2.getNode()); EVT ADDVT = N1.getOperand(1).getValueType(); SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT); AddToWorklist(Trunc.getNode()); SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::SRL, DL, VT, N0, Add); } } // fold (udiv x, c) -> alternate AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr)) if (SDValue Op = BuildUDIV(N)) return Op; return SDValue(); } // handles ISD::SREM and ISD::UREM SDValue DAGCombiner::visitREM(SDNode *N) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); EVT CCVT = getSetCCResultType(VT); bool isSigned = (Opcode == ISD::SREM); SDLoc DL(N); // fold (rem c1, c2) -> c1%c2 ConstantSDNode *N0C = isConstOrConstSplat(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C) if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C)) return Folded; // fold (urem X, -1) -> select(X == -1, 0, x) if (!isSigned && N1C && N1C->getAPIntValue().isAllOnesValue()) return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ), DAG.getConstant(0, DL, VT), N0); if (SDValue V = simplifyDivRem(N, DAG)) return V; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; if (isSigned) { // If we know the sign bits of both operands are zero, strength reduce to a // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15 if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UREM, DL, VT, N0, N1); } else { SDValue NegOne = DAG.getAllOnesConstant(DL, VT); if (DAG.isKnownToBeAPowerOfTwo(N1)) { // fold (urem x, pow2) -> (and x, pow2-1) SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); } if (N1.getOpcode() == ISD::SHL && DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) { // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1)) SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0, Add); } } AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); // If X/C can be simplified by the division-by-constant logic, lower // X%C to the equivalent of X-X/C*C. // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the // speculative DIV must not cause a DIVREM conversion. We guard against this // by skipping the simplification if isIntDivCheap(). When div is not cheap, // combine will not return a DIVREM. Regardless, checking cheapness here // makes sense since the simplification results in fatter code. if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) { SDValue OptimizedDiv = isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N); if (OptimizedDiv.getNode() && OptimizedDiv.getOpcode() != ISD::UDIVREM && OptimizedDiv.getOpcode() != ISD::SDIVREM) { SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1); SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul); AddToWorklist(OptimizedDiv.getNode()); AddToWorklist(Mul.getNode()); return Sub; } } // sdiv, srem -> sdivrem if (SDValue DivRem = useDivRem(N)) return DivRem.getValue(1); return SDValue(); } SDValue DAGCombiner::visitMULHS(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); if (VT.isVector()) { // fold (mulhs x, 0) -> 0 if (ISD::isBuildVectorAllZeros(N1.getNode())) return N1; if (ISD::isBuildVectorAllZeros(N0.getNode())) return N0; } // fold (mulhs x, 0) -> 0 if (isNullConstant(N1)) return N1; // fold (mulhs x, 1) -> (sra x, size(x)-1) if (isOneConstant(N1)) return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0, DAG.getConstant(N0.getValueSizeInBits() - 1, DL, getShiftAmountTy(N0.getValueType()))); // fold (mulhs x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, DL, VT); // If the type twice as wide is legal, transform the mulhs to a wider multiply // plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0); N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1); N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1); N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1, DAG.getConstant(SimpleSize, DL, getShiftAmountTy(N1.getValueType()))); return DAG.getNode(ISD::TRUNCATE, DL, VT, N1); } } return SDValue(); } SDValue DAGCombiner::visitMULHU(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc DL(N); if (VT.isVector()) { // fold (mulhu x, 0) -> 0 if (ISD::isBuildVectorAllZeros(N1.getNode())) return N1; if (ISD::isBuildVectorAllZeros(N0.getNode())) return N0; } // fold (mulhu x, 0) -> 0 if (isNullConstant(N1)) return N1; // fold (mulhu x, 1) -> 0 if (isOneConstant(N1)) return DAG.getConstant(0, DL, N0.getValueType()); // fold (mulhu x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, DL, VT); // If the type twice as wide is legal, transform the mulhu to a wider multiply // plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1); N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1); N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1, DAG.getConstant(SimpleSize, DL, getShiftAmountTy(N1.getValueType()))); return DAG.getNode(ISD::TRUNCATE, DL, VT, N1); } } return SDValue(); } /// Perform optimizations common to nodes that compute two values. LoOp and HiOp /// give the opcodes for the two computations that are being performed. Return /// true if a simplification was made. SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp) { // If the high half is not needed, just compute the low half. bool HiExists = N->hasAnyUseOfValue(1); if (!HiExists && (!LegalOperations || TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) { SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops()); return CombineTo(N, Res, Res); } // If the low half is not needed, just compute the high half. bool LoExists = N->hasAnyUseOfValue(0); if (!LoExists && (!LegalOperations || TLI.isOperationLegal(HiOp, N->getValueType(1)))) { SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops()); return CombineTo(N, Res, Res); } // If both halves are used, return as it is. if (LoExists && HiExists) return SDValue(); // If the two computed results can be simplified separately, separate them. if (LoExists) { SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops()); AddToWorklist(Lo.getNode()); SDValue LoOpt = combine(Lo.getNode()); if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() && (!LegalOperations || TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType()))) return CombineTo(N, LoOpt, LoOpt); } if (HiExists) { SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops()); AddToWorklist(Hi.getNode()); SDValue HiOpt = combine(Hi.getNode()); if (HiOpt.getNode() && HiOpt != Hi && (!LegalOperations || TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType()))) return CombineTo(N, HiOpt, HiOpt); } return SDValue(); } SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) { if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS)) return Res; EVT VT = N->getValueType(0); SDLoc DL(N); // If the type is twice as wide is legal, transform the mulhu to a wider // multiply plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0)); SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1)); Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi); // Compute the high part as N1. Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo, DAG.getConstant(SimpleSize, DL, getShiftAmountTy(Lo.getValueType()))); Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi); // Compute the low part as N0. Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo); return CombineTo(N, Lo, Hi); } } return SDValue(); } SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) { if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU)) return Res; EVT VT = N->getValueType(0); SDLoc DL(N); // If the type is twice as wide is legal, transform the mulhu to a wider // multiply plus a shift. if (VT.isSimple() && !VT.isVector()) { MVT Simple = VT.getSimpleVT(); unsigned SimpleSize = Simple.getSizeInBits(); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2); if (TLI.isOperationLegal(ISD::MUL, NewVT)) { SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0)); SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1)); Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi); // Compute the high part as N1. Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo, DAG.getConstant(SimpleSize, DL, getShiftAmountTy(Lo.getValueType()))); Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi); // Compute the low part as N0. Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo); return CombineTo(N, Lo, Hi); } } return SDValue(); } SDValue DAGCombiner::visitSMULO(SDNode *N) { // (smulo x, 2) -> (saddo x, x) if (ConstantSDNode *C2 = dyn_cast(N->getOperand(1))) if (C2->getAPIntValue() == 2) return DAG.getNode(ISD::SADDO, SDLoc(N), N->getVTList(), N->getOperand(0), N->getOperand(0)); return SDValue(); } SDValue DAGCombiner::visitUMULO(SDNode *N) { // (umulo x, 2) -> (uaddo x, x) if (ConstantSDNode *C2 = dyn_cast(N->getOperand(1))) if (C2->getAPIntValue() == 2) return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), N->getOperand(0), N->getOperand(0)); return SDValue(); } SDValue DAGCombiner::visitIMINMAX(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold operation with constant operands. ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); if (N0C && N1C) return DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, N0C, N1C); // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0); // Is sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX. // Only do this if the current op isn't legal and the flipped is. unsigned Opcode = N->getOpcode(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegal(Opcode, VT) && (N0.isUndef() || DAG.SignBitIsZero(N0)) && (N1.isUndef() || DAG.SignBitIsZero(N1))) { unsigned AltOpcode; switch (Opcode) { case ISD::SMIN: AltOpcode = ISD::UMIN; break; case ISD::SMAX: AltOpcode = ISD::UMAX; break; case ISD::UMIN: AltOpcode = ISD::SMIN; break; case ISD::UMAX: AltOpcode = ISD::SMAX; break; default: llvm_unreachable("Unknown MINMAX opcode"); } if (TLI.isOperationLegal(AltOpcode, VT)) return DAG.getNode(AltOpcode, SDLoc(N), VT, N0, N1); } return SDValue(); } /// If this is a binary operator with two operands of the same opcode, try to /// simplify it. SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); EVT VT = N0.getValueType(); assert(N0.getOpcode() == N1.getOpcode() && "Bad input!"); // Bail early if none of these transforms apply. if (N0.getNumOperands() == 0) return SDValue(); // For each of OP in AND/OR/XOR: // fold (OP (zext x), (zext y)) -> (zext (OP x, y)) // fold (OP (sext x), (sext y)) -> (sext (OP x, y)) // fold (OP (aext x), (aext y)) -> (aext (OP x, y)) // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y)) // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free) // // do not sink logical op inside of a vector extend, since it may combine // into a vsetcc. EVT Op0VT = N0.getOperand(0).getValueType(); if ((N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::BSWAP || // Avoid infinite looping with PromoteIntBinOp. (N0.getOpcode() == ISD::ANY_EXTEND && (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) || (N0.getOpcode() == ISD::TRUNCATE && (!TLI.isZExtFree(VT, Op0VT) || !TLI.isTruncateFree(Op0VT, VT)) && TLI.isTypeLegal(Op0VT))) && !VT.isVector() && Op0VT == N1.getOperand(0).getValueType() && (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) { SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0), N0.getOperand(0).getValueType(), N0.getOperand(0), N1.getOperand(0)); AddToWorklist(ORNode.getNode()); return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode); } // For each of OP in SHL/SRL/SRA/AND... // fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z) // fold (or (OP x, z), (OP y, z)) -> (OP (or x, y), z) // fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z) if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) && N0.getOperand(1) == N1.getOperand(1)) { SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0), N0.getOperand(0).getValueType(), N0.getOperand(0), N1.getOperand(0)); AddToWorklist(ORNode.getNode()); return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode, N0.getOperand(1)); } // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B)) // Only perform this optimization up until type legalization, before // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and // we don't want to undo this promotion. // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper // on scalars. if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR) && Level <= AfterLegalizeTypes) { SDValue In0 = N0.getOperand(0); SDValue In1 = N1.getOperand(0); EVT In0Ty = In0.getValueType(); EVT In1Ty = In1.getValueType(); SDLoc DL(N); // If both incoming values are integers, and the original types are the // same. if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) { SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1); SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op); AddToWorklist(Op.getNode()); return BC; } } // Xor/and/or are indifferent to the swizzle operation (shuffle of one value). // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B)) // If both shuffles use the same mask, and both shuffle within a single // vector, then it is worthwhile to move the swizzle after the operation. // The type-legalizer generates this pattern when loading illegal // vector types from memory. In many cases this allows additional shuffle // optimizations. // There are other cases where moving the shuffle after the xor/and/or // is profitable even if shuffles don't perform a swizzle. // If both shuffles use the same mask, and both shuffles have the same first // or second operand, then it might still be profitable to move the shuffle // after the xor/and/or operation. if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) { ShuffleVectorSDNode *SVN0 = cast(N0); ShuffleVectorSDNode *SVN1 = cast(N1); assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() && "Inputs to shuffles are not the same type"); // Check that both shuffles use the same mask. The masks are known to be of // the same length because the result vector type is the same. // Check also that shuffles have only one use to avoid introducing extra // instructions. if (SVN0->hasOneUse() && SVN1->hasOneUse() && SVN0->getMask().equals(SVN1->getMask())) { SDValue ShOp = N0->getOperand(1); // Don't try to fold this node if it requires introducing a // build vector of all zeros that might be illegal at this stage. if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) { if (!LegalTypes) ShOp = DAG.getConstant(0, SDLoc(N), VT); else ShOp = SDValue(); } // (AND (shuf (A, C), shuf (B, C))) -> shuf (AND (A, B), C) // (OR (shuf (A, C), shuf (B, C))) -> shuf (OR (A, B), C) // (XOR (shuf (A, C), shuf (B, C))) -> shuf (XOR (A, B), V_0) if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) { SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0->getOperand(0), N1->getOperand(0)); AddToWorklist(NewNode.getNode()); return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp, SVN0->getMask()); } // Don't try to fold this node if it requires introducing a // build vector of all zeros that might be illegal at this stage. ShOp = N0->getOperand(0); if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) { if (!LegalTypes) ShOp = DAG.getConstant(0, SDLoc(N), VT); else ShOp = SDValue(); } // (AND (shuf (C, A), shuf (C, B))) -> shuf (C, AND (A, B)) // (OR (shuf (C, A), shuf (C, B))) -> shuf (C, OR (A, B)) // (XOR (shuf (C, A), shuf (C, B))) -> shuf (V_0, XOR (A, B)) if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) { SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0->getOperand(1), N1->getOperand(1)); AddToWorklist(NewNode.getNode()); return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode, SVN0->getMask()); } } } return SDValue(); } /// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient. SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1, const SDLoc &DL) { SDValue LL, LR, RL, RR, N0CC, N1CC; if (!isSetCCEquivalent(N0, LL, LR, N0CC) || !isSetCCEquivalent(N1, RL, RR, N1CC)) return SDValue(); assert(N0.getValueType() == N1.getValueType() && "Unexpected operand types for bitwise logic op"); assert(LL.getValueType() == LR.getValueType() && RL.getValueType() == RR.getValueType() && "Unexpected operand types for setcc"); // If we're here post-legalization or the logic op type is not i1, the logic // op type must match a setcc result type. Also, all folds require new // operations on the left and right operands, so those types must match. EVT VT = N0.getValueType(); EVT OpVT = LL.getValueType(); if (LegalOperations || VT.getScalarType() != MVT::i1) if (VT != getSetCCResultType(OpVT)) return SDValue(); if (OpVT != RL.getValueType()) return SDValue(); ISD::CondCode CC0 = cast(N0CC)->get(); ISD::CondCode CC1 = cast(N1CC)->get(); bool IsInteger = OpVT.isInteger(); if (LR == RR && CC0 == CC1 && IsInteger) { bool IsZero = isNullConstantOrNullSplatConstant(LR); bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR); // All bits clear? bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero; // All sign bits clear? bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1; // Any bits set? bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero; // Any sign bits set? bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero; // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0) // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1) // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0) // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0) if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) { SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL); AddToWorklist(Or.getNode()); return DAG.getSetCC(DL, VT, Or, LR, CC1); } // All bits set? bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1; // All sign bits set? bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero; // Any bits clear? bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1; // Any sign bits clear? bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1; // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1) // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0) // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1) // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1) if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) { SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL); AddToWorklist(And.getNode()); return DAG.getSetCC(DL, VT, And, LR, CC1); } } // TODO: What is the 'or' equivalent of this fold? // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2) if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 && IsInteger && CC0 == ISD::SETNE && ((isNullConstant(LR) && isAllOnesConstant(RR)) || (isAllOnesConstant(LR) && isNullConstant(RR)))) { SDValue One = DAG.getConstant(1, DL, OpVT); SDValue Two = DAG.getConstant(2, DL, OpVT); SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One); AddToWorklist(Add.getNode()); return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE); } // Try more general transforms if the predicates match and the only user of // the compares is the 'and' or 'or'. if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 && N0.hasOneUse() && N1.hasOneUse()) { // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0 // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0 if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) { SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR); SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR); SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR); SDValue Zero = DAG.getConstant(0, DL, OpVT); return DAG.getSetCC(DL, VT, Or, Zero, CC1); } } // Canonicalize equivalent operands to LL == RL. if (LL == RR && LR == RL) { CC1 = ISD::getSetCCSwappedOperands(CC1); std::swap(RL, RR); } // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC) if (LL == RL && LR == RR) { ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger) : ISD::getSetCCOrOperation(CC0, CC1, IsInteger); if (NewCC != ISD::SETCC_INVALID && (!LegalOperations || (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) && TLI.isOperationLegal(ISD::SETCC, OpVT)))) return DAG.getSetCC(DL, VT, LL, LR, NewCC); } return SDValue(); } /// This contains all DAGCombine rules which reduce two values combined by /// an And operation to a single value. This makes them reusable in the context /// of visitSELECT(). Rules involving constants are not included as /// visitSELECT() already handles those cases. SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) { EVT VT = N1.getValueType(); SDLoc DL(N); // fold (and x, undef) -> 0 if (N0.isUndef() || N1.isUndef()) return DAG.getConstant(0, DL, VT); if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL)) return V; if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL && VT.getSizeInBits() <= 64) { if (ConstantSDNode *ADDI = dyn_cast(N0.getOperand(1))) { if (ConstantSDNode *SRLI = dyn_cast(N1.getOperand(1))) { // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal // immediate for an add, but it is legal if its top c2 bits are set, // transform the ADD so the immediate doesn't need to be materialized // in a register. APInt ADDC = ADDI->getAPIntValue(); APInt SRLC = SRLI->getAPIntValue(); if (ADDC.getMinSignedBits() <= 64 && SRLC.ult(VT.getSizeInBits()) && !TLI.isLegalAddImmediate(ADDC.getSExtValue())) { APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), SRLC.getZExtValue()); if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) { ADDC |= Mask; if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) { SDLoc DL0(N0); SDValue NewAdd = DAG.getNode(ISD::ADD, DL0, VT, N0.getOperand(0), DAG.getConstant(ADDC, DL, VT)); CombineTo(N0.getNode(), NewAdd); // Return N so it doesn't get rechecked! return SDValue(N, 0); } } } } } } // Reduce bit extract of low half of an integer to the narrower type. // (and (srl i64:x, K), KMask) -> // (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask) if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { if (ConstantSDNode *CAnd = dyn_cast(N1)) { if (ConstantSDNode *CShift = dyn_cast(N0.getOperand(1))) { unsigned Size = VT.getSizeInBits(); const APInt &AndMask = CAnd->getAPIntValue(); unsigned ShiftBits = CShift->getZExtValue(); // Bail out, this node will probably disappear anyway. if (ShiftBits == 0) return SDValue(); unsigned MaskBits = AndMask.countTrailingOnes(); EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2); if (AndMask.isMask() && // Required bits must not span the two halves of the integer and // must fit in the half size type. (ShiftBits + MaskBits <= Size / 2) && TLI.isNarrowingProfitable(VT, HalfVT) && TLI.isTypeDesirableForOp(ISD::AND, HalfVT) && TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) && TLI.isTruncateFree(VT, HalfVT) && TLI.isZExtFree(HalfVT, VT)) { // The isNarrowingProfitable is to avoid regressions on PPC and // AArch64 which match a few 64-bit bit insert / bit extract patterns // on downstream users of this. Those patterns could probably be // extended to handle extensions mixed in. SDValue SL(N0); assert(MaskBits <= Size); // Extracting the highest bit of the low half. EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout()); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT, N0.getOperand(0)); SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT); SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT); SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK); SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask); return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And); } } } } return SDValue(); } bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN, EVT LoadResultTy, EVT &ExtVT) { if (!AndC->getAPIntValue().isMask()) return false; unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes(); ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); EVT LoadedVT = LoadN->getMemoryVT(); if (ExtVT == LoadedVT && (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) { // ZEXTLOAD will match without needing to change the size of the value being // loaded. return true; } // Do not change the width of a volatile load. if (LoadN->isVolatile()) return false; // Do not generate loads of non-round integer types since these can // be expensive (and would be wrong if the type is not byte sized). if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound()) return false; if (LegalOperations && !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT)) return false; if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT)) return false; return true; } bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST, ISD::LoadExtType ExtType, EVT &MemVT, unsigned ShAmt) { if (!LDST) return false; // Only allow byte offsets. if (ShAmt % 8) return false; // Do not generate loads of non-round integer types since these can // be expensive (and would be wrong if the type is not byte sized). if (!MemVT.isRound()) return false; // Don't change the width of a volatile load. if (LDST->isVolatile()) return false; // Verify that we are actually reducing a load width here. if (LDST->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits()) return false; // Ensure that this isn't going to produce an unsupported unaligned access. if (ShAmt && !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, LDST->getAddressSpace(), ShAmt / 8)) return false; // It's not possible to generate a constant of extended or untyped type. EVT PtrType = LDST->getBasePtr().getValueType(); if (PtrType == MVT::Untyped || PtrType.isExtended()) return false; if (isa(LDST)) { LoadSDNode *Load = cast(LDST); // Don't transform one with multiple uses, this would require adding a new // load. if (!SDValue(Load, 0).hasOneUse()) return false; if (LegalOperations && !TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT)) return false; // For the transform to be legal, the load must produce only two values // (the value loaded and the chain). Don't transform a pre-increment // load, for example, which produces an extra value. Otherwise the // transformation is not equivalent, and the downstream logic to replace // uses gets things wrong. if (Load->getNumValues() > 2) return false; // If the load that we're shrinking is an extload and we're not just // discarding the extension we can't simply shrink the load. Bail. // TODO: It would be possible to merge the extensions in some cases. if (Load->getExtensionType() != ISD::NON_EXTLOAD && Load->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt) return false; if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT)) return false; } else { assert(isa(LDST) && "It is not a Load nor a Store SDNode"); StoreSDNode *Store = cast(LDST); // Can't write outside the original store if (Store->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt) return false; if (LegalOperations && !TLI.isTruncStoreLegal(Store->getValue().getValueType(), MemVT)) return false; } return true; } bool DAGCombiner::SearchForAndLoads(SDNode *N, SmallPtrSetImpl &Loads, SmallPtrSetImpl &NodesWithConsts, ConstantSDNode *Mask, SDNode *&NodeToMask) { // Recursively search for the operands, looking for loads which can be // narrowed. for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) { SDValue Op = N->getOperand(i); if (Op.getValueType().isVector()) return false; // Some constants may need fixing up later if they are too large. if (auto *C = dyn_cast(Op)) { if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) && (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue()) NodesWithConsts.insert(N); continue; } if (!Op.hasOneUse()) return false; switch(Op.getOpcode()) { case ISD::LOAD: { auto *Load = cast(Op); EVT ExtVT; if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) && isLegalNarrowLdSt(Load, ISD::ZEXTLOAD, ExtVT)) { // ZEXTLOAD is already small enough. if (Load->getExtensionType() == ISD::ZEXTLOAD && ExtVT.bitsGE(Load->getMemoryVT())) continue; // Use LE to convert equal sized loads to zext. if (ExtVT.bitsLE(Load->getMemoryVT())) Loads.insert(Load); continue; } return false; } case ISD::ZERO_EXTEND: case ISD::AssertZext: { unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes(); EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); EVT VT = Op.getOpcode() == ISD::AssertZext ? cast(Op.getOperand(1))->getVT() : Op.getOperand(0).getValueType(); // We can accept extending nodes if the mask is wider or an equal // width to the original type. if (ExtVT.bitsGE(VT)) continue; break; } case ISD::OR: case ISD::XOR: case ISD::AND: if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask, NodeToMask)) return false; continue; } // Allow one node which will masked along with any loads found. if (NodeToMask) return false; // Also ensure that the node to be masked only produces one data result. NodeToMask = Op.getNode(); if (NodeToMask->getNumValues() > 1) { bool HasValue = false; for (unsigned i = 0, e = NodeToMask->getNumValues(); i < e; ++i) { MVT VT = SDValue(NodeToMask, i).getSimpleValueType(); if (VT != MVT::Glue && VT != MVT::Other) { if (HasValue) { NodeToMask = nullptr; return false; } HasValue = true; } } assert(HasValue && "Node to be masked has no data result?"); } } return true; } bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) { auto *Mask = dyn_cast(N->getOperand(1)); if (!Mask) return false; if (!Mask->getAPIntValue().isMask()) return false; // No need to do anything if the and directly uses a load. if (isa(N->getOperand(0))) return false; SmallPtrSet Loads; SmallPtrSet NodesWithConsts; SDNode *FixupNode = nullptr; if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) { if (Loads.size() == 0) return false; LLVM_DEBUG(dbgs() << "Backwards propagate AND: "; N->dump()); SDValue MaskOp = N->getOperand(1); // If it exists, fixup the single node we allow in the tree that needs // masking. if (FixupNode) { LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump()); SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode), FixupNode->getValueType(0), SDValue(FixupNode, 0), MaskOp); DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And); if (And.getOpcode() == ISD ::AND) DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp); } // Narrow any constants that need it. for (auto *LogicN : NodesWithConsts) { SDValue Op0 = LogicN->getOperand(0); SDValue Op1 = LogicN->getOperand(1); if (isa(Op0)) std::swap(Op0, Op1); SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(), Op1, MaskOp); DAG.UpdateNodeOperands(LogicN, Op0, And); } // Create narrow loads. for (auto *Load : Loads) { LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump()); SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0), SDValue(Load, 0), MaskOp); DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And); if (And.getOpcode() == ISD ::AND) And = SDValue( DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0); SDValue NewLoad = ReduceLoadWidth(And.getNode()); assert(NewLoad && "Shouldn't be masking the load if it can't be narrowed"); CombineTo(Load, NewLoad, NewLoad.getValue(1)); } DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode()); return true; } return false; } // Unfold // x & (-1 'logical shift' y) // To // (x 'opposite logical shift' y) 'logical shift' y // if it is better for performance. SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) { assert(N->getOpcode() == ISD::AND); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Do we actually prefer shifts over mask? if (!TLI.preferShiftsToClearExtremeBits(N0)) return SDValue(); // Try to match (-1 '[outer] logical shift' y) unsigned OuterShift; unsigned InnerShift; // The opposite direction to the OuterShift. SDValue Y; // Shift amount. auto matchMask = [&OuterShift, &InnerShift, &Y](SDValue M) -> bool { if (!M.hasOneUse()) return false; OuterShift = M->getOpcode(); if (OuterShift == ISD::SHL) InnerShift = ISD::SRL; else if (OuterShift == ISD::SRL) InnerShift = ISD::SHL; else return false; if (!isAllOnesConstant(M->getOperand(0))) return false; Y = M->getOperand(1); return true; }; SDValue X; if (matchMask(N1)) X = N0; else if (matchMask(N0)) X = N1; else return SDValue(); SDLoc DL(N); EVT VT = N->getValueType(0); // tmp = x 'opposite logical shift' y SDValue T0 = DAG.getNode(InnerShift, DL, VT, X, Y); // ret = tmp 'logical shift' y SDValue T1 = DAG.getNode(OuterShift, DL, VT, T0, Y); return T1; } SDValue DAGCombiner::visitAND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N1.getValueType(); // x & x --> x if (N0 == N1) return N0; // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (and x, 0) -> 0, vector edition if (ISD::isBuildVectorAllZeros(N0.getNode())) // do not return N0, because undef node may exist in N0 return DAG.getConstant(APInt::getNullValue(N0.getScalarValueSizeInBits()), SDLoc(N), N0.getValueType()); if (ISD::isBuildVectorAllZeros(N1.getNode())) // do not return N1, because undef node may exist in N1 return DAG.getConstant(APInt::getNullValue(N1.getScalarValueSizeInBits()), SDLoc(N), N1.getValueType()); // fold (and x, -1) -> x, vector edition if (ISD::isBuildVectorAllOnes(N0.getNode())) return N1; if (ISD::isBuildVectorAllOnes(N1.getNode())) return N0; } // fold (and c1, c2) -> c1&c2 ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); ConstantSDNode *N1C = isConstOrConstSplat(N1); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, N0C, N1C); // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0); // fold (and x, -1) -> x if (isAllOnesConstant(N1)) return N0; // if (and x, c) is known to be zero, return 0 unsigned BitWidth = VT.getScalarSizeInBits(); if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(BitWidth))) return DAG.getConstant(0, SDLoc(N), VT); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // reassociate and if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1)) return RAND; // Try to convert a constant mask AND into a shuffle clear mask. if (VT.isVector()) if (SDValue Shuffle = XformToShuffleWithZero(N)) return Shuffle; // fold (and (or x, C), D) -> D if (C & D) == D auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue()); }; if (N0.getOpcode() == ISD::OR && ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset)) return N1; // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits. if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { SDValue N0Op0 = N0.getOperand(0); APInt Mask = ~N1C->getAPIntValue(); Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits()); if (DAG.MaskedValueIsZero(N0Op0, Mask)) { SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N0.getValueType(), N0Op0); // Replace uses of the AND with uses of the Zero extend node. CombineTo(N, Zext); // We actually want to replace all uses of the any_extend with the // zero_extend, to avoid duplicating things. This will later cause this // AND to be folded. CombineTo(N0.getNode(), Zext); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) -> // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must // already be zero by virtue of the width of the base type of the load. // // the 'X' node here can either be nothing or an extract_vector_elt to catch // more cases. if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() && N0.getOperand(0).getOpcode() == ISD::LOAD && N0.getOperand(0).getResNo() == 0) || (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) { LoadSDNode *Load = cast( (N0.getOpcode() == ISD::LOAD) ? N0 : N0.getOperand(0) ); // Get the constant (if applicable) the zero'th operand is being ANDed with. // This can be a pure constant or a vector splat, in which case we treat the // vector as a scalar and use the splat value. APInt Constant = APInt::getNullValue(1); if (const ConstantSDNode *C = dyn_cast(N1)) { Constant = C->getAPIntValue(); } else if (BuildVectorSDNode *Vector = dyn_cast(N1)) { APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs); if (IsSplat) { // Undef bits can contribute to a possible optimisation if set, so // set them. SplatValue |= SplatUndef; // The splat value may be something like "0x00FFFFFF", which means 0 for // the first vector value and FF for the rest, repeating. We need a mask // that will apply equally to all members of the vector, so AND all the // lanes of the constant together. EVT VT = Vector->getValueType(0); unsigned BitWidth = VT.getScalarSizeInBits(); // If the splat value has been compressed to a bitlength lower // than the size of the vector lane, we need to re-expand it to // the lane size. if (BitWidth > SplatBitSize) for (SplatValue = SplatValue.zextOrTrunc(BitWidth); SplatBitSize < BitWidth; SplatBitSize = SplatBitSize * 2) SplatValue |= SplatValue.shl(SplatBitSize); // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a // multiple of 'BitWidth'. Otherwise, we could propagate a wrong value. if (SplatBitSize % BitWidth == 0) { Constant = APInt::getAllOnesValue(BitWidth); for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i) Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth); } } } // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is // actually legal and isn't going to get expanded, else this is a false // optimisation. bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD, Load->getValueType(0), Load->getMemoryVT()); // Resize the constant to the same size as the original memory access before // extension. If it is still the AllOnesValue then this AND is completely // unneeded. Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits()); bool B; switch (Load->getExtensionType()) { default: B = false; break; case ISD::EXTLOAD: B = CanZextLoadProfitably; break; case ISD::ZEXTLOAD: case ISD::NON_EXTLOAD: B = true; break; } if (B && Constant.isAllOnesValue()) { // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to // preserve semantics once we get rid of the AND. SDValue NewLoad(Load, 0); // Fold the AND away. NewLoad may get replaced immediately. CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0); if (Load->getExtensionType() == ISD::EXTLOAD) { NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD, Load->getValueType(0), SDLoc(Load), Load->getChain(), Load->getBasePtr(), Load->getOffset(), Load->getMemoryVT(), Load->getMemOperand()); // Replace uses of the EXTLOAD with the new ZEXTLOAD. if (Load->getNumValues() == 3) { // PRE/POST_INC loads have 3 values. SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1), NewLoad.getValue(2) }; CombineTo(Load, To, 3, true); } else { CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1)); } } return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (and (load x), 255) -> (zextload x, i8) // fold (and (extload x, i16), 255) -> (zextload x, i8) // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8) if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD || (N0.getOpcode() == ISD::ANY_EXTEND && N0.getOperand(0).getOpcode() == ISD::LOAD))) { if (SDValue Res = ReduceLoadWidth(N)) { LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND ? cast(N0.getOperand(0)) : cast(N0); AddToWorklist(N); CombineTo(LN0, Res, Res.getValue(1)); return SDValue(N, 0); } } if (Level >= AfterLegalizeTypes) { // Attempt to propagate the AND back up to the leaves which, if they're // loads, can be combined to narrow loads and the AND node can be removed. // Perform after legalization so that extend nodes will already be // combined into the loads. if (BackwardsPropagateMask(N, DAG)) { return SDValue(N, 0); } } if (SDValue Combined = visitANDLike(N0, N1, N)) return Combined; // Simplify: (and (op x...), (op y...)) -> (op (and x, y)) if (N0.getOpcode() == N1.getOpcode()) if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) return Tmp; // Masking the negated extension of a boolean is just the zero-extended // boolean: // and (sub 0, zext(bool X)), 1 --> zext(bool X) // and (sub 0, sext(bool X)), 1 --> zext(bool X) // // Note: the SimplifyDemandedBits fold below can make an information-losing // transform, and then we have no way to find this better fold. if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) { if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) { SDValue SubRHS = N0.getOperand(1); if (SubRHS.getOpcode() == ISD::ZERO_EXTEND && SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) return SubRHS; if (SubRHS.getOpcode() == ISD::SIGN_EXTEND && SubRHS.getOperand(0).getScalarValueSizeInBits() == 1) return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0)); } } // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1) // fold (and (sra)) -> (and (srl)) when possible. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (zext_inreg (extload x)) -> (zextload x) if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); // If we zero all the possible extended bits, then we can turn this into // a zextload if we are running before legalize or the operation is legal. unsigned BitWidth = N1.getScalarValueSizeInBits(); if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, BitWidth - MemVT.getScalarSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); AddToWorklist(N); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use if (ISD::isSEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); // If we zero all the possible extended bits, then we can turn this into // a zextload if we are running before legalize or the operation is legal. unsigned BitWidth = N1.getScalarValueSizeInBits(); if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth, BitWidth - MemVT.getScalarSizeInBits())) && ((!LegalOperations && !LN0->isVolatile()) || TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); AddToWorklist(N); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const) if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) { if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), N0.getOperand(1), false)) return BSwap; } if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N)) return Shifts; return SDValue(); } /// Match (a >> 8) | (a << 8) as (bswap a) >> 16. SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1, bool DemandHighBits) { if (!LegalOperations) return SDValue(); EVT VT = N->getValueType(0); if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16) return SDValue(); if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) return SDValue(); // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff) bool LookPassAnd0 = false; bool LookPassAnd1 = false; if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL) std::swap(N0, N1); if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() == ISD::AND) { if (!N0.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N01C = dyn_cast(N0.getOperand(1)); // Also handle 0xffff since the LHS is guaranteed to have zeros there. // This is needed for X86. if (!N01C || (N01C->getZExtValue() != 0xFF00 && N01C->getZExtValue() != 0xFFFF)) return SDValue(); N0 = N0.getOperand(0); LookPassAnd0 = true; } if (N1.getOpcode() == ISD::AND) { if (!N1.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N11C || N11C->getZExtValue() != 0xFF) return SDValue(); N1 = N1.getOperand(0); LookPassAnd1 = true; } if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL) std::swap(N0, N1); if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL) return SDValue(); if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N01C = dyn_cast(N0.getOperand(1)); ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N01C || !N11C) return SDValue(); if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8) return SDValue(); // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8) SDValue N00 = N0->getOperand(0); if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) { if (!N00.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N001C = dyn_cast(N00.getOperand(1)); if (!N001C || N001C->getZExtValue() != 0xFF) return SDValue(); N00 = N00.getOperand(0); LookPassAnd0 = true; } SDValue N10 = N1->getOperand(0); if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) { if (!N10.getNode()->hasOneUse()) return SDValue(); ConstantSDNode *N101C = dyn_cast(N10.getOperand(1)); // Also allow 0xFFFF since the bits will be shifted out. This is needed // for X86. if (!N101C || (N101C->getZExtValue() != 0xFF00 && N101C->getZExtValue() != 0xFFFF)) return SDValue(); N10 = N10.getOperand(0); LookPassAnd1 = true; } if (N00 != N10) return SDValue(); // Make sure everything beyond the low halfword gets set to zero since the SRL // 16 will clear the top bits. unsigned OpSizeInBits = VT.getSizeInBits(); if (DemandHighBits && OpSizeInBits > 16) { // If the left-shift isn't masked out then the only way this is a bswap is // if all bits beyond the low 8 are 0. In that case the entire pattern // reduces to a left shift anyway: leave it for other parts of the combiner. if (!LookPassAnd0) return SDValue(); // However, if the right shift isn't masked out then it might be because // it's not needed. See if we can spot that too. if (!LookPassAnd1 && !DAG.MaskedValueIsZero( N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16))) return SDValue(); } SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00); if (OpSizeInBits > 16) { SDLoc DL(N); Res = DAG.getNode(ISD::SRL, DL, VT, Res, DAG.getConstant(OpSizeInBits - 16, DL, getShiftAmountTy(VT))); } return Res; } /// Return true if the specified node is an element that makes up a 32-bit /// packed halfword byteswap. /// ((x & 0x000000ff) << 8) | /// ((x & 0x0000ff00) >> 8) | /// ((x & 0x00ff0000) << 8) | /// ((x & 0xff000000) >> 8) static bool isBSwapHWordElement(SDValue N, MutableArrayRef Parts) { if (!N.getNode()->hasOneUse()) return false; unsigned Opc = N.getOpcode(); if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL) return false; SDValue N0 = N.getOperand(0); unsigned Opc0 = N0.getOpcode(); if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL) return false; ConstantSDNode *N1C = nullptr; // SHL or SRL: look upstream for AND mask operand if (Opc == ISD::AND) N1C = dyn_cast(N.getOperand(1)); else if (Opc0 == ISD::AND) N1C = dyn_cast(N0.getOperand(1)); if (!N1C) return false; unsigned MaskByteOffset; switch (N1C->getZExtValue()) { default: return false; case 0xFF: MaskByteOffset = 0; break; case 0xFF00: MaskByteOffset = 1; break; case 0xFFFF: // In case demanded bits didn't clear the bits that will be shifted out. // This is needed for X86. if (Opc == ISD::SRL || (Opc == ISD::AND && Opc0 == ISD::SHL)) { MaskByteOffset = 1; break; } return false; case 0xFF0000: MaskByteOffset = 2; break; case 0xFF000000: MaskByteOffset = 3; break; } // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00). if (Opc == ISD::AND) { if (MaskByteOffset == 0 || MaskByteOffset == 2) { // (x >> 8) & 0xff // (x >> 8) & 0xff0000 if (Opc0 != ISD::SRL) return false; ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } else { // (x << 8) & 0xff00 // (x << 8) & 0xff000000 if (Opc0 != ISD::SHL) return false; ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } } else if (Opc == ISD::SHL) { // (x & 0xff) << 8 // (x & 0xff0000) << 8 if (MaskByteOffset != 0 && MaskByteOffset != 2) return false; ConstantSDNode *C = dyn_cast(N.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } else { // Opc == ISD::SRL // (x & 0xff00) >> 8 // (x & 0xff000000) >> 8 if (MaskByteOffset != 1 && MaskByteOffset != 3) return false; ConstantSDNode *C = dyn_cast(N.getOperand(1)); if (!C || C->getZExtValue() != 8) return false; } if (Parts[MaskByteOffset]) return false; Parts[MaskByteOffset] = N0.getOperand(0).getNode(); return true; } /// Match a 32-bit packed halfword bswap. That is /// ((x & 0x000000ff) << 8) | /// ((x & 0x0000ff00) >> 8) | /// ((x & 0x00ff0000) << 8) | /// ((x & 0xff000000) >> 8) /// => (rotl (bswap x), 16) SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { if (!LegalOperations) return SDValue(); EVT VT = N->getValueType(0); if (VT != MVT::i32) return SDValue(); if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT)) return SDValue(); // Look for either // (or (or (and), (and)), (or (and), (and))) // (or (or (or (and), (and)), (and)), (and)) if (N0.getOpcode() != ISD::OR) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); SDNode *Parts[4] = {}; if (N1.getOpcode() == ISD::OR && N00.getNumOperands() == 2 && N01.getNumOperands() == 2) { // (or (or (and), (and)), (or (and), (and))) if (!isBSwapHWordElement(N00, Parts)) return SDValue(); if (!isBSwapHWordElement(N01, Parts)) return SDValue(); SDValue N10 = N1.getOperand(0); if (!isBSwapHWordElement(N10, Parts)) return SDValue(); SDValue N11 = N1.getOperand(1); if (!isBSwapHWordElement(N11, Parts)) return SDValue(); } else { // (or (or (or (and), (and)), (and)), (and)) if (!isBSwapHWordElement(N1, Parts)) return SDValue(); if (!isBSwapHWordElement(N01, Parts)) return SDValue(); if (N00.getOpcode() != ISD::OR) return SDValue(); SDValue N000 = N00.getOperand(0); if (!isBSwapHWordElement(N000, Parts)) return SDValue(); SDValue N001 = N00.getOperand(1); if (!isBSwapHWordElement(N001, Parts)) return SDValue(); } // Make sure the parts are all coming from the same node. if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3]) return SDValue(); SDLoc DL(N); SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, SDValue(Parts[0], 0)); // Result of the bswap should be rotated by 16. If it's not legal, then // do (x << 16) | (x >> 16). SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT)); if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT)) return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt); if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT)) return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt); return DAG.getNode(ISD::OR, DL, VT, DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt), DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt)); } /// This contains all DAGCombine rules which reduce two values combined by /// an Or operation to a single value \see visitANDLike(). SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) { EVT VT = N1.getValueType(); SDLoc DL(N); // fold (or x, undef) -> -1 if (!LegalOperations && (N0.isUndef() || N1.isUndef())) return DAG.getAllOnesConstant(DL, VT); if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL)) return V; // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible. if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && // Don't increase # computations. (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { // We can only do this xform if we know that bits from X that are set in C2 // but not in C1 are already zero. Likewise for Y. if (const ConstantSDNode *N0O1C = getAsNonOpaqueConstant(N0.getOperand(1))) { if (const ConstantSDNode *N1O1C = getAsNonOpaqueConstant(N1.getOperand(1))) { // We can only do this xform if we know that bits from X that are set in // C2 but not in C1 are already zero. Likewise for Y. const APInt &LHSMask = N0O1C->getAPIntValue(); const APInt &RHSMask = N1O1C->getAPIntValue(); if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) && DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1.getOperand(0)); return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(LHSMask | RHSMask, DL, VT)); } } } } // (or (and X, M), (and X, N)) -> (and X, (or M, N)) if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND && N0.getOperand(0) == N1.getOperand(0) && // Don't increase # computations. (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) { SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(1), N1.getOperand(1)); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X); } return SDValue(); } SDValue DAGCombiner::visitOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N1.getValueType(); // x | x --> x if (N0 == N1) return N0; // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (or x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N0.getNode())) return N1; if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; // fold (or x, -1) -> -1, vector edition if (ISD::isBuildVectorAllOnes(N0.getNode())) // do not return N0, because undef node may exist in N0 return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType()); if (ISD::isBuildVectorAllOnes(N1.getNode())) // do not return N1, because undef node may exist in N1 return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType()); // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask) // Do this only if the resulting shuffle is legal. if (isa(N0) && isa(N1) && // Avoid folding a node with illegal type. TLI.isTypeLegal(VT)) { bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode()); bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()); bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode()); // Ensure both shuffles have a zero input. if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) { assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!"); assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!"); const ShuffleVectorSDNode *SV0 = cast(N0); const ShuffleVectorSDNode *SV1 = cast(N1); bool CanFold = true; int NumElts = VT.getVectorNumElements(); SmallVector Mask(NumElts); for (int i = 0; i != NumElts; ++i) { int M0 = SV0->getMaskElt(i); int M1 = SV1->getMaskElt(i); // Determine if either index is pointing to a zero vector. bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts)); bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts)); // If one element is zero and the otherside is undef, keep undef. // This also handles the case that both are undef. if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) { Mask[i] = -1; continue; } // Make sure only one of the elements is zero. if (M0Zero == M1Zero) { CanFold = false; break; } assert((M0 >= 0 || M1 >= 0) && "Undef index!"); // We have a zero and non-zero element. If the non-zero came from // SV0 make the index a LHS index. If it came from SV1, make it // a RHS index. We need to mod by NumElts because we don't care // which operand it came from in the original shuffles. Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts; } if (CanFold) { SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0); SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0); bool LegalMask = TLI.isShuffleMaskLegal(Mask, VT); if (!LegalMask) { std::swap(NewLHS, NewRHS); ShuffleVectorSDNode::commuteMask(Mask); LegalMask = TLI.isShuffleMaskLegal(Mask, VT); } if (LegalMask) return DAG.getVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS, Mask); } } } } // fold (or c1, c2) -> c1|c2 ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, N0C, N1C); // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0); // fold (or x, 0) -> x if (isNullConstant(N1)) return N0; // fold (or x, -1) -> -1 if (isAllOnesConstant(N1)) return N1; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (or x, c) -> c iff (x & ~c) == 0 if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue())) return N1; if (SDValue Combined = visitORLike(N0, N1, N)) return Combined; // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16) if (SDValue BSwap = MatchBSwapHWord(N, N0, N1)) return BSwap; if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1)) return BSwap; // reassociate or if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1)) return ROR; // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2) // iff (c1 & c2) != 0. auto MatchIntersect = [](ConstantSDNode *LHS, ConstantSDNode *RHS) { return LHS->getAPIntValue().intersects(RHS->getAPIntValue()); }; if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect)) { if (SDValue COR = DAG.FoldConstantArithmetic( ISD::OR, SDLoc(N1), VT, N1.getNode(), N0.getOperand(1).getNode())) { SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1); AddToWorklist(IOR.getNode()); return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR); } } // Simplify: (or (op x...), (op y...)) -> (op (or x, y)) if (N0.getOpcode() == N1.getOpcode()) if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) return Tmp; // See if this is some rotate idiom. if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N))) return SDValue(Rot, 0); if (SDValue Load = MatchLoadCombine(N)) return Load; // Simplify the operands using demanded-bits information. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); return SDValue(); } static SDValue stripConstantMask(SelectionDAG &DAG, SDValue Op, SDValue &Mask) { if (Op.getOpcode() == ISD::AND && DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) { Mask = Op.getOperand(1); return Op.getOperand(0); } return Op; } /// Match "(X shl/srl V1) & V2" where V2 may not be present. static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift, SDValue &Mask) { Op = stripConstantMask(DAG, Op, Mask); if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) { Shift = Op; return true; } return false; } /// Helper function for visitOR to extract the needed side of a rotate idiom /// from a shl/srl/mul/udiv. This is meant to handle cases where /// InstCombine merged some outside op with one of the shifts from /// the rotate pattern. /// \returns An empty \c SDValue if the needed shift couldn't be extracted. /// Otherwise, returns an expansion of \p ExtractFrom based on the following /// patterns: /// /// (or (mul v c0) (shrl (mul v c1) c2)): /// expands (mul v c0) -> (shl (mul v c1) c3) /// /// (or (udiv v c0) (shl (udiv v c1) c2)): /// expands (udiv v c0) -> (shrl (udiv v c1) c3) /// /// (or (shl v c0) (shrl (shl v c1) c2)): /// expands (shl v c0) -> (shl (shl v c1) c3) /// /// (or (shrl v c0) (shl (shrl v c1) c2)): /// expands (shrl v c0) -> (shrl (shrl v c1) c3) /// /// Such that in all cases, c3+c2==bitwidth(op v c1). static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift, SDValue ExtractFrom, SDValue &Mask, const SDLoc &DL) { assert(OppShift && ExtractFrom && "Empty SDValue"); assert( (OppShift.getOpcode() == ISD::SHL || OppShift.getOpcode() == ISD::SRL) && "Existing shift must be valid as a rotate half"); ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask); // Preconditions: // (or (op0 v c0) (shiftl/r (op0 v c1) c2)) // // Find opcode of the needed shift to be extracted from (op0 v c0). unsigned Opcode = ISD::DELETED_NODE; bool IsMulOrDiv = false; // Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift // opcode or its arithmetic (mul or udiv) variant. auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) { IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant; if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift) return false; Opcode = NeededShift; return true; }; // op0 must be either the needed shift opcode or the mul/udiv equivalent // that the needed shift can be extracted from. if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) && (OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV))) return SDValue(); // op0 must be the same opcode on both sides, have the same LHS argument, // and produce the same value type. SDValue OppShiftLHS = OppShift.getOperand(0); EVT ShiftedVT = OppShiftLHS.getValueType(); if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() || OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) || ShiftedVT != ExtractFrom.getValueType()) return SDValue(); // Amount of the existing shift. ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1)); // Constant mul/udiv/shift amount from the RHS of the shift's LHS op. ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1)); // Constant mul/udiv/shift amount from the RHS of the ExtractFrom op. ConstantSDNode *ExtractFromCst = isConstOrConstSplat(ExtractFrom.getOperand(1)); // TODO: We should be able to handle non-uniform constant vectors for these values // Check that we have constant values. if (!OppShiftCst || !OppShiftCst->getAPIntValue() || !OppLHSCst || !OppLHSCst->getAPIntValue() || !ExtractFromCst || !ExtractFromCst->getAPIntValue()) return SDValue(); // Compute the shift amount we need to extract to complete the rotate. const unsigned VTWidth = ShiftedVT.getScalarSizeInBits(); APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue(); if (NeededShiftAmt.isNegative()) return SDValue(); // Normalize the bitwidth of the two mul/udiv/shift constant operands. APInt ExtractFromAmt = ExtractFromCst->getAPIntValue(); APInt OppLHSAmt = OppLHSCst->getAPIntValue(); zeroExtendToMatch(ExtractFromAmt, OppLHSAmt); // Now try extract the needed shift from the ExtractFrom op and see if the // result matches up with the existing shift's LHS op. if (IsMulOrDiv) { // Op to extract from is a mul or udiv by a constant. // Check: // c2 / (1 << (bitwidth(op0 v c0) - c1)) == c0 // c2 % (1 << (bitwidth(op0 v c0) - c1)) == 0 const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(), NeededShiftAmt.getZExtValue()); APInt ResultAmt; APInt Rem; APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem); if (Rem != 0 || ResultAmt != OppLHSAmt) return SDValue(); } else { // Op to extract from is a shift by a constant. // Check: // c2 - (bitwidth(op0 v c0) - c1) == c0 if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc( ExtractFromAmt.getBitWidth())) return SDValue(); } // Return the expanded shift op that should allow a rotate to be formed. EVT ShiftVT = OppShift.getOperand(1).getValueType(); EVT ResVT = ExtractFrom.getValueType(); SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT); return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode); } // Return true if we can prove that, whenever Neg and Pos are both in the // range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that // for two opposing shifts shift1 and shift2 and a value X with OpBits bits: // // (or (shift1 X, Neg), (shift2 X, Pos)) // // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate // in direction shift1 by Neg. The range [0, EltSize) means that we only need // to consider shift amounts with defined behavior. static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize, SelectionDAG &DAG) { // If EltSize is a power of 2 then: // // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1) // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize). // // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check // for the stronger condition: // // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A] // // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1) // we can just replace Neg with Neg' for the rest of the function. // // In other cases we check for the even stronger condition: // // Neg == EltSize - Pos [B] // // for all Neg and Pos. Note that the (or ...) then invokes undefined // behavior if Pos == 0 (and consequently Neg == EltSize). // // We could actually use [A] whenever EltSize is a power of 2, but the // only extra cases that it would match are those uninteresting ones // where Neg and Pos are never in range at the same time. E.g. for // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos) // as well as (sub 32, Pos), but: // // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos)) // // always invokes undefined behavior for 32-bit X. // // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise. unsigned MaskLoBits = 0; if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) { if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) { KnownBits Known; DAG.computeKnownBits(Neg.getOperand(0), Known); unsigned Bits = Log2_64(EltSize); if (NegC->getAPIntValue().getActiveBits() <= Bits && ((NegC->getAPIntValue() | Known.Zero).countTrailingOnes() >= Bits)) { Neg = Neg.getOperand(0); MaskLoBits = Bits; } } } // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1. if (Neg.getOpcode() != ISD::SUB) return false; ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0)); if (!NegC) return false; SDValue NegOp1 = Neg.getOperand(1); // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with // Pos'. The truncation is redundant for the purpose of the equality. if (MaskLoBits && Pos.getOpcode() == ISD::AND) { if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) { KnownBits Known; DAG.computeKnownBits(Pos.getOperand(0), Known); if (PosC->getAPIntValue().getActiveBits() <= MaskLoBits && ((PosC->getAPIntValue() | Known.Zero).countTrailingOnes() >= MaskLoBits)) Pos = Pos.getOperand(0); } } // The condition we need is now: // // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask // // If NegOp1 == Pos then we need: // // EltSize & Mask == NegC & Mask // // (because "x & Mask" is a truncation and distributes through subtraction). APInt Width; if (Pos == NegOp1) Width = NegC->getAPIntValue(); // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC. // Then the condition we want to prove becomes: // // (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask // // which, again because "x & Mask" is a truncation, becomes: // // NegC & Mask == (EltSize - PosC) & Mask // EltSize & Mask == (NegC + PosC) & Mask else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) { if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1))) Width = PosC->getAPIntValue() + NegC->getAPIntValue(); else return false; } else return false; // Now we just need to check that EltSize & Mask == Width & Mask. if (MaskLoBits) // EltSize & Mask is 0 since Mask is EltSize - 1. return Width.getLoBits(MaskLoBits) == 0; return Width == EltSize; } // A subroutine of MatchRotate used once we have found an OR of two opposite // shifts of Shifted. If Neg == - Pos then the OR reduces // to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the // former being preferred if supported. InnerPos and InnerNeg are Pos and // Neg with outer conversions stripped away. SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg, SDValue InnerPos, SDValue InnerNeg, unsigned PosOpcode, unsigned NegOpcode, const SDLoc &DL) { // fold (or (shl x, (*ext y)), // (srl x, (*ext (sub 32, y)))) -> // (rotl x, y) or (rotr x, (sub 32, y)) // // fold (or (shl x, (*ext (sub 32, y))), // (srl x, (*ext y))) -> // (rotr x, y) or (rotl x, (sub 32, y)) EVT VT = Shifted.getValueType(); if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) { bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT); return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted, HasPos ? Pos : Neg).getNode(); } return nullptr; } // MatchRotate - Handle an 'or' of two operands. If this is one of the many // idioms for rotate, and if the target supports rotation instructions, generate // a rot[lr]. SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) { // Must be a legal type. Expanded 'n promoted things won't work with rotates. EVT VT = LHS.getValueType(); if (!TLI.isTypeLegal(VT)) return nullptr; // The target must have at least one rotate flavor. bool HasROTL = hasOperation(ISD::ROTL, VT); bool HasROTR = hasOperation(ISD::ROTR, VT); if (!HasROTL && !HasROTR) return nullptr; // Check for truncated rotate. if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE && LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) { assert(LHS.getValueType() == RHS.getValueType()); if (SDNode *Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) { return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), SDValue(Rot, 0)).getNode(); } } // Match "(X shl/srl V1) & V2" where V2 may not be present. SDValue LHSShift; // The shift. SDValue LHSMask; // AND value if any. matchRotateHalf(DAG, LHS, LHSShift, LHSMask); SDValue RHSShift; // The shift. SDValue RHSMask; // AND value if any. matchRotateHalf(DAG, RHS, RHSShift, RHSMask); // If neither side matched a rotate half, bail if (!LHSShift && !RHSShift) return nullptr; // InstCombine may have combined a constant shl, srl, mul, or udiv with one // side of the rotate, so try to handle that here. In all cases we need to // pass the matched shift from the opposite side to compute the opcode and // needed shift amount to extract. We still want to do this if both sides // matched a rotate half because one half may be a potential overshift that // can be broken down (ie if InstCombine merged two shl or srl ops into a // single one). // Have LHS side of the rotate, try to extract the needed shift from the RHS. if (LHSShift) if (SDValue NewRHSShift = extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL)) RHSShift = NewRHSShift; // Have RHS side of the rotate, try to extract the needed shift from the LHS. if (RHSShift) if (SDValue NewLHSShift = extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL)) LHSShift = NewLHSShift; // If a side is still missing, nothing else we can do. if (!RHSShift || !LHSShift) return nullptr; // At this point we've matched or extracted a shift op on each side. if (LHSShift.getOperand(0) != RHSShift.getOperand(0)) return nullptr; // Not shifting the same value. if (LHSShift.getOpcode() == RHSShift.getOpcode()) return nullptr; // Shifts must disagree. // Canonicalize shl to left side in a shl/srl pair. if (RHSShift.getOpcode() == ISD::SHL) { std::swap(LHS, RHS); std::swap(LHSShift, RHSShift); std::swap(LHSMask, RHSMask); } unsigned EltSizeInBits = VT.getScalarSizeInBits(); SDValue LHSShiftArg = LHSShift.getOperand(0); SDValue LHSShiftAmt = LHSShift.getOperand(1); SDValue RHSShiftArg = RHSShift.getOperand(0); SDValue RHSShiftAmt = RHSShift.getOperand(1); // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1) // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2) auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits; }; if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) { SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt); // If there is an AND of either shifted operand, apply it to the result. if (LHSMask.getNode() || RHSMask.getNode()) { SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); SDValue Mask = AllOnes; if (LHSMask.getNode()) { SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits)); } if (RHSMask.getNode()) { SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt); Mask = DAG.getNode(ISD::AND, DL, VT, Mask, DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits)); } Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask); } return Rot.getNode(); } // If there is a mask here, and we have a variable shift, we can't be sure // that we're masking out the right stuff. if (LHSMask.getNode() || RHSMask.getNode()) return nullptr; // If the shift amount is sign/zext/any-extended just peel it off. SDValue LExtOp0 = LHSShiftAmt; SDValue RExtOp0 = RHSShiftAmt; if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND || LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND || LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND || LHSShiftAmt.getOpcode() == ISD::TRUNCATE) && (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND || RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND || RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND || RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) { LExtOp0 = LHSShiftAmt.getOperand(0); RExtOp0 = RHSShiftAmt.getOperand(0); } SDNode *TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL); if (TryL) return TryL; SDNode *TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL); if (TryR) return TryR; return nullptr; } namespace { /// Represents known origin of an individual byte in load combine pattern. The /// value of the byte is either constant zero or comes from memory. struct ByteProvider { // For constant zero providers Load is set to nullptr. For memory providers // Load represents the node which loads the byte from memory. // ByteOffset is the offset of the byte in the value produced by the load. LoadSDNode *Load = nullptr; unsigned ByteOffset = 0; ByteProvider() = default; static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) { return ByteProvider(Load, ByteOffset); } static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); } bool isConstantZero() const { return !Load; } bool isMemory() const { return Load; } bool operator==(const ByteProvider &Other) const { return Other.Load == Load && Other.ByteOffset == ByteOffset; } private: ByteProvider(LoadSDNode *Load, unsigned ByteOffset) : Load(Load), ByteOffset(ByteOffset) {} }; } // end anonymous namespace /// Recursively traverses the expression calculating the origin of the requested /// byte of the given value. Returns None if the provider can't be calculated. /// /// For all the values except the root of the expression verifies that the value /// has exactly one use and if it's not true return None. This way if the origin /// of the byte is returned it's guaranteed that the values which contribute to /// the byte are not used outside of this expression. /// /// Because the parts of the expression are not allowed to have more than one /// use this function iterates over trees, not DAGs. So it never visits the same /// node more than once. static const Optional calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, bool Root = false) { // Typical i64 by i8 pattern requires recursion up to 8 calls depth if (Depth == 10) return None; if (!Root && !Op.hasOneUse()) return None; assert(Op.getValueType().isScalarInteger() && "can't handle other types"); unsigned BitWidth = Op.getValueSizeInBits(); if (BitWidth % 8 != 0) return None; unsigned ByteWidth = BitWidth / 8; assert(Index < ByteWidth && "invalid index requested"); (void) ByteWidth; switch (Op.getOpcode()) { case ISD::OR: { auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1); if (!LHS) return None; auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1); if (!RHS) return None; if (LHS->isConstantZero()) return RHS; if (RHS->isConstantZero()) return LHS; return None; } case ISD::SHL: { auto ShiftOp = dyn_cast(Op->getOperand(1)); if (!ShiftOp) return None; uint64_t BitShift = ShiftOp->getZExtValue(); if (BitShift % 8 != 0) return None; uint64_t ByteShift = BitShift / 8; return Index < ByteShift ? ByteProvider::getConstantZero() : calculateByteProvider(Op->getOperand(0), Index - ByteShift, Depth + 1); } case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: { SDValue NarrowOp = Op->getOperand(0); unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); if (NarrowBitWidth % 8 != 0) return None; uint64_t NarrowByteWidth = NarrowBitWidth / 8; if (Index >= NarrowByteWidth) return Op.getOpcode() == ISD::ZERO_EXTEND ? Optional(ByteProvider::getConstantZero()) : None; return calculateByteProvider(NarrowOp, Index, Depth + 1); } case ISD::BSWAP: return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1, Depth + 1); case ISD::LOAD: { auto L = cast(Op.getNode()); if (L->isVolatile() || L->isIndexed()) return None; unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); if (NarrowBitWidth % 8 != 0) return None; uint64_t NarrowByteWidth = NarrowBitWidth / 8; if (Index >= NarrowByteWidth) return L->getExtensionType() == ISD::ZEXTLOAD ? Optional(ByteProvider::getConstantZero()) : None; return ByteProvider::getMemory(L, Index); } } return None; } /// Match a pattern where a wide type scalar value is loaded by several narrow /// loads and combined by shifts and ors. Fold it into a single load or a load /// and a BSWAP if the targets supports it. /// /// Assuming little endian target: /// i8 *a = ... /// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24) /// => /// i32 val = *((i32)a) /// /// i8 *a = ... /// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3] /// => /// i32 val = BSWAP(*((i32)a)) /// /// TODO: This rule matches complex patterns with OR node roots and doesn't /// interact well with the worklist mechanism. When a part of the pattern is /// updated (e.g. one of the loads) its direct users are put into the worklist, /// but the root node of the pattern which triggers the load combine is not /// necessarily a direct user of the changed node. For example, once the address /// of t28 load is reassociated load combine won't be triggered: /// t25: i32 = add t4, Constant:i32<2> /// t26: i64 = sign_extend t25 /// t27: i64 = add t2, t26 /// t28: i8,ch = load t0, t27, undef:i64 /// t29: i32 = zero_extend t28 /// t32: i32 = shl t29, Constant:i8<8> /// t33: i32 = or t23, t32 /// As a possible fix visitLoad can check if the load can be a part of a load /// combine pattern and add corresponding OR roots to the worklist. SDValue DAGCombiner::MatchLoadCombine(SDNode *N) { assert(N->getOpcode() == ISD::OR && "Can only match load combining against OR nodes"); // Handles simple types only EVT VT = N->getValueType(0); if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); unsigned ByteWidth = VT.getSizeInBits() / 8; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Before legalize we can introduce too wide illegal loads which will be later // split into legal sized loads. This enables us to combine i64 load by i8 // patterns to a couple of i32 loads on 32 bit targets. if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT)) return SDValue(); std::function LittleEndianByteAt = []( unsigned BW, unsigned i) { return i; }; std::function BigEndianByteAt = []( unsigned BW, unsigned i) { return BW - i - 1; }; bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian(); auto MemoryByteOffset = [&] (ByteProvider P) { assert(P.isMemory() && "Must be a memory byte provider"); unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits(); assert(LoadBitWidth % 8 == 0 && "can only analyze providers for individual bytes not bit"); unsigned LoadByteWidth = LoadBitWidth / 8; return IsBigEndianTarget ? BigEndianByteAt(LoadByteWidth, P.ByteOffset) : LittleEndianByteAt(LoadByteWidth, P.ByteOffset); }; Optional Base; SDValue Chain; SmallPtrSet Loads; Optional FirstByteProvider; int64_t FirstOffset = INT64_MAX; // Check if all the bytes of the OR we are looking at are loaded from the same // base address. Collect bytes offsets from Base address in ByteOffsets. SmallVector ByteOffsets(ByteWidth); for (unsigned i = 0; i < ByteWidth; i++) { auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true); if (!P || !P->isMemory()) // All the bytes must be loaded from memory return SDValue(); LoadSDNode *L = P->Load; assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() && "Must be enforced by calculateByteProvider"); assert(L->getOffset().isUndef() && "Unindexed load must have undef offset"); // All loads must share the same chain SDValue LChain = L->getChain(); if (!Chain) Chain = LChain; else if (Chain != LChain) return SDValue(); // Loads must share the same base address BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG); int64_t ByteOffsetFromBase = 0; if (!Base) Base = Ptr; else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase)) return SDValue(); // Calculate the offset of the current byte from the base address ByteOffsetFromBase += MemoryByteOffset(*P); ByteOffsets[i] = ByteOffsetFromBase; // Remember the first byte load if (ByteOffsetFromBase < FirstOffset) { FirstByteProvider = P; FirstOffset = ByteOffsetFromBase; } Loads.insert(L); } assert(!Loads.empty() && "All the bytes of the value must be loaded from " "memory, so there must be at least one load which produces the value"); assert(Base && "Base address of the accessed memory location must be set"); assert(FirstOffset != INT64_MAX && "First byte offset must be set"); // Check if the bytes of the OR we are looking at match with either big or // little endian value load bool BigEndian = true, LittleEndian = true; for (unsigned i = 0; i < ByteWidth; i++) { int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset; LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i); BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i); if (!BigEndian && !LittleEndian) return SDValue(); } assert((BigEndian != LittleEndian) && "should be either or"); assert(FirstByteProvider && "must be set"); // Ensure that the first byte is loaded from zero offset of the first load. // So the combined value can be loaded from the first load address. if (MemoryByteOffset(*FirstByteProvider) != 0) return SDValue(); LoadSDNode *FirstLoad = FirstByteProvider->Load; // The node we are looking at matches with the pattern, check if we can // replace it with a single load and bswap if needed. // If the load needs byte swap check if the target supports it bool NeedsBswap = IsBigEndianTarget != BigEndian; // Before legalize we can introduce illegal bswaps which will be later // converted to an explicit bswap sequence. This way we end up with a single // load and byte shuffling instead of several loads and byte shuffling. if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT)) return SDValue(); // Check that a load of the wide type is both allowed and fast on the target bool Fast = false; bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, FirstLoad->getAddressSpace(), FirstLoad->getAlignment(), &Fast); if (!Allowed || !Fast) return SDValue(); SDValue NewLoad = DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), FirstLoad->getAlignment()); // Transfer chain users from old loads to the new load. for (LoadSDNode *L : Loads) DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1)); return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad; } // If the target has andn, bsl, or a similar bit-select instruction, // we want to unfold masked merge, with canonical pattern of: // | A | |B| // ((x ^ y) & m) ^ y // | D | // Into: // (x & m) | (y & ~m) // If y is a constant, and the 'andn' does not work with immediates, // we unfold into a different pattern: // ~(~x & m) & (m | y) // NOTE: we don't unfold the pattern if 'xor' is actually a 'not', because at // the very least that breaks andnpd / andnps patterns, and because those // patterns are simplified in IR and shouldn't be created in the DAG SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) { assert(N->getOpcode() == ISD::XOR); // Don't touch 'not' (i.e. where y = -1). if (isAllOnesConstantOrAllOnesSplatConstant(N->getOperand(1))) return SDValue(); EVT VT = N->getValueType(0); // There are 3 commutable operators in the pattern, // so we have to deal with 8 possible variants of the basic pattern. SDValue X, Y, M; auto matchAndXor = [&X, &Y, &M](SDValue And, unsigned XorIdx, SDValue Other) { if (And.getOpcode() != ISD::AND || !And.hasOneUse()) return false; SDValue Xor = And.getOperand(XorIdx); if (Xor.getOpcode() != ISD::XOR || !Xor.hasOneUse()) return false; SDValue Xor0 = Xor.getOperand(0); SDValue Xor1 = Xor.getOperand(1); // Don't touch 'not' (i.e. where y = -1). if (isAllOnesConstantOrAllOnesSplatConstant(Xor1)) return false; if (Other == Xor0) std::swap(Xor0, Xor1); if (Other != Xor1) return false; X = Xor0; Y = Xor1; M = And.getOperand(XorIdx ? 0 : 1); return true; }; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); if (!matchAndXor(N0, 0, N1) && !matchAndXor(N0, 1, N1) && !matchAndXor(N1, 0, N0) && !matchAndXor(N1, 1, N0)) return SDValue(); // Don't do anything if the mask is constant. This should not be reachable. // InstCombine should have already unfolded this pattern, and DAGCombiner // probably shouldn't produce it, too. if (isa(M.getNode())) return SDValue(); // We can transform if the target has AndNot if (!TLI.hasAndNot(M)) return SDValue(); SDLoc DL(N); // If Y is a constant, check that 'andn' works with immediates. if (!TLI.hasAndNot(Y)) { assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable."); // If not, we need to do a bit more work to make sure andn is still used. SDValue NotX = DAG.getNOT(DL, X, VT); SDValue LHS = DAG.getNode(ISD::AND, DL, VT, NotX, M); SDValue NotLHS = DAG.getNOT(DL, LHS, VT); SDValue RHS = DAG.getNode(ISD::OR, DL, VT, M, Y); return DAG.getNode(ISD::AND, DL, VT, NotLHS, RHS); } SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M); SDValue NotM = DAG.getNOT(DL, M, VT); SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM); return DAG.getNode(ISD::OR, DL, VT, LHS, RHS); } SDValue DAGCombiner::visitXOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (xor x, 0) -> x, vector edition if (ISD::isBuildVectorAllZeros(N0.getNode())) return N1; if (ISD::isBuildVectorAllZeros(N1.getNode())) return N0; } // fold (xor undef, undef) -> 0. This is a common idiom (misuse). if (N0.isUndef() && N1.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); // fold (xor x, undef) -> undef if (N0.isUndef()) return N0; if (N1.isUndef()) return N1; // fold (xor c1, c2) -> c1^c2 ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); ConstantSDNode *N1C = getAsNonOpaqueConstant(N1); if (N0C && N1C) return DAG.FoldConstantArithmetic(ISD::XOR, SDLoc(N), VT, N0C, N1C); // canonicalize constant to RHS if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && !DAG.isConstantIntBuildVectorOrConstantInt(N1)) return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0); // fold (xor x, 0) -> x if (isNullConstant(N1)) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // reassociate xor if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1)) return RXOR; // fold !(x cc y) -> (x !cc y) SDValue LHS, RHS, CC; if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) { bool isInt = LHS.getValueType().isInteger(); ISD::CondCode NotCC = ISD::getSetCCInverse(cast(CC)->get(), isInt); if (!LegalOperations || TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) { switch (N0.getOpcode()) { default: llvm_unreachable("Unhandled SetCC Equivalent!"); case ISD::SETCC: return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC); case ISD::SELECT_CC: return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2), N0.getOperand(3), NotCC); } } } // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y))) if (isOneConstant(N1) && N0.getOpcode() == ISD::ZERO_EXTEND && N0.getNode()->hasOneUse() && isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){ SDValue V = N0.getOperand(0); SDLoc DL(N0); V = DAG.getNode(ISD::XOR, DL, V.getValueType(), V, DAG.getConstant(1, DL, V.getValueType())); AddToWorklist(V.getNode()); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V); } // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() && (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) { SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) { unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND; LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); } } // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants if (isAllOnesConstant(N1) && N0.hasOneUse() && (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) { SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); if (isa(RHS) || isa(LHS)) { unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND; LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode()); return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS); } } // fold (xor (and x, y), y) -> (and (not x), y) if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && N0->getOperand(1) == N1) { SDValue X = N0->getOperand(0); SDValue NotX = DAG.getNOT(SDLoc(X), X, VT); AddToWorklist(NotX.getNode()); return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1); } // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X) if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { SDValue A = N0.getOpcode() == ISD::ADD ? N0 : N1; SDValue S = N0.getOpcode() == ISD::SRA ? N0 : N1; if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) { SDValue A0 = A.getOperand(0), A1 = A.getOperand(1); SDValue S0 = S.getOperand(0); if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0)) { unsigned OpSizeInBits = VT.getScalarSizeInBits(); if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1))) if (C->getAPIntValue() == (OpSizeInBits - 1)) return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0); } } } // fold (xor x, x) -> 0 if (N0 == N1) return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes); // fold (xor (shl 1, x), -1) -> (rotl ~1, x) // Here is a concrete example of this equivalence: // i16 x == 14 // i16 shl == 1 << 14 == 16384 == 0b0100000000000000 // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111 // // => // // i16 ~1 == 0b1111111111111110 // i16 rol(~1, 14) == 0b1011111111111111 // // Some additional tips to help conceptualize this transform: // - Try to see the operation as placing a single zero in a value of all ones. // - There exists no value for x which would allow the result to contain zero. // - Values of x larger than the bitwidth are undefined and do not require a // consistent result. // - Pushing the zero left requires shifting one bits in from the right. // A rotate left of ~1 is a nice way of achieving the desired result. if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0.getOpcode() == ISD::SHL && isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) { SDLoc DL(N); return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT), N0.getOperand(1)); } // Simplify: xor (op x...), (op y...) -> (op (xor x, y)) if (N0.getOpcode() == N1.getOpcode()) if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N)) return Tmp; // Unfold ((x ^ y) & m) ^ y into (x & m) | (y & ~m) if profitable if (SDValue MM = unfoldMaskedMerge(N)) return MM; // Simplify the expression using non-local knowledge. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); return SDValue(); } /// Handle transforms common to the three shifts, when the shift amount is a /// constant. SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) { SDNode *LHS = N->getOperand(0).getNode(); if (!LHS->hasOneUse()) return SDValue(); // We want to pull some binops through shifts, so that we have (and (shift)) // instead of (shift (and)), likewise for add, or, xor, etc. This sort of // thing happens with address calculations, so it's important to canonicalize // it. bool HighBitSet = false; // Can we transform this if the high bit is set? switch (LHS->getOpcode()) { default: return SDValue(); case ISD::OR: case ISD::XOR: HighBitSet = false; // We can only transform sra if the high bit is clear. break; case ISD::AND: HighBitSet = true; // We can only transform sra if the high bit is set. break; case ISD::ADD: if (N->getOpcode() != ISD::SHL) return SDValue(); // only shl(add) not sr[al](add). HighBitSet = false; // We can only transform sra if the high bit is clear. break; } // We require the RHS of the binop to be a constant and not opaque as well. ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS->getOperand(1)); if (!BinOpCst) return SDValue(); // FIXME: disable this unless the input to the binop is a shift by a constant // or is copy/select.Enable this in other cases when figure out it's exactly profitable. SDNode *BinOpLHSVal = LHS->getOperand(0).getNode(); bool isShift = BinOpLHSVal->getOpcode() == ISD::SHL || BinOpLHSVal->getOpcode() == ISD::SRA || BinOpLHSVal->getOpcode() == ISD::SRL; bool isCopyOrSelect = BinOpLHSVal->getOpcode() == ISD::CopyFromReg || BinOpLHSVal->getOpcode() == ISD::SELECT; if ((!isShift || !isa(BinOpLHSVal->getOperand(1))) && !isCopyOrSelect) return SDValue(); if (isCopyOrSelect && N->hasOneUse()) return SDValue(); EVT VT = N->getValueType(0); // If this is a signed shift right, and the high bit is modified by the // logical operation, do not perform the transformation. The highBitSet // boolean indicates the value of the high bit of the constant which would // cause it to be modified for this operation. if (N->getOpcode() == ISD::SRA) { bool BinOpRHSSignSet = BinOpCst->getAPIntValue().isNegative(); if (BinOpRHSSignSet != HighBitSet) return SDValue(); } - if (!TLI.isDesirableToCommuteWithShift(LHS)) + if (!TLI.isDesirableToCommuteWithShift(N, Level)) return SDValue(); // Fold the constants, shifting the binop RHS by the shift amount. SDValue NewRHS = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(1)), N->getValueType(0), LHS->getOperand(1), N->getOperand(1)); assert(isa(NewRHS) && "Folding was not successful!"); // Create the new shift. SDValue NewShift = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(0)), VT, LHS->getOperand(0), N->getOperand(1)); // Create the new binop. return DAG.getNode(LHS->getOpcode(), SDLoc(N), VT, NewShift, NewRHS); } SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) { assert(N->getOpcode() == ISD::TRUNCATE); assert(N->getOperand(0).getOpcode() == ISD::AND); // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC) if (N->hasOneUse() && N->getOperand(0).hasOneUse()) { SDValue N01 = N->getOperand(0).getOperand(1); if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) { SDLoc DL(N); EVT TruncVT = N->getValueType(0); SDValue N00 = N->getOperand(0).getOperand(0); SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00); SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01); AddToWorklist(Trunc00.getNode()); AddToWorklist(Trunc01.getNode()); return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01); } } return SDValue(); } SDValue DAGCombiner::visitRotate(SDNode *N) { SDLoc dl(N); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); unsigned Bitsize = VT.getScalarSizeInBits(); // fold (rot x, 0) -> x if (isNullConstantOrNullSplatConstant(N1)) return N0; // fold (rot x, c) -> (rot x, c % BitSize) if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) { if (Cst->getAPIntValue().uge(Bitsize)) { uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize); return DAG.getNode(N->getOpcode(), dl, VT, N0, DAG.getConstant(RotAmt, dl, N1.getValueType())); } } // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1); } unsigned NextOp = N0.getOpcode(); // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize) if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) { SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1); SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)); if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) { EVT ShiftVT = C1->getValueType(0); bool SameSide = (N->getOpcode() == NextOp); unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB; if (SDValue CombinedShift = DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, C1, C2)) { SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT); SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic( ISD::SREM, dl, ShiftVT, CombinedShift.getNode(), BitsizeC.getNode()); return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0), CombinedShiftNorm); } } } return SDValue(); } SDValue DAGCombiner::visitSHL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); // fold vector ops if (VT.isVector()) { if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; BuildVectorSDNode *N1CV = dyn_cast(N1); // If setcc produces all-one true value then: // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<isConstant()) { if (N0.getOpcode() == ISD::AND) { SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); BuildVectorSDNode *N01CV = dyn_cast(N01); if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC && TLI.getBooleanContents(N00.getOperand(0).getValueType()) == TargetLowering::ZeroOrNegativeOneBooleanContent) { if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N01CV, N1CV)) return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C); } } } } ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (shl c1, c2) -> c1<isOpaque()) return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C); // fold (shl 0, x) -> 0 if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (shl x, c >= size(x)) -> undef // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { return Val->getAPIntValue().uge(OpSizeInBits); }; if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig)) return DAG.getUNDEF(VT); // fold (shl x, 0) -> x if (N1C && N1C->isNullValue()) return N0; // fold (shl undef, x) -> 0 if (N0.isUndef()) return DAG.getConstant(0, SDLoc(N), VT); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // if (shl x, c) is known to be zero, return 0 if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) return DAG.getConstant(0, SDLoc(N), VT); // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1); } if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2)) if (N0.getOpcode() == ISD::SHL) { auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).uge(OpSizeInBits); }; if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange)) return DAG.getConstant(0, SDLoc(N), VT); auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).ult(OpSizeInBits); }; if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { SDLoc DL(N); EVT ShiftVT = N1.getValueType(); SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum); } } // fold (shl (ext (shl x, c1)), c2) -> (ext (shl x, (add c1, c2))) // For this to be valid, the second form must not preserve any of the bits // that are shifted out by the inner shift in the first form. This means // the outer shift size must be >= the number of bits added by the ext. // As a corollary, we don't care what kind of ext it is. if (N1C && (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND) && N0.getOperand(0).getOpcode() == ISD::SHL) { SDValue N0Op0 = N0.getOperand(0); if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) { APInt c1 = N0Op0C1->getAPIntValue(); APInt c2 = N1C->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); EVT InnerShiftVT = N0Op0.getValueType(); uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); if (c2.uge(OpSizeInBits - InnerShiftSize)) { SDLoc DL(N0); APInt Sum = c1 + c2; if (Sum.uge(OpSizeInBits)) return DAG.getConstant(0, DL, VT); return DAG.getNode( ISD::SHL, DL, VT, DAG.getNode(N0.getOpcode(), DL, VT, N0Op0->getOperand(0)), DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType())); } } } // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C)) // Only fold this if the inner zext has no other uses to avoid increasing // the total number of instructions. if (N1C && N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::SRL) { SDValue N0Op0 = N0.getOperand(0); if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) { if (N0Op0C1->getAPIntValue().ult(VT.getScalarSizeInBits())) { uint64_t c1 = N0Op0C1->getZExtValue(); uint64_t c2 = N1C->getZExtValue(); if (c1 == c2) { SDValue NewOp0 = N0.getOperand(0); EVT CountVT = NewOp0.getOperand(1).getValueType(); SDLoc DL(N); SDValue NewSHL = DAG.getNode(ISD::SHL, DL, NewOp0.getValueType(), NewOp0, DAG.getConstant(c2, DL, CountVT)); AddToWorklist(NewSHL.getNode()); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL); } } } } // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2 if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) && N0->getFlags().hasExact()) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { uint64_t C1 = N0C1->getZExtValue(); uint64_t C2 = N1C->getZExtValue(); SDLoc DL(N); if (C1 <= C2) return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), DAG.getConstant(C2 - C1, DL, N1.getValueType())); return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), DAG.getConstant(C1 - C2, DL, N1.getValueType())); } } // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or // (and (srl x, (sub c1, c2), MASK) // Only fold this if the inner shift has no other uses -- if it does, folding // this will increase the total number of instructions. if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) { uint64_t c1 = N0C1->getZExtValue(); if (c1 < OpSizeInBits) { uint64_t c2 = N1C->getZExtValue(); APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1); SDValue Shift; if (c2 > c1) { Mask <<= c2 - c1; SDLoc DL(N); Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), DAG.getConstant(c2 - c1, DL, N1.getValueType())); } else { Mask.lshrInPlace(c1 - c2); SDLoc DL(N); Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), DAG.getConstant(c1 - c2, DL, N1.getValueType())); } SDLoc DL(N0); return DAG.getNode(ISD::AND, DL, VT, Shift, DAG.getConstant(Mask, DL, VT)); } } } // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1)) if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) && isConstantOrConstantVector(N1, /* No Opaques */ true)) { SDLoc DL(N); SDValue AllBits = DAG.getAllOnesConstant(DL, VT); SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask); } // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2) // Variant of version done on multiply, except mul by a power of 2 is turned // into a shift. if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) && N0.getNode()->hasOneUse() && isConstantOrConstantVector(N1, /* No Opaques */ true) && - isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) { + isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) && + TLI.isDesirableToCommuteWithShift(N, Level)) { SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1); SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); AddToWorklist(Shl0.getNode()); AddToWorklist(Shl1.getNode()); return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1); } // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2) if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() && isConstantOrConstantVector(N1, /* No Opaques */ true) && isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) { SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1); if (isConstantOrConstantVector(Shl)) return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl); } if (N1C && !N1C->isOpaque()) if (SDValue NewSHL = visitShiftByConstant(N, N1C)) return NewSHL; return SDValue(); } SDValue DAGCombiner::visitSRA(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); // Arithmetic shifting an all-sign-bit value is a no-op. // fold (sra 0, x) -> 0 // fold (sra -1, x) -> -1 if (DAG.ComputeNumSignBits(N0) == OpSizeInBits) return N0; // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (sra c1, c2) -> (sra c1, c2) ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C); // fold (sra x, c >= size(x)) -> undef // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { return Val->getAPIntValue().uge(OpSizeInBits); }; if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig)) return DAG.getUNDEF(VT); // fold (sra x, 0) -> x if (N1C && N1C->isNullValue()) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports // sext_inreg. if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) { unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue(); EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits); if (VT.isVector()) ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, VT.getVectorNumElements()); if ((!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT))) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0), DAG.getValueType(ExtVT)); } // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2)) if (N0.getOpcode() == ISD::SRA) { SDLoc DL(N); EVT ShiftVT = N1.getValueType(); auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).uge(OpSizeInBits); }; if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange)) return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), DAG.getConstant(OpSizeInBits - 1, DL, ShiftVT)); auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).ult(OpSizeInBits); }; if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), Sum); } } // fold (sra (shl X, m), (sub result_size, n)) // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for // result_size - n != m. // If truncate is free for the target sext(shl) is likely to result in better // code. if (N0.getOpcode() == ISD::SHL && N1C) { // Get the two constanst of the shifts, CN0 = m, CN = n. const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1)); if (N01C) { LLVMContext &Ctx = *DAG.getContext(); // Determine what the truncate's result bitsize and type would be. EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue()); if (VT.isVector()) TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements()); // Determine the residual right-shift amount. int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue(); // If the shift is not a no-op (in which case this should be just a sign // extend already), the truncated to type is legal, sign_extend is legal // on that type, and the truncate to that type is both legal and free, // perform the transform. if ((ShiftAmt > 0) && TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) && TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) && TLI.isTruncateFree(VT, TruncVT)) { SDLoc DL(N); SDValue Amt = DAG.getConstant(ShiftAmt, DL, getShiftAmountTy(N0.getOperand(0).getValueType())); SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Amt); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Shift); return DAG.getNode(ISD::SIGN_EXTEND, DL, N->getValueType(0), Trunc); } } } // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1); } // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2)) // if c1 is equal to the number of bits the trunc removes if (N0.getOpcode() == ISD::TRUNCATE && (N0.getOperand(0).getOpcode() == ISD::SRL || N0.getOperand(0).getOpcode() == ISD::SRA) && N0.getOperand(0).hasOneUse() && N0.getOperand(0).getOperand(1).hasOneUse() && N1C) { SDValue N0Op0 = N0.getOperand(0); if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) { unsigned LargeShiftVal = LargeShift->getZExtValue(); EVT LargeVT = N0Op0.getValueType(); if (LargeVT.getScalarSizeInBits() - OpSizeInBits == LargeShiftVal) { SDLoc DL(N); SDValue Amt = DAG.getConstant(LargeShiftVal + N1C->getZExtValue(), DL, getShiftAmountTy(N0Op0.getOperand(0).getValueType())); SDValue SRA = DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt); return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA); } } } // Simplify, based on bits shifted out of the LHS. if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // If the sign bit is known to be zero, switch this to a SRL. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1); if (N1C && !N1C->isOpaque()) if (SDValue NewSRA = visitShiftByConstant(N, N1C)) return NewSRA; return SDValue(); } SDValue DAGCombiner::visitSRL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (srl c1, c2) -> c1 >>u c2 ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C); // fold (srl 0, x) -> 0 if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (srl x, c >= size(x)) -> undef // NOTE: ALL vector elements must be too big to avoid partial UNDEFs. auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) { return Val->getAPIntValue().uge(OpSizeInBits); }; if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig)) return DAG.getUNDEF(VT); // fold (srl x, 0) -> x if (N1C && N1C->isNullValue()) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // if (srl x, c) is known to be zero, return 0 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnesValue(OpSizeInBits))) return DAG.getConstant(0, SDLoc(N), VT); // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2)) if (N0.getOpcode() == ISD::SRL) { auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).uge(OpSizeInBits); }; if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange)) return DAG.getConstant(0, SDLoc(N), VT); auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS, ConstantSDNode *RHS) { APInt c1 = LHS->getAPIntValue(); APInt c2 = RHS->getAPIntValue(); zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */); return (c1 + c2).ult(OpSizeInBits); }; if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) { SDLoc DL(N); EVT ShiftVT = N1.getValueType(); SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1)); return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum); } } // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2))) if (N1C && N0.getOpcode() == ISD::TRUNCATE && N0.getOperand(0).getOpcode() == ISD::SRL) { if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) { uint64_t c1 = N001C->getZExtValue(); uint64_t c2 = N1C->getZExtValue(); EVT InnerShiftVT = N0.getOperand(0).getValueType(); EVT ShiftCountVT = N0.getOperand(0).getOperand(1).getValueType(); uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits(); // This is only valid if the OpSizeInBits + c1 = size of inner shift. if (c1 + OpSizeInBits == InnerShiftSize) { SDLoc DL(N0); if (c1 + c2 >= InnerShiftSize) return DAG.getConstant(0, DL, VT); return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(ISD::SRL, DL, InnerShiftVT, N0.getOperand(0).getOperand(0), DAG.getConstant(c1 + c2, DL, ShiftCountVT))); } } } // fold (srl (shl x, c), c) -> (and x, cst2) if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 && isConstantOrConstantVector(N1, /* NoOpaques */ true)) { SDLoc DL(N); SDValue Mask = DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1); AddToWorklist(Mask.getNode()); return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask); } // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask) if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) { // Shifting in all undef bits? EVT SmallVT = N0.getOperand(0).getValueType(); unsigned BitSize = SmallVT.getScalarSizeInBits(); if (N1C->getZExtValue() >= BitSize) return DAG.getUNDEF(VT); if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) { uint64_t ShiftAmt = N1C->getZExtValue(); SDLoc DL0(N0); SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT, N0.getOperand(0), DAG.getConstant(ShiftAmt, DL0, getShiftAmountTy(SmallVT))); AddToWorklist(SmallShift.getNode()); APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt); SDLoc DL(N); return DAG.getNode(ISD::AND, DL, VT, DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift), DAG.getConstant(Mask, DL, VT)); } } // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign // bit, which is unmodified by sra. if (N1C && N1C->getZExtValue() + 1 == OpSizeInBits) { if (N0.getOpcode() == ISD::SRA) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1); } // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit). if (N1C && N0.getOpcode() == ISD::CTLZ && N1C->getAPIntValue() == Log2_32(OpSizeInBits)) { KnownBits Known; DAG.computeKnownBits(N0.getOperand(0), Known); // If any of the input bits are KnownOne, then the input couldn't be all // zeros, thus the result of the srl will always be zero. if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT); // If all of the bits input the to ctlz node are known to be zero, then // the result of the ctlz is "32" and the result of the shift is one. APInt UnknownBits = ~Known.Zero; if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT); // Otherwise, check to see if there is exactly one bit input to the ctlz. if (UnknownBits.isPowerOf2()) { // Okay, we know that only that the single bit specified by UnknownBits // could be set on input to the CTLZ node. If this bit is set, the SRL // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair // to an SRL/XOR pair, which is likely to simplify more. unsigned ShAmt = UnknownBits.countTrailingZeros(); SDValue Op = N0.getOperand(0); if (ShAmt) { SDLoc DL(N0); Op = DAG.getNode(ISD::SRL, DL, VT, Op, DAG.getConstant(ShAmt, DL, getShiftAmountTy(Op.getValueType()))); AddToWorklist(Op.getNode()); } SDLoc DL(N); return DAG.getNode(ISD::XOR, DL, VT, Op, DAG.getConstant(1, DL, VT)); } } // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))). if (N1.getOpcode() == ISD::TRUNCATE && N1.getOperand(0).getOpcode() == ISD::AND) { if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode())) return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1); } // fold operands of srl based on knowledge that the low bits are not // demanded. if (N1C && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); if (N1C && !N1C->isOpaque()) if (SDValue NewSRL = visitShiftByConstant(N, N1C)) return NewSRL; // Attempt to convert a srl of a load into a narrower zero-extending load. if (SDValue NarrowLoad = ReduceLoadWidth(N)) return NarrowLoad; // Here is a common situation. We want to optimize: // // %a = ... // %b = and i32 %a, 2 // %c = srl i32 %b, 1 // brcond i32 %c ... // // into // // %a = ... // %b = and %a, 2 // %c = setcc eq %b, 0 // brcond %c ... // // However when after the source operand of SRL is optimized into AND, the SRL // itself may not be optimized further. Look for it and add the BRCOND into // the worklist. if (N->hasOneUse()) { SDNode *Use = *N->use_begin(); if (Use->getOpcode() == ISD::BRCOND) AddToWorklist(Use); else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) { // Also look pass the truncate. Use = *Use->use_begin(); if (Use->getOpcode() == ISD::BRCOND) AddToWorklist(Use); } } return SDValue(); } SDValue DAGCombiner::visitABS(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (abs c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0); // fold (abs (abs x)) -> (abs x) if (N0.getOpcode() == ISD::ABS) return N0; // fold (abs x) -> x iff not-negative if (DAG.SignBitIsZero(N0)) return N0; return SDValue(); } SDValue DAGCombiner::visitBSWAP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (bswap c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N0); // fold (bswap (bswap x)) -> x if (N0.getOpcode() == ISD::BSWAP) return N0->getOperand(0); return SDValue(); } SDValue DAGCombiner::visitBITREVERSE(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (bitreverse c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0); // fold (bitreverse (bitreverse x)) -> x if (N0.getOpcode() == ISD::BITREVERSE) return N0.getOperand(0); return SDValue(); } SDValue DAGCombiner::visitCTLZ(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ctlz c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0); // If the value is known never to be zero, switch to the undef version. if (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, VT)) { if (DAG.isKnownNeverZero(N0)) return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0); } return SDValue(); } SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ctlz_zero_undef c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitCTTZ(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (cttz c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0); // If the value is known never to be zero, switch to the undef version. if (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, VT)) { if (DAG.isKnownNeverZero(N0)) return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0); } return SDValue(); } SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (cttz_zero_undef c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitCTPOP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ctpop c1) -> c2 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0); return SDValue(); } /// Generate Min/Max node static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode CC, const TargetLowering &TLI, SelectionDAG &DAG) { if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True)) return SDValue(); switch (CC) { case ISD::SETOLT: case ISD::SETOLE: case ISD::SETLT: case ISD::SETLE: case ISD::SETULT: case ISD::SETULE: { unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM; if (TLI.isOperationLegal(Opcode, VT)) return DAG.getNode(Opcode, DL, VT, LHS, RHS); return SDValue(); } case ISD::SETOGT: case ISD::SETOGE: case ISD::SETGT: case ISD::SETGE: case ISD::SETUGT: case ISD::SETUGE: { unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM; if (TLI.isOperationLegal(Opcode, VT)) return DAG.getNode(Opcode, DL, VT, LHS, RHS); return SDValue(); } default: return SDValue(); } } SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) { SDValue Cond = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); EVT VT = N->getValueType(0); EVT CondVT = Cond.getValueType(); SDLoc DL(N); if (!VT.isInteger()) return SDValue(); auto *C1 = dyn_cast(N1); auto *C2 = dyn_cast(N2); if (!C1 || !C2) return SDValue(); // Only do this before legalization to avoid conflicting with target-specific // transforms in the other direction (create a select from a zext/sext). There // is also a target-independent combine here in DAGCombiner in the other // direction for (select Cond, -1, 0) when the condition is not i1. if (CondVT == MVT::i1 && !LegalOperations) { if (C1->isNullValue() && C2->isOne()) { // select Cond, 0, 1 --> zext (!Cond) SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); if (VT != MVT::i1) NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond); return NotCond; } if (C1->isNullValue() && C2->isAllOnesValue()) { // select Cond, 0, -1 --> sext (!Cond) SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1); if (VT != MVT::i1) NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond); return NotCond; } if (C1->isOne() && C2->isNullValue()) { // select Cond, 1, 0 --> zext (Cond) if (VT != MVT::i1) Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); return Cond; } if (C1->isAllOnesValue() && C2->isNullValue()) { // select Cond, -1, 0 --> sext (Cond) if (VT != MVT::i1) Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); return Cond; } // For any constants that differ by 1, we can transform the select into an // extend and add. Use a target hook because some targets may prefer to // transform in the other direction. if (TLI.convertSelectOfConstantsToMath(VT)) { if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) { // select Cond, C1, C1-1 --> add (zext Cond), C1-1 if (VT != MVT::i1) Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); } if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) { // select Cond, C1, C1+1 --> add (sext Cond), C1+1 if (VT != MVT::i1) Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); return DAG.getNode(ISD::ADD, DL, VT, Cond, N2); } } return SDValue(); } // fold (select Cond, 0, 1) -> (xor Cond, 1) // We can't do this reliably if integer based booleans have different contents // to floating point based booleans. This is because we can't tell whether we // have an integer-based boolean or a floating-point-based boolean unless we // can find the SETCC that produced it and inspect its operands. This is // fairly easy if C is the SETCC node, but it can potentially be // undiscoverable (or not reasonably discoverable). For example, it could be // in another basic block or it could require searching a complicated // expression. if (CondVT.isInteger() && TLI.getBooleanContents(/*isVec*/false, /*isFloat*/true) == TargetLowering::ZeroOrOneBooleanContent && TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) == TargetLowering::ZeroOrOneBooleanContent && C1->isNullValue() && C2->isOne()) { SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT)); if (VT.bitsEq(CondVT)) return NotCond; return DAG.getZExtOrTrunc(NotCond, DL, VT); } return SDValue(); } SDValue DAGCombiner::visitSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); EVT VT = N->getValueType(0); EVT VT0 = N0.getValueType(); SDLoc DL(N); // fold (select C, X, X) -> X if (N1 == N2) return N1; if (const ConstantSDNode *N0C = dyn_cast(N0)) { // fold (select true, X, Y) -> X // fold (select false, X, Y) -> Y return !N0C->isNullValue() ? N1 : N2; } // fold (select X, X, Y) -> (or X, Y) // fold (select X, 1, Y) -> (or C, Y) if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1))) return DAG.getNode(ISD::OR, DL, VT, N0, N2); if (SDValue V = foldSelectOfConstants(N)) return V; // fold (select C, 0, X) -> (and (not C), X) if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) { SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT); AddToWorklist(NOTNode.getNode()); return DAG.getNode(ISD::AND, DL, VT, NOTNode, N2); } // fold (select C, X, 1) -> (or (not C), X) if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) { SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT); AddToWorklist(NOTNode.getNode()); return DAG.getNode(ISD::OR, DL, VT, NOTNode, N1); } // fold (select X, Y, X) -> (and X, Y) // fold (select X, Y, 0) -> (and X, Y) if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2))) return DAG.getNode(ISD::AND, DL, VT, N0, N1); // If we can fold this based on the true/false value, do so. if (SimplifySelectOps(N, N1, N2)) return SDValue(N, 0); // Don't revisit N. if (VT0 == MVT::i1) { // The code in this block deals with the following 2 equivalences: // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y)) // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y) // The target can specify its preferred form with the // shouldNormalizeToSelectSequence() callback. However we always transform // to the right anyway if we find the inner select exists in the DAG anyway // and we always transform to the left side if we know that we can further // optimize the combination of the conditions. bool normalizeToSequence = TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT); // select (and Cond0, Cond1), X, Y // -> select Cond0, (select Cond1, X, Y), Y if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) { SDValue Cond0 = N0->getOperand(0); SDValue Cond1 = N0->getOperand(1); SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2); if (normalizeToSequence || !InnerSelect.use_empty()) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, InnerSelect, N2); } // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y) if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) { SDValue Cond0 = N0->getOperand(0); SDValue Cond1 = N0->getOperand(1); SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2); if (normalizeToSequence || !InnerSelect.use_empty()) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1, InnerSelect); } // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) { SDValue N1_0 = N1->getOperand(0); SDValue N1_1 = N1->getOperand(1); SDValue N1_2 = N1->getOperand(2); if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) { // Create the actual and node if we can generate good code for it. if (!normalizeToSequence) { SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0); return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1, N2); } // Otherwise see if we can optimize the "and" to a better pattern. if (SDValue Combined = visitANDLike(N0, N1_0, N)) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1, N2); } } // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) { SDValue N2_0 = N2->getOperand(0); SDValue N2_1 = N2->getOperand(1); SDValue N2_2 = N2->getOperand(2); if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) { // Create the actual or node if we can generate good code for it. if (!normalizeToSequence) { SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0); return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1, N2_2); } // Otherwise see if we can optimize to a better pattern. if (SDValue Combined = visitORLike(N0, N2_0, N)) return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1, N2_2); } } } if (VT0 == MVT::i1) { // select (not Cond), N1, N2 -> select Cond, N2, N1 if (isBitwiseNot(N0)) return DAG.getNode(ISD::SELECT, DL, VT, N0->getOperand(0), N2, N1); } // fold selects based on a setcc into other things, such as min/max/abs if (N0.getOpcode() == ISD::SETCC) { // select x, y (fcmp lt x, y) -> fminnum x, y // select x, y (fcmp gt x, y) -> fmaxnum x, y // // This is OK if we don't care about what happens if either operand is a // NaN. // // FIXME: Instead of testing for UnsafeFPMath, this should be checking for // no signed zeros as well as no nans. const TargetOptions &Options = DAG.getTarget().Options; if (Options.UnsafeFPMath && VT.isFloatingPoint() && N0.hasOneUse() && DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) { ISD::CondCode CC = cast(N0.getOperand(2))->get(); if (SDValue FMinMax = combineMinNumMaxNum( DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG)) return FMinMax; } if ((!LegalOperations && TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) || TLI.isOperationLegal(ISD::SELECT_CC, VT)) return DAG.getNode(ISD::SELECT_CC, DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, N0.getOperand(2)); return SimplifySelect(DL, N0, N1, N2); } return SDValue(); } static std::pair SplitVSETCC(const SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); // Split the inputs. SDValue Lo, Hi, LL, LH, RL, RH; std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0); std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2)); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2)); return std::make_pair(Lo, Hi); } // This function assumes all the vselect's arguments are CONCAT_VECTOR // nodes and that the condition is a BV of ConstantSDNodes (or undefs). static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); SDValue Cond = N->getOperand(0); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); EVT VT = N->getValueType(0); int NumElems = VT.getVectorNumElements(); assert(LHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getOpcode() == ISD::CONCAT_VECTORS && Cond.getOpcode() == ISD::BUILD_VECTOR); // CONCAT_VECTOR can take an arbitrary number of arguments. We only care about // binary ones here. if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2) return SDValue(); // We're sure we have an even number of elements due to the // concat_vectors we have as arguments to vselect. // Skip BV elements until we find one that's not an UNDEF // After we find an UNDEF element, keep looping until we get to half the // length of the BV and see if all the non-undef nodes are the same. ConstantSDNode *BottomHalf = nullptr; for (int i = 0; i < NumElems / 2; ++i) { if (Cond->getOperand(i)->isUndef()) continue; if (BottomHalf == nullptr) BottomHalf = cast(Cond.getOperand(i)); else if (Cond->getOperand(i).getNode() != BottomHalf) return SDValue(); } // Do the same for the second half of the BuildVector ConstantSDNode *TopHalf = nullptr; for (int i = NumElems / 2; i < NumElems; ++i) { if (Cond->getOperand(i)->isUndef()) continue; if (TopHalf == nullptr) TopHalf = cast(Cond.getOperand(i)); else if (Cond->getOperand(i).getNode() != TopHalf) return SDValue(); } assert(TopHalf && BottomHalf && "One half of the selector was all UNDEFs and the other was all the " "same value. This should have been addressed before this function."); return DAG.getNode( ISD::CONCAT_VECTORS, DL, VT, BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0), TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1)); } SDValue DAGCombiner::visitMSCATTER(SDNode *N) { if (Level >= AfterLegalizeTypes) return SDValue(); MaskedScatterSDNode *MSC = cast(N); SDValue Mask = MSC->getMask(); SDValue Data = MSC->getValue(); SDLoc DL(N); // If the MSCATTER data type requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (Mask.getOpcode() != ISD::SETCC) return SDValue(); // Check if any splitting is required. if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) != TargetLowering::TypeSplitVector) return SDValue(); SDValue MaskLo, MaskHi, Lo, Hi; std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MSC->getValueType(0)); SDValue Chain = MSC->getChain(); EVT MemoryVT = MSC->getMemoryVT(); unsigned Alignment = MSC->getOriginalAlignment(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); SDValue Scale = MSC->getScale(); SDValue BasePtr = MSC->getBasePtr(); SDValue IndexLo, IndexHi; std::tie(IndexLo, IndexHi) = DAG.SplitVector(MSC->getIndex(), DL); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MSC->getPointerInfo(), MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, MSC->getAAInfo(), MSC->getRanges()); SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo, Scale }; Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(), DL, OpsLo, MMO); SDValue OpsHi[] = { Chain, DataHi, MaskHi, BasePtr, IndexHi, Scale }; Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), DL, OpsHi, MMO); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } SDValue DAGCombiner::visitMSTORE(SDNode *N) { if (Level >= AfterLegalizeTypes) return SDValue(); MaskedStoreSDNode *MST = dyn_cast(N); SDValue Mask = MST->getMask(); SDValue Data = MST->getValue(); EVT VT = Data.getValueType(); SDLoc DL(N); // If the MSTORE data type requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (Mask.getOpcode() == ISD::SETCC) { // Check if any splitting is required. if (TLI.getTypeAction(*DAG.getContext(), VT) != TargetLowering::TypeSplitVector) return SDValue(); SDValue MaskLo, MaskHi, Lo, Hi; std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); SDValue Chain = MST->getChain(); SDValue Ptr = MST->getBasePtr(); EVT MemoryVT = MST->getMemoryVT(); unsigned Alignment = MST->getOriginalAlignment(); // if Alignment is equal to the vector size, // take the half of it for the second part unsigned SecondHalfAlignment = (Alignment == VT.getSizeInBits() / 8) ? Alignment / 2 : Alignment; EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MST->getPointerInfo(), MachineMemOperand::MOStore, LoMemVT.getStoreSize(), Alignment, MST->getAAInfo(), MST->getRanges()); Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO, MST->isTruncatingStore(), MST->isCompressingStore()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, MST->isCompressingStore()); unsigned HiOffset = LoMemVT.getStoreSize(); MMO = DAG.getMachineFunction().getMachineMemOperand( MST->getPointerInfo().getWithOffset(HiOffset), MachineMemOperand::MOStore, HiMemVT.getStoreSize(), SecondHalfAlignment, MST->getAAInfo(), MST->getRanges()); Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO, MST->isTruncatingStore(), MST->isCompressingStore()); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi); } return SDValue(); } SDValue DAGCombiner::visitMGATHER(SDNode *N) { if (Level >= AfterLegalizeTypes) return SDValue(); MaskedGatherSDNode *MGT = cast(N); SDValue Mask = MGT->getMask(); SDLoc DL(N); // If the MGATHER result requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (Mask.getOpcode() != ISD::SETCC) return SDValue(); EVT VT = N->getValueType(0); // Check if any splitting is required. if (TLI.getTypeAction(*DAG.getContext(), VT) != TargetLowering::TypeSplitVector) return SDValue(); SDValue MaskLo, MaskHi, Lo, Hi; std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); SDValue Src0 = MGT->getValue(); SDValue Src0Lo, Src0Hi; std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); SDValue Chain = MGT->getChain(); EVT MemoryVT = MGT->getMemoryVT(); unsigned Alignment = MGT->getOriginalAlignment(); EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue Scale = MGT->getScale(); SDValue BasePtr = MGT->getBasePtr(); SDValue Index = MGT->getIndex(); SDValue IndexLo, IndexHi; std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MGT->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MGT->getAAInfo(), MGT->getRanges()); SDValue OpsLo[] = { Chain, Src0Lo, MaskLo, BasePtr, IndexLo, Scale }; Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, DL, OpsLo, MMO); SDValue OpsHi[] = { Chain, Src0Hi, MaskHi, BasePtr, IndexHi, Scale }; Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, DL, OpsHi, MMO); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); // Build a factor node to remember that this load is independent of the // other one. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalized the chain result - switch anything that used the old chain to // use the new one. DAG.ReplaceAllUsesOfValueWith(SDValue(MGT, 1), Chain); SDValue GatherRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); SDValue RetOps[] = { GatherRes, Chain }; return DAG.getMergeValues(RetOps, DL); } SDValue DAGCombiner::visitMLOAD(SDNode *N) { if (Level >= AfterLegalizeTypes) return SDValue(); MaskedLoadSDNode *MLD = dyn_cast(N); SDValue Mask = MLD->getMask(); SDLoc DL(N); // If the MLOAD result requires splitting and the mask is provided by a // SETCC, then split both nodes and its operands before legalization. This // prevents the type legalizer from unrolling SETCC into scalar comparisons // and enables future optimizations (e.g. min/max pattern matching on X86). if (Mask.getOpcode() == ISD::SETCC) { EVT VT = N->getValueType(0); // Check if any splitting is required. if (TLI.getTypeAction(*DAG.getContext(), VT) != TargetLowering::TypeSplitVector) return SDValue(); SDValue MaskLo, MaskHi, Lo, Hi; std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG); SDValue Src0 = MLD->getSrc0(); SDValue Src0Lo, Src0Hi; std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL); EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0)); SDValue Chain = MLD->getChain(); SDValue Ptr = MLD->getBasePtr(); EVT MemoryVT = MLD->getMemoryVT(); unsigned Alignment = MLD->getOriginalAlignment(); // if Alignment is equal to the vector size, // take the half of it for the second part unsigned SecondHalfAlignment = (Alignment == MLD->getValueType(0).getSizeInBits()/8) ? Alignment/2 : Alignment; EVT LoMemVT, HiMemVT; std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); MachineMemOperand *MMO = DAG.getMachineFunction(). getMachineMemOperand(MLD->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), Alignment, MLD->getAAInfo(), MLD->getRanges()); Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad()); Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG, MLD->isExpandingLoad()); unsigned HiOffset = LoMemVT.getStoreSize(); MMO = DAG.getMachineFunction().getMachineMemOperand( MLD->getPointerInfo().getWithOffset(HiOffset), MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges()); Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad()); AddToWorklist(Lo.getNode()); AddToWorklist(Hi.getNode()); // Build a factor node to remember that this load is independent of the // other one. Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1), Hi.getValue(1)); // Legalized the chain result - switch anything that used the old chain to // use the new one. DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain); SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); SDValue RetOps[] = { LoadRes, Chain }; return DAG.getMergeValues(RetOps, DL); } return SDValue(); } /// A vector select of 2 constant vectors can be simplified to math/logic to /// avoid a variable select instruction and possibly avoid constant loads. SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) { SDValue Cond = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); EVT VT = N->getValueType(0); if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 || !TLI.convertSelectOfConstantsToMath(VT) || !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) || !ISD::isBuildVectorOfConstantSDNodes(N2.getNode())) return SDValue(); // Check if we can use the condition value to increment/decrement a single // constant value. This simplifies a select to an add and removes a constant // load/materialization from the general case. bool AllAddOne = true; bool AllSubOne = true; unsigned Elts = VT.getVectorNumElements(); for (unsigned i = 0; i != Elts; ++i) { SDValue N1Elt = N1.getOperand(i); SDValue N2Elt = N2.getOperand(i); if (N1Elt.isUndef() || N2Elt.isUndef()) continue; const APInt &C1 = cast(N1Elt)->getAPIntValue(); const APInt &C2 = cast(N2Elt)->getAPIntValue(); if (C1 != C2 + 1) AllAddOne = false; if (C1 != C2 - 1) AllSubOne = false; } // Further simplifications for the extra-special cases where the constants are // all 0 or all -1 should be implemented as folds of these patterns. SDLoc DL(N); if (AllAddOne || AllSubOne) { // vselect Cond, C+1, C --> add (zext Cond), C // vselect Cond, C-1, C --> add (sext Cond), C auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond); return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2); } // The general case for select-of-constants: // vselect Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2 // ...but that only makes sense if a vselect is slower than 2 logic ops, so // leave that to a machine-specific pass. return SDValue(); } SDValue DAGCombiner::visitVSELECT(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); SDLoc DL(N); // fold (vselect C, X, X) -> X if (N1 == N2) return N1; // Canonicalize integer abs. // vselect (setg[te] X, 0), X, -X -> // vselect (setgt X, -1), X, -X -> // vselect (setl[te] X, 0), -X, X -> // Y = sra (X, size(X)-1); xor (add (X, Y), Y) if (N0.getOpcode() == ISD::SETCC) { SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1); ISD::CondCode CC = cast(N0.getOperand(2))->get(); bool isAbs = false; bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) || (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) && N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1)) isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode()); else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) && N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1)) isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode()); if (isAbs) { EVT VT = LHS.getValueType(); if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) return DAG.getNode(ISD::ABS, DL, VT, LHS); SDValue Shift = DAG.getNode( ISD::SRA, DL, VT, LHS, DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT)); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift); AddToWorklist(Shift.getNode()); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::XOR, DL, VT, Add, Shift); } // If this select has a condition (setcc) with narrower operands than the // select, try to widen the compare to match the select width. // TODO: This should be extended to handle any constant. // TODO: This could be extended to handle non-loading patterns, but that // requires thorough testing to avoid regressions. if (isNullConstantOrNullSplatConstant(RHS)) { EVT NarrowVT = LHS.getValueType(); EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger(); EVT SetCCVT = getSetCCResultType(LHS.getValueType()); unsigned SetCCWidth = SetCCVT.getScalarSizeInBits(); unsigned WideWidth = WideVT.getScalarSizeInBits(); bool IsSigned = isSignedIntSetCC(CC); auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD; if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() && SetCCWidth != 1 && SetCCWidth < WideWidth && TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) && TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) { // Both compare operands can be widened for free. The LHS can use an // extended load, and the RHS is a constant: // vselect (ext (setcc load(X), C)), N1, N2 --> // vselect (setcc extload(X), C'), N1, N2 auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS); SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS); EVT WideSetCCVT = getSetCCResultType(WideVT); SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC); return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2); } } } if (SimplifySelectOps(N, N1, N2)) return SDValue(N, 0); // Don't revisit N. // Fold (vselect (build_vector all_ones), N1, N2) -> N1 if (ISD::isBuildVectorAllOnes(N0.getNode())) return N1; // Fold (vselect (build_vector all_zeros), N1, N2) -> N2 if (ISD::isBuildVectorAllZeros(N0.getNode())) return N2; // The ConvertSelectToConcatVector function is assuming both the above // checks for (vselect (build_vector all{ones,zeros) ...) have been made // and addressed. if (N1.getOpcode() == ISD::CONCAT_VECTORS && N2.getOpcode() == ISD::CONCAT_VECTORS && ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { if (SDValue CV = ConvertSelectToConcatVector(N, DAG)) return CV; } if (SDValue V = foldVSelectOfConstants(N)) return V; return SDValue(); } SDValue DAGCombiner::visitSELECT_CC(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); SDValue N3 = N->getOperand(3); SDValue N4 = N->getOperand(4); ISD::CondCode CC = cast(N4)->get(); // fold select_cc lhs, rhs, x, x, cc -> x if (N2 == N3) return N2; // Determine if the condition we're dealing with is constant if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1, CC, SDLoc(N), false)) { AddToWorklist(SCC.getNode()); if (ConstantSDNode *SCCC = dyn_cast(SCC.getNode())) { if (!SCCC->isNullValue()) return N2; // cond always true -> true val else return N3; // cond always false -> false val } else if (SCC->isUndef()) { // When the condition is UNDEF, just return the first operand. This is // coherent the DAG creation, no setcc node is created in this case return N2; } else if (SCC.getOpcode() == ISD::SETCC) { // Fold to a simpler select_cc return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0), SCC.getOperand(1), N2, N3, SCC.getOperand(2)); } } // If we can fold this based on the true/false value, do so. if (SimplifySelectOps(N, N2, N3)) return SDValue(N, 0); // Don't revisit N. // fold select_cc into other things, such as min/max/abs return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC); } SDValue DAGCombiner::visitSETCC(SDNode *N) { // setcc is very commonly used as an argument to brcond. This pattern // also lend itself to numerous combines and, as a result, it is desired // we keep the argument to a brcond as a setcc as much as possible. bool PreferSetCC = N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BRCOND; SDValue Combined = SimplifySetCC( N->getValueType(0), N->getOperand(0), N->getOperand(1), cast(N->getOperand(2))->get(), SDLoc(N), !PreferSetCC); if (!Combined) return SDValue(); // If we prefer to have a setcc, and we don't, we'll try our best to // recreate one using rebuildSetCC. if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) { SDValue NewSetCC = rebuildSetCC(Combined); // We don't have anything interesting to combine to. if (NewSetCC.getNode() == N) return SDValue(); if (NewSetCC) return NewSetCC; } return Combined; } SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) { SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Carry = N->getOperand(2); SDValue Cond = N->getOperand(3); // If Carry is false, fold to a regular SETCC. if (isNullConstant(Carry)) return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond); return SDValue(); } /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or /// a build_vector of constants. /// This function is called by the DAGCombiner when visiting sext/zext/aext /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND). /// Vector extends are not folded if operations are legal; this is to /// avoid introducing illegal build_vector dag nodes. static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI, SelectionDAG &DAG, bool LegalTypes, bool LegalOperations) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) && "Expected EXTEND dag node in input!"); // fold (sext c1) -> c1 // fold (zext c1) -> c1 // fold (aext c1) -> c1 if (isa(N0)) return DAG.getNode(Opcode, SDLoc(N), VT, N0).getNode(); // fold (sext (build_vector AllConstants) -> (build_vector AllConstants) // fold (zext (build_vector AllConstants) -> (build_vector AllConstants) // fold (aext (build_vector AllConstants) -> (build_vector AllConstants) EVT SVT = VT.getScalarType(); if (!(VT.isVector() && (!LegalTypes || (!LegalOperations && TLI.isTypeLegal(SVT))) && ISD::isBuildVectorOfConstantSDNodes(N0.getNode()))) return nullptr; // We can fold this node into a build_vector. unsigned VTBits = SVT.getSizeInBits(); unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits(); SmallVector Elts; unsigned NumElts = VT.getVectorNumElements(); SDLoc DL(N); for (unsigned i=0; i != NumElts; ++i) { SDValue Op = N0->getOperand(i); if (Op->isUndef()) { Elts.push_back(DAG.getUNDEF(SVT)); continue; } SDLoc DL(Op); // Get the constant value and if needed trunc it to the size of the type. // Nodes like build_vector might have constants wider than the scalar type. APInt C = cast(Op)->getAPIntValue().zextOrTrunc(EVTBits); if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG) Elts.push_back(DAG.getConstant(C.sext(VTBits), DL, SVT)); else Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT)); } return DAG.getBuildVector(VT, DL, Elts).getNode(); } // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this: // "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))" // transformation. Returns true if extension are possible and the above // mentioned transformation is profitable. static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0, unsigned ExtOpc, SmallVectorImpl &ExtendNodes, const TargetLowering &TLI) { bool HasCopyToRegUses = false; bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType()); for (SDNode::use_iterator UI = N0.getNode()->use_begin(), UE = N0.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User == N) continue; if (UI.getUse().getResNo() != N0.getResNo()) continue; // FIXME: Only extend SETCC N, N and SETCC N, c for now. if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) { ISD::CondCode CC = cast(User->getOperand(2))->get(); if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC)) // Sign bits will be lost after a zext. return false; bool Add = false; for (unsigned i = 0; i != 2; ++i) { SDValue UseOp = User->getOperand(i); if (UseOp == N0) continue; if (!isa(UseOp)) return false; Add = true; } if (Add) ExtendNodes.push_back(User); continue; } // If truncates aren't free and there are users we can't // extend, it isn't worthwhile. if (!isTruncFree) return false; // Remember if this value is live-out. if (User->getOpcode() == ISD::CopyToReg) HasCopyToRegUses = true; } if (HasCopyToRegUses) { bool BothLiveOut = false; for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; ++UI) { SDUse &Use = UI.getUse(); if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) { BothLiveOut = true; break; } } if (BothLiveOut) // Both unextended and extended values are live out. There had better be // a good reason for the transformation. return ExtendNodes.size(); } return true; } void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl &SetCCs, SDValue OrigLoad, SDValue ExtLoad, ISD::NodeType ExtType) { // Extend SetCC uses if necessary. SDLoc DL(ExtLoad); for (SDNode *SetCC : SetCCs) { SmallVector Ops; for (unsigned j = 0; j != 2; ++j) { SDValue SOp = SetCC->getOperand(j); if (SOp == OrigLoad) Ops.push_back(ExtLoad); else Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp)); } Ops.push_back(SetCC->getOperand(2)); CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops)); } } // FIXME: Bring more similar combines here, common to sext/zext (maybe aext?). SDValue DAGCombiner::CombineExtLoad(SDNode *N) { SDValue N0 = N->getOperand(0); EVT DstVT = N->getValueType(0); EVT SrcVT = N0.getValueType(); assert((N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) && "Unexpected node type (not an extend)!"); // fold (sext (load x)) to multiple smaller sextloads; same for zext. // For example, on a target with legal v4i32, but illegal v8i32, turn: // (v8i32 (sext (v8i16 (load x)))) // into: // (v8i32 (concat_vectors (v4i32 (sextload x)), // (v4i32 (sextload (x + 16))))) // Where uses of the original load, i.e.: // (v8i16 (load x)) // are replaced with: // (v8i16 (truncate // (v8i32 (concat_vectors (v4i32 (sextload x)), // (v4i32 (sextload (x + 16))))))) // // This combine is only applicable to illegal, but splittable, vectors. // All legal types, and illegal non-vector types, are handled elsewhere. // This combine is controlled by TargetLowering::isVectorLoadExtDesirable. // if (N0->getOpcode() != ISD::LOAD) return SDValue(); LoadSDNode *LN0 = cast(N0); if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) || !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() || !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0))) return SDValue(); SmallVector SetCCs; if (!ExtendUsesToFormExtLoad(DstVT, N, N0, N->getOpcode(), SetCCs, TLI)) return SDValue(); ISD::LoadExtType ExtType = N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD; // Try to split the vector types to get down to legal types. EVT SplitSrcVT = SrcVT; EVT SplitDstVT = DstVT; while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) && SplitSrcVT.getVectorNumElements() > 1) { SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first; SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first; } if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT)) return SDValue(); SDLoc DL(N); const unsigned NumSplits = DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements(); const unsigned Stride = SplitSrcVT.getStoreSize(); SmallVector Loads; SmallVector Chains; SDValue BasePtr = LN0->getBasePtr(); for (unsigned Idx = 0; Idx < NumSplits; Idx++) { const unsigned Offset = Idx * Stride; const unsigned Align = MinAlign(LN0->getAlignment(), Offset); SDValue SplitLoad = DAG.getExtLoad( ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr, LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, DAG.getConstant(Stride, DL, BasePtr.getValueType())); Loads.push_back(SplitLoad.getValue(0)); Chains.push_back(SplitLoad.getValue(1)); } SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads); // Simplify TF. AddToWorklist(NewChain.getNode()); CombineTo(N, NewValue); // Replace uses of the original load (before extension) // with a truncate of the concatenated sextloaded vectors. SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue); ExtendSetCCUses(SetCCs, N0, NewValue, (ISD::NodeType)N->getOpcode()); CombineTo(N0.getNode(), Trunc, NewChain); return SDValue(N, 0); // Return N so it doesn't get rechecked! } // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) -> // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst)) SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) { assert(N->getOpcode() == ISD::ZERO_EXTEND); EVT VT = N->getValueType(0); // and/or/xor SDValue N0 = N->getOperand(0); if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::XOR) || N0.getOperand(1).getOpcode() != ISD::Constant || (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))) return SDValue(); // shl/shr SDValue N1 = N0->getOperand(0); if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) || N1.getOperand(1).getOpcode() != ISD::Constant || (LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT))) return SDValue(); // load if (!isa(N1.getOperand(0))) return SDValue(); LoadSDNode *Load = cast(N1.getOperand(0)); EVT MemVT = Load->getMemoryVT(); if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) || Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed()) return SDValue(); // If the shift op is SHL, the logic op must be AND, otherwise the result // will be wrong. if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND) return SDValue(); if (!N0.hasOneUse() || !N1.hasOneUse()) return SDValue(); SmallVector SetCCs; if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0), ISD::ZERO_EXTEND, SetCCs, TLI)) return SDValue(); // Actually do the transformation. SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT, Load->getChain(), Load->getBasePtr(), Load->getMemoryVT(), Load->getMemOperand()); SDLoc DL1(N1); SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad, N1.getOperand(1)); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); SDLoc DL0(N0); SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift, DAG.getConstant(Mask, DL0, VT)); ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, ISD::ZERO_EXTEND); CombineTo(N, And); if (SDValue(Load, 0).hasOneUse()) { DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1)); } else { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load), Load->getValueType(0), ExtLoad); CombineTo(Load, Trunc, ExtLoad.getValue(1)); } return SDValue(N,0); // Return N so it doesn't get rechecked! } /// If we're narrowing or widening the result of a vector select and the final /// size is the same size as a setcc (compare) feeding the select, then try to /// apply the cast operation to the select's operands because matching vector /// sizes for a select condition and other operands should be more efficient. SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) { unsigned CastOpcode = Cast->getOpcode(); assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND || CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND || CastOpcode == ISD::FP_ROUND) && "Unexpected opcode for vector select narrowing/widening"); // We only do this transform before legal ops because the pattern may be // obfuscated by target-specific operations after legalization. Do not create // an illegal select op, however, because that may be difficult to lower. EVT VT = Cast->getValueType(0); if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) return SDValue(); SDValue VSel = Cast->getOperand(0); if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() || VSel.getOperand(0).getOpcode() != ISD::SETCC) return SDValue(); // Does the setcc have the same vector size as the casted select? SDValue SetCC = VSel.getOperand(0); EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType()); if (SetCCVT.getSizeInBits() != VT.getSizeInBits()) return SDValue(); // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B) SDValue A = VSel.getOperand(1); SDValue B = VSel.getOperand(2); SDValue CastA, CastB; SDLoc DL(Cast); if (CastOpcode == ISD::FP_ROUND) { // FP_ROUND (fptrunc) has an extra flag operand to pass along. CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1)); CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1)); } else { CastA = DAG.getNode(CastOpcode, DL, VT, A); CastB = DAG.getNode(CastOpcode, DL, VT, B); } return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB); } // fold ([s|z]ext ([s|z]extload x)) -> ([s|z]ext (truncate ([s|z]extload x))) // fold ([s|z]ext ( extload x)) -> ([s|z]ext (truncate ([s|z]extload x))) static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner, const TargetLowering &TLI, EVT VT, bool LegalOperations, SDNode *N, SDValue N0, ISD::LoadExtType ExtLoadType) { SDNode *N0Node = N0.getNode(); bool isAExtLoad = (ExtLoadType == ISD::SEXTLOAD) ? ISD::isSEXTLoad(N0Node) : ISD::isZEXTLoad(N0Node); if ((!isAExtLoad && !ISD::isEXTLoad(N0Node)) || !ISD::isUNINDEXEDLoad(N0Node) || !N0.hasOneUse()) return {}; LoadSDNode *LN0 = cast(N0); EVT MemVT = LN0->getMemoryVT(); if ((LegalOperations || LN0->isVolatile()) && !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT)) return {}; SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); Combiner.CombineTo(N, ExtLoad); DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } // fold ([s|z]ext (load x)) -> ([s|z]ext (truncate ([s|z]extload x))) // Only generate vector extloads when 1) they're legal, and 2) they are // deemed desirable by the target. static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner, const TargetLowering &TLI, EVT VT, bool LegalOperations, SDNode *N, SDValue N0, ISD::LoadExtType ExtLoadType, ISD::NodeType ExtOpc) { if (!ISD::isNON_EXTLoad(N0.getNode()) || !ISD::isUNINDEXEDLoad(N0.getNode()) || ((LegalOperations || VT.isVector() || cast(N0)->isVolatile()) && !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType()))) return {}; bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ExtOpc, SetCCs, TLI); if (VT.isVector()) DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0)); if (!DoXform) return {}; LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); Combiner.ExtendSetCCUses(SetCCs, N0, ExtLoad, ExtOpc); // If the load value is used only by N, replace it via CombineTo N. bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse(); Combiner.CombineTo(N, ExtLoad); if (NoReplaceTrunc) { DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); } else { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); Combiner.CombineTo(LN0, Trunc, ExtLoad.getValue(1)); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG, bool LegalOperations) { assert((N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) && "Expected sext or zext"); SDValue SetCC = N->getOperand(0); if (LegalOperations || SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse() || SetCC.getValueType() != MVT::i1) return SDValue(); SDValue X = SetCC.getOperand(0); SDValue Ones = SetCC.getOperand(1); ISD::CondCode CC = cast(SetCC.getOperand(2))->get(); EVT VT = N->getValueType(0); EVT XVT = X.getValueType(); // setge X, C is canonicalized to setgt, so we do not need to match that // pattern. The setlt sibling is folded in SimplifySelectCC() because it does // not require the 'not' op. if (CC == ISD::SETGT && isAllOnesConstant(Ones) && VT == XVT) { // Invert and smear/shift the sign bit: // sext i1 (setgt iN X, -1) --> sra (not X), (N - 1) // zext i1 (setgt iN X, -1) --> srl (not X), (N - 1) SDLoc DL(N); SDValue NotX = DAG.getNOT(DL, X, VT); SDValue ShiftAmount = DAG.getConstant(VT.getSizeInBits() - 1, DL, VT); auto ShiftOpcode = N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SRA : ISD::SRL; return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount); } return SDValue(); } SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); SDLoc DL(N); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); // fold (sext (sext x)) -> (sext x) // fold (sext (aext x)) -> (sext x) if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0)); if (N0.getOpcode() == ISD::TRUNCATE) { // fold (sext (truncate (load x))) -> (sext (smaller load x)) // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n))) if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode *oye = N0.getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } // See if the value being truncated is already sign extended. If so, just // eliminate the trunc/sext pair. SDValue Op = N0.getOperand(0); unsigned OpBits = Op.getScalarValueSizeInBits(); unsigned MidBits = N0.getScalarValueSizeInBits(); unsigned DestBits = VT.getScalarSizeInBits(); unsigned NumSignBits = DAG.ComputeNumSignBits(Op); if (OpBits == DestBits) { // Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign // bits, it is already ready. if (NumSignBits > DestBits-MidBits) return Op; } else if (OpBits < DestBits) { // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign // bits, just sext from i32. if (NumSignBits > OpBits-MidBits) return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op); } else { // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign // bits, just truncate to i32. if (NumSignBits > OpBits-MidBits) return DAG.getNode(ISD::TRUNCATE, DL, VT, Op); } // fold (sext (truncate x)) -> (sextinreg x). if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, N0.getValueType())) { if (OpBits < DestBits) Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op); else if (OpBits > DestBits) Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op); return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op, DAG.getValueType(N0.getValueType())); } } // Try to simplify (sext (load x)). if (SDValue foldedExt = tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD, ISD::SIGN_EXTEND)) return foldedExt; // fold (sext (load x)) to multiple smaller sextloads. // Only on illegal but splittable vectors. if (SDValue ExtLoad = CombineExtLoad(N)) return ExtLoad; // Try to simplify (sext (sextload x)). if (SDValue foldedExt = tryToFoldExtOfExtload( DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD)) return foldedExt; // fold (sext (and/or/xor (load x), cst)) -> // (and/or/xor (sextload x), (sext cst)) if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::XOR) && isa(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN00 = cast(N0.getOperand(0)); EVT MemVT = LN00->getMemoryVT(); if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) && LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) { SmallVector SetCCs; bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0), ISD::SIGN_EXTEND, SetCCs, TLI); if (DoXform) { SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN00), VT, LN00->getChain(), LN00->getBasePtr(), LN00->getMemoryVT(), LN00->getMemOperand()); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.sext(VT.getSizeInBits()); SDValue And = DAG.getNode(N0.getOpcode(), DL, VT, ExtLoad, DAG.getConstant(Mask, DL, VT)); ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND); bool NoReplaceTruncAnd = !N0.hasOneUse(); bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse(); CombineTo(N, And); // If N0 has multiple uses, change other uses as well. if (NoReplaceTruncAnd) { SDValue TruncAnd = DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And); CombineTo(N0.getNode(), TruncAnd); } if (NoReplaceTrunc) { DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1)); } else { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00), LN00->getValueType(0), ExtLoad); CombineTo(LN00, Trunc, ExtLoad.getValue(1)); } return SDValue(N,0); // Return N so it doesn't get rechecked! } } } if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations)) return V; if (N0.getOpcode() == ISD::SETCC) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); ISD::CondCode CC = cast(N0.getOperand(2))->get(); EVT N00VT = N0.getOperand(0).getValueType(); // sext(setcc) -> sext_in_reg(vsetcc) for vectors. // Only do this before legalize for now. if (VT.isVector() && !LegalOperations && TLI.getBooleanContents(N00VT) == TargetLowering::ZeroOrNegativeOneBooleanContent) { // On some architectures (such as SSE/NEON/etc) the SETCC result type is // of the same size as the compared operands. Only optimize sext(setcc()) // if this is the case. EVT SVT = getSetCCResultType(N00VT); // We know that the # elements of the results is the same as the // # elements of the compare (and the # elements of the compare result // for that matter). Check to see that they are the same size. If so, // we know that the element size of the sext'd result matches the // element size of the compare operands. if (VT.getSizeInBits() == SVT.getSizeInBits()) return DAG.getSetCC(DL, VT, N00, N01, CC); // If the desired elements are smaller or larger than the source // elements, we can use a matching integer vector type and then // truncate/sign extend. EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); if (SVT == MatchingVecType) { SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC); return DAG.getSExtOrTrunc(VsetCC, DL, VT); } } // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) // Here, T can be 1 or -1, depending on the type of the setcc and // getBooleanContents(). unsigned SetCCWidth = N0.getScalarValueSizeInBits(); // To determine the "true" side of the select, we need to know the high bit // of the value returned by the setcc if it evaluates to true. // If the type of the setcc is i1, then the true case of the select is just // sext(i1 1), that is, -1. // If the type of the setcc is larger (say, i8) then the value of the high // bit depends on getBooleanContents(), so ask TLI for a real "true" value // of the appropriate width. SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT) : DAG.getBoolConstant(true, DL, VT, N00VT); SDValue Zero = DAG.getConstant(0, DL, VT); if (SDValue SCC = SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true)) return SCC; if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) { EVT SetCCVT = getSetCCResultType(N00VT); // Don't do this transform for i1 because there's a select transform // that would reverse it. // TODO: We should not do this transform at all without a target hook // because a sext is likely cheaper than a select? if (SetCCVT.getScalarSizeInBits() != 1 && (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) { SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC); return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero); } } } // fold (sext x) -> (zext x) if the sign bit is known zero. if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) && DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0); if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; return SDValue(); } // isTruncateOf - If N is a truncate of some other value, return true, record // the value being truncated in Op and which of Op's bits are zero/one in Known. // This function computes KnownBits to avoid a duplicated call to // computeKnownBits in the caller. static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op, KnownBits &Known) { if (N->getOpcode() == ISD::TRUNCATE) { Op = N->getOperand(0); DAG.computeKnownBits(Op, Known); return true; } if (N->getOpcode() != ISD::SETCC || N->getValueType(0) != MVT::i1 || cast(N->getOperand(2))->get() != ISD::SETNE) return false; SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); assert(Op0.getValueType() == Op1.getValueType()); if (isNullConstant(Op0)) Op = Op1; else if (isNullConstant(Op1)) Op = Op0; else return false; DAG.computeKnownBits(Op, Known); if (!(Known.Zero | 1).isAllOnesValue()) return false; return true; } SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); // fold (zext (zext x)) -> (zext x) // fold (zext (aext x)) -> (zext x) if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0)); // fold (zext (truncate x)) -> (zext x) or // (zext (truncate x)) -> (truncate x) // This is valid when the truncated bits of x are already zero. // FIXME: We should extend this to work for vectors too. SDValue Op; KnownBits Known; if (!VT.isVector() && isTruncateOf(DAG, N0, Op, Known)) { APInt TruncatedBits = (Op.getValueSizeInBits() == N0.getValueSizeInBits()) ? APInt(Op.getValueSizeInBits(), 0) : APInt::getBitsSet(Op.getValueSizeInBits(), N0.getValueSizeInBits(), std::min(Op.getValueSizeInBits(), VT.getSizeInBits())); if (TruncatedBits.isSubsetOf(Known.Zero)) return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); } // fold (zext (truncate x)) -> (and x, mask) if (N0.getOpcode() == ISD::TRUNCATE) { // fold (zext (truncate (load x))) -> (zext (smaller load x)) // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n))) if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode *oye = N0.getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } EVT SrcVT = N0.getOperand(0).getValueType(); EVT MinVT = N0.getValueType(); // Try to mask before the extension to avoid having to generate a larger mask, // possibly over several sub-vectors. if (SrcVT.bitsLT(VT) && VT.isVector()) { if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) && TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) { SDValue Op = N0.getOperand(0); Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); AddToWorklist(Op.getNode()); SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT); // Transfer the debug info; the new node is equivalent to N0. DAG.transferDbgValues(N0, ZExtOrTrunc); return ZExtOrTrunc; } } if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) { SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); AddToWorklist(Op.getNode()); SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); // We may safely transfer the debug info describing the truncate node over // to the equivalent and operation. DAG.transferDbgValues(N0, And); return And; } } // Fold (zext (and (trunc x), cst)) -> (and x, cst), // if either of the casts is not free. if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::TRUNCATE && N0.getOperand(1).getOpcode() == ISD::Constant && (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(), N0.getValueType()) || !TLI.isZExtFree(N0.getValueType(), VT))) { SDValue X = N0.getOperand(0).getOperand(0); X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); SDLoc DL(N); return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(Mask, DL, VT)); } // Try to simplify (zext (load x)). if (SDValue foldedExt = tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD, ISD::ZERO_EXTEND)) return foldedExt; // fold (zext (load x)) to multiple smaller zextloads. // Only on illegal but splittable vectors. if (SDValue ExtLoad = CombineExtLoad(N)) return ExtLoad; // fold (zext (and/or/xor (load x), cst)) -> // (and/or/xor (zextload x), (zext cst)) // Unless (and (load x) cst) will match as a zextload already and has // additional users. if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::XOR) && isa(N0.getOperand(0)) && N0.getOperand(1).getOpcode() == ISD::Constant && (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) { LoadSDNode *LN00 = cast(N0.getOperand(0)); EVT MemVT = LN00->getMemoryVT(); if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) && LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) { if (N0.getOpcode() == ISD::AND) { auto *AndC = cast(N0.getOperand(1)); EVT LoadResultTy = AndC->getValueType(0); EVT ExtVT; if (isAndLoadExtLoad(AndC, LN00, LoadResultTy, ExtVT)) DoXform = false; } } if (DoXform) DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0), ISD::ZERO_EXTEND, SetCCs, TLI); if (DoXform) { SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN00), VT, LN00->getChain(), LN00->getBasePtr(), LN00->getMemoryVT(), LN00->getMemOperand()); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); SDLoc DL(N); SDValue And = DAG.getNode(N0.getOpcode(), DL, VT, ExtLoad, DAG.getConstant(Mask, DL, VT)); ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND); bool NoReplaceTruncAnd = !N0.hasOneUse(); bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse(); CombineTo(N, And); // If N0 has multiple uses, change other uses as well. if (NoReplaceTruncAnd) { SDValue TruncAnd = DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And); CombineTo(N0.getNode(), TruncAnd); } if (NoReplaceTrunc) { DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1)); } else { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00), LN00->getValueType(0), ExtLoad); CombineTo(LN00, Trunc, ExtLoad.getValue(1)); } return SDValue(N,0); // Return N so it doesn't get rechecked! } } } // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) -> // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst)) if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N)) return ZExtLoad; // Try to simplify (zext (zextload x)). if (SDValue foldedExt = tryToFoldExtOfExtload( DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD)) return foldedExt; if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations)) return V; if (N0.getOpcode() == ISD::SETCC) { // Only do this before legalize for now. if (!LegalOperations && VT.isVector() && N0.getValueType().getVectorElementType() == MVT::i1) { EVT N00VT = N0.getOperand(0).getValueType(); if (getSetCCResultType(N00VT) == N0.getValueType()) return SDValue(); // We know that the # elements of the results is the same as the # // elements of the compare (and the # elements of the compare result for // that matter). Check to see that they are the same size. If so, we know // that the element size of the sext'd result matches the element size of // the compare operands. SDLoc DL(N); SDValue VecOnes = DAG.getConstant(1, DL, VT); if (VT.getSizeInBits() == N00VT.getSizeInBits()) { // zext(setcc) -> (and (vsetcc), (1, 1, ...) for vectors. SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0), N0.getOperand(1), N0.getOperand(2)); return DAG.getNode(ISD::AND, DL, VT, VSetCC, VecOnes); } // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/sign extend. EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger(); SDValue VsetCC = DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0), N0.getOperand(1), N0.getOperand(2)); return DAG.getNode(ISD::AND, DL, VT, DAG.getSExtOrTrunc(VsetCC, DL, VT), VecOnes); } // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc SDLoc DL(N); if (SDValue SCC = SimplifySelectCC( DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT), cast(N0.getOperand(2))->get(), true)) return SCC; } // (zext (shl (zext x), cst)) -> (shl (zext x), cst) if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) && isa(N0.getOperand(1)) && N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse()) { SDValue ShAmt = N0.getOperand(1); unsigned ShAmtVal = cast(ShAmt)->getZExtValue(); if (N0.getOpcode() == ISD::SHL) { SDValue InnerZExt = N0.getOperand(0); // If the original shl may be shifting out bits, do not perform this // transformation. unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() - InnerZExt.getOperand(0).getValueSizeInBits(); if (ShAmtVal > KnownZeroBits) return SDValue(); } SDLoc DL(N); // Ensure that the shift amount is wide enough for the shifted value. if (VT.getSizeInBits() >= 256) ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt); return DAG.getNode(N0.getOpcode(), DL, VT, DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)), ShAmt); } if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; return SDValue(); } SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); // fold (aext (aext x)) -> (aext x) // fold (aext (zext x)) -> (zext x) // fold (aext (sext x)) -> (sext x) if (N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND) return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); // fold (aext (truncate (load x))) -> (aext (smaller load x)) // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n))) if (N0.getOpcode() == ISD::TRUNCATE) { if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) { SDNode *oye = N0.getOperand(0).getNode(); if (NarrowLoad.getNode() != N0.getNode()) { CombineTo(N0.getNode(), NarrowLoad); // CombineTo deleted the truncate, if needed, but not what's under it. AddToWorklist(oye); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (aext (truncate x)) if (N0.getOpcode() == ISD::TRUNCATE) return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); // Fold (aext (and (trunc x), cst)) -> (and x, cst) // if the trunc is not free. if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::TRUNCATE && N0.getOperand(1).getOpcode() == ISD::Constant && !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(), N0.getValueType())) { SDLoc DL(N); SDValue X = N0.getOperand(0).getOperand(0); X = DAG.getAnyExtOrTrunc(X, DL, VT); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(Mask, DL, VT)); } // fold (aext (load x)) -> (aext (truncate (extload x))) // None of the supported targets knows how to perform load and any_ext // on vectors in one instruction. We only perform this transformation on // scalars. if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() && ISD::isUNINDEXEDLoad(N0.getNode()) && TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { bool DoXform = true; SmallVector SetCCs; if (!N0.hasOneUse()) DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, TLI); if (DoXform) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND); // If the load value is used only by N, replace it via CombineTo N. bool NoReplaceTrunc = N0.hasOneUse(); CombineTo(N, ExtLoad); if (NoReplaceTrunc) { DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); } else { SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad); CombineTo(LN0, Trunc, ExtLoad.getValue(1)); } return SDValue(N, 0); // Return N so it doesn't get rechecked! } } // fold (aext (zextload x)) -> (aext (truncate (zextload x))) // fold (aext (sextload x)) -> (aext (truncate (sextload x))) // fold (aext ( extload x)) -> (aext (truncate (extload x))) if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) { LoadSDNode *LN0 = cast(N0); ISD::LoadExtType ExtType = LN0->getExtensionType(); EVT MemVT = LN0->getMemoryVT(); if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) { SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), MemVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } if (N0.getOpcode() == ISD::SETCC) { // For vectors: // aext(setcc) -> vsetcc // aext(setcc) -> truncate(vsetcc) // aext(setcc) -> aext(vsetcc) // Only do this before legalize for now. if (VT.isVector() && !LegalOperations) { EVT N00VT = N0.getOperand(0).getValueType(); if (getSetCCResultType(N00VT) == N0.getValueType()) return SDValue(); // We know that the # elements of the results is the same as the // # elements of the compare (and the # elements of the compare result // for that matter). Check to see that they are the same size. If so, // we know that the element size of the sext'd result matches the // element size of the compare operands. if (VT.getSizeInBits() == N00VT.getSizeInBits()) return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); // If the desired elements are smaller or larger than the source // elements we can use a matching integer vector type and then // truncate/any extend else { EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger(); SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT); } } // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc SDLoc DL(N); if (SDValue SCC = SimplifySelectCC( DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT), cast(N0.getOperand(2))->get(), true)) return SCC; } return SDValue(); } SDValue DAGCombiner::visitAssertExt(SDNode *N) { unsigned Opcode = N->getOpcode(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT AssertVT = cast(N1)->getVT(); // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt) if (N0.getOpcode() == Opcode && AssertVT == cast(N0.getOperand(1))->getVT()) return N0; if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() && N0.getOperand(0).getOpcode() == Opcode) { // We have an assert, truncate, assert sandwich. Make one stronger assert // by asserting on the smallest asserted type to the larger source type. // This eliminates the later assert: // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN SDValue BigA = N0.getOperand(0); EVT BigA_AssertVT = cast(BigA.getOperand(1))->getVT(); assert(BigA_AssertVT.bitsLE(N0.getValueType()) && "Asserting zero/sign-extended bits to a type larger than the " "truncated destination does not provide information"); SDLoc DL(N); EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT; SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT); SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(), BigA.getOperand(0), MinAssertVTVal); return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert); } return SDValue(); } /// If the result of a wider load is shifted to right of N bits and then /// truncated to a narrower type and where N is a multiple of number of bits of /// the narrower type, transform it to a narrower load from address + N / num of /// bits of new type. Also narrow the load if the result is masked with an AND /// to effectively produce a smaller type. If the result is to be extended, also /// fold the extension to form a extending load. SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) { unsigned Opc = N->getOpcode(); ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT ExtVT = VT; // This transformation isn't valid for vector loads. if (VT.isVector()) return SDValue(); // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then // extended to VT. if (Opc == ISD::SIGN_EXTEND_INREG) { ExtType = ISD::SEXTLOAD; ExtVT = cast(N->getOperand(1))->getVT(); } else if (Opc == ISD::SRL) { // Another special-case: SRL is basically zero-extending a narrower value, // or it maybe shifting a higher subword, half or byte into the lowest // bits. ExtType = ISD::ZEXTLOAD; N0 = SDValue(N, 0); auto *LN0 = dyn_cast(N0.getOperand(0)); auto *N01 = dyn_cast(N0.getOperand(1)); if (!N01 || !LN0) return SDValue(); uint64_t ShiftAmt = N01->getZExtValue(); uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits(); if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt) ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt); else ExtVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() - ShiftAmt); } else if (Opc == ISD::AND) { // An AND with a constant mask is the same as a truncate + zero-extend. auto AndC = dyn_cast(N->getOperand(1)); if (!AndC || !AndC->getAPIntValue().isMask()) return SDValue(); unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes(); ExtType = ISD::ZEXTLOAD; ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits); } unsigned ShAmt = 0; if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) { SDValue SRL = N0; if (auto *ConstShift = dyn_cast(SRL.getOperand(1))) { ShAmt = ConstShift->getZExtValue(); unsigned EVTBits = ExtVT.getSizeInBits(); // Is the shift amount a multiple of size of VT? if ((ShAmt & (EVTBits-1)) == 0) { N0 = N0.getOperand(0); // Is the load width a multiple of size of VT? if ((N0.getValueSizeInBits() & (EVTBits-1)) != 0) return SDValue(); } // At this point, we must have a load or else we can't do the transform. if (!isa(N0)) return SDValue(); auto *LN0 = cast(N0); // Because a SRL must be assumed to *need* to zero-extend the high bits // (as opposed to anyext the high bits), we can't combine the zextload // lowering of SRL and an sextload. if (LN0->getExtensionType() == ISD::SEXTLOAD) return SDValue(); // If the shift amount is larger than the input type then we're not // accessing any of the loaded bytes. If the load was a zextload/extload // then the result of the shift+trunc is zero/undef (handled elsewhere). if (ShAmt >= LN0->getMemoryVT().getSizeInBits()) return SDValue(); // If the SRL is only used by a masking AND, we may be able to adjust // the ExtVT to make the AND redundant. SDNode *Mask = *(SRL->use_begin()); if (Mask->getOpcode() == ISD::AND && isa(Mask->getOperand(1))) { const APInt &ShiftMask = cast(Mask->getOperand(1))->getAPIntValue(); if (ShiftMask.isMask()) { EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(), ShiftMask.countTrailingOnes()); // If the mask is smaller, recompute the type. if ((ExtVT.getSizeInBits() > MaskedVT.getSizeInBits()) && TLI.isLoadExtLegal(ExtType, N0.getValueType(), MaskedVT)) ExtVT = MaskedVT; } } } } // If the load is shifted left (and the result isn't shifted back right), // we can fold the truncate through the shift. unsigned ShLeftAmt = 0; if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() && ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) { if (ConstantSDNode *N01 = dyn_cast(N0.getOperand(1))) { ShLeftAmt = N01->getZExtValue(); N0 = N0.getOperand(0); } } // If we haven't found a load, we can't narrow it. if (!isa(N0)) return SDValue(); LoadSDNode *LN0 = cast(N0); if (!isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt)) return SDValue(); // For big endian targets, we need to adjust the offset to the pointer to // load the correct bytes. if (DAG.getDataLayout().isBigEndian()) { unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits(); unsigned EVTStoreBits = ExtVT.getStoreSizeInBits(); ShAmt = LVTStoreBits - EVTStoreBits - ShAmt; } EVT PtrType = N0.getOperand(1).getValueType(); uint64_t PtrOff = ShAmt / 8; unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff); SDLoc DL(LN0); // The original load itself didn't wrap, so an offset within it doesn't. SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); SDValue NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, LN0->getBasePtr(), DAG.getConstant(PtrOff, DL, PtrType), Flags); AddToWorklist(NewPtr.getNode()); SDValue Load; if (ExtType == ISD::NON_EXTLOAD) Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); else Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(), NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT, NewAlign, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); // Replace the old load's chain with the new load's chain. WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); // Shift the result left, if we've swallowed a left shift. SDValue Result = Load; if (ShLeftAmt != 0) { EVT ShImmTy = getShiftAmountTy(Result.getValueType()); if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt)) ShImmTy = VT; // If the shift amount is as large as the result size (but, presumably, // no larger than the source) then the useful bits of the result are // zero; we can't simply return the shortened shift, because the result // of that operation is undefined. SDLoc DL(N0); if (ShLeftAmt >= VT.getSizeInBits()) Result = DAG.getConstant(0, DL, VT); else Result = DAG.getNode(ISD::SHL, DL, VT, Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy)); } // Return the new loaded value. return Result; } SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); EVT EVT = cast(N1)->getVT(); unsigned VTBits = VT.getScalarSizeInBits(); unsigned EVTBits = EVT.getScalarSizeInBits(); if (N0.isUndef()) return DAG.getUNDEF(VT); // fold (sext_in_reg c1) -> c1 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1); // If the input is already sign extended, just drop the extension. if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1) return N0; // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2 if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG && EVT.bitsLT(cast(N0.getOperand(1))->getVT())) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0), N1); // fold (sext_in_reg (sext x)) -> (sext x) // fold (sext_in_reg (aext x)) -> (sext x) // if x is small enough. if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getScalarValueSizeInBits() <= EVTBits && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1); } // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x) if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG || N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG || N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) { if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)) return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT); } // fold (sext_in_reg (zext x)) -> (sext x) // iff we are extending the source sign bit. if (N0.getOpcode() == ISD::ZERO_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getScalarValueSizeInBits() == EVTBits && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1); } // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1))) return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType()); // fold operands of sext_in_reg based on knowledge that the top bits are not // demanded. if (SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // fold (sext_in_reg (load x)) -> (smaller sextload x) // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits)) if (SDValue NarrowLoad = ReduceLoadWidth(N)) return NarrowLoad; // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24) // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible. // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above. if (N0.getOpcode() == ISD::SRL) { if (ConstantSDNode *ShAmt = dyn_cast(N0.getOperand(1))) if (ShAmt->getZExtValue()+EVTBits <= VTBits) { // We can turn this into an SRA iff the input to the SRL is already sign // extended enough. unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0)); if (VTBits-(ShAmt->getZExtValue()+EVTBits) < InSignBits) return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1)); } } // fold (sext_inreg (extload x)) -> (sextload x) // If sextload is not supported by target, we can only do the combine when // load has one use. Doing otherwise can block folding the extload with other // extends that the target does support. if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && EVT == cast(N0)->getMemoryVT() && ((!LegalOperations && !cast(N0)->isVolatile() && N0.hasOneUse()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), EVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); AddToWorklist(ExtLoad.getNode()); return SDValue(N, 0); // Return N so it doesn't get rechecked! } // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse() && EVT == cast(N0)->getMemoryVT() && ((!LegalOperations && !cast(N0)->isVolatile()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), EVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16)) if (EVTBits <= 16 && N0.getOpcode() == ISD::OR) { if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), N0.getOperand(1), false)) return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1); } return SDValue(); } SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.isUndef()) return DAG.getUNDEF(VT); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); return SDValue(); } SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.isUndef()) return DAG.getUNDEF(VT); if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations)) return SDValue(Res, 0); return SDValue(); } SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); bool isLE = DAG.getDataLayout().isLittleEndian(); // noop truncate if (N0.getValueType() == N->getValueType(0)) return N0; // fold (truncate (truncate x)) -> (truncate x) if (N0.getOpcode() == ISD::TRUNCATE) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); // fold (truncate c1) -> c1 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) { SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0); if (C.getNode() != N) return C; } // fold (truncate (ext x)) -> (ext x) or (truncate x) or x if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) { // if the source is smaller than the dest, we still need an extend. if (N0.getOperand(0).getValueType().bitsLT(VT)) return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0)); // if the source is larger than the dest, than we just need the truncate. if (N0.getOperand(0).getValueType().bitsGT(VT)) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0)); // if the source and dest are the same type, we can drop both the extend // and the truncate. return N0.getOperand(0); } // If this is anyext(trunc), don't fold it, allow ourselves to be folded. if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND)) return SDValue(); // Fold extract-and-trunc into a narrow extract. For example: // i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1) // i32 y = TRUNCATE(i64 x) // -- becomes -- // v16i8 b = BITCAST (v2i64 val) // i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8) // // Note: We only run this optimization after type legalization (which often // creates this pattern) and before operation legalization after which // we need to be more careful about the vector instructions that we generate. if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) { EVT VecTy = N0.getOperand(0).getValueType(); EVT ExTy = N0.getValueType(); EVT TrTy = N->getValueType(0); unsigned NumElem = VecTy.getVectorNumElements(); unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem); assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size"); SDValue EltNo = N0->getOperand(1); if (isa(EltNo) && isTypeLegal(NVT)) { int Elt = cast(EltNo)->getZExtValue(); EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout()); int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1)); SDLoc DL(N); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy, DAG.getBitcast(NVT, N0.getOperand(0)), DAG.getConstant(Index, DL, IndexTy)); } } // trunc (select c, a, b) -> select c, (trunc a), (trunc b) if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) { EVT SrcVT = N0.getValueType(); if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) && TLI.isTruncateFree(SrcVT, VT)) { SDLoc SL(N0); SDValue Cond = N0.getOperand(0); SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2)); return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1); } } // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits() if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) && TLI.isTypeDesirableForOp(ISD::SHL, VT)) { SDValue Amt = N0.getOperand(1); KnownBits Known; DAG.computeKnownBits(Amt, Known); unsigned Size = VT.getScalarSizeInBits(); if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) { SDLoc SL(N); EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout()); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); if (AmtVT != Amt.getValueType()) { Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT); AddToWorklist(Amt.getNode()); } return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt); } } // Fold a series of buildvector, bitcast, and truncate if possible. // For example fold // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to // (2xi32 (buildvector x, y)). if (Level == AfterLegalizeVectorOps && VT.isVector() && N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR && N0.getOperand(0).hasOneUse()) { SDValue BuildVect = N0.getOperand(0); EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType(); EVT TruncVecEltTy = VT.getVectorElementType(); // Check that the element types match. if (BuildVectEltTy == TruncVecEltTy) { // Now we only need to compute the offset of the truncated elements. unsigned BuildVecNumElts = BuildVect.getNumOperands(); unsigned TruncVecNumElts = VT.getVectorNumElements(); unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts; assert((BuildVecNumElts % TruncVecNumElts) == 0 && "Invalid number of elements"); SmallVector Opnds; for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset) Opnds.push_back(BuildVect.getOperand(i)); return DAG.getBuildVector(VT, SDLoc(N), Opnds); } } // See if we can simplify the input to this truncate through knowledge that // only the low bits are being used. // For example "trunc (or (shl x, 8), y)" // -> trunc y // Currently we only perform this optimization on scalars because vectors // may have different active low bits. if (!VT.isVector()) { APInt Mask = APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits()); if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask)) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter); } // fold (truncate (load x)) -> (smaller load x) // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits)) if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) { if (SDValue Reduced = ReduceLoadWidth(N)) return Reduced; // Handle the case where the load remains an extending load even // after truncation. if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) { LoadSDNode *LN0 = cast(N0); if (!LN0->isVolatile() && LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) { SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0), VT, LN0->getChain(), LN0->getBasePtr(), LN0->getMemoryVT(), LN0->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1)); return NewLoad; } } } // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)), // where ... are all 'undef'. if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) { SmallVector VTs; SDValue V; unsigned Idx = 0; unsigned NumDefs = 0; for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) { SDValue X = N0.getOperand(i); if (!X.isUndef()) { V = X; Idx = i; NumDefs++; } // Stop if more than one members are non-undef. if (NumDefs > 1) break; VTs.push_back(EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), X.getValueType().getVectorNumElements())); } if (NumDefs == 0) return DAG.getUNDEF(VT); if (NumDefs == 1) { assert(V.getNode() && "The single defined operand is empty!"); SmallVector Opnds; for (unsigned i = 0, e = VTs.size(); i != e; ++i) { if (i != Idx) { Opnds.push_back(DAG.getUNDEF(VTs[i])); continue; } SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V); AddToWorklist(NV.getNode()); Opnds.push_back(NV); } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds); } } // Fold truncate of a bitcast of a vector to an extract of the low vector // element. // // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) { SDValue VecSrc = N0.getOperand(0); EVT SrcVT = VecSrc.getValueType(); if (SrcVT.isVector() && SrcVT.getScalarType() == VT && (!LegalOperations || TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, SrcVT))) { SDLoc SL(N); EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout()); unsigned Idx = isLE ? 0 : SrcVT.getVectorNumElements() - 1; return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc, DAG.getConstant(Idx, SL, IdxVT)); } } // Simplify the operands using demanded-bits information. if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0))) return SDValue(N, 0); // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry) // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry) // When the adde's carry is not used. if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) && N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) && (!LegalOperations || TLI.isOperationLegal(N0.getOpcode(), VT))) { SDLoc SL(N); auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0)); auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1)); auto VTs = DAG.getVTList(VT, N0->getValueType(1)); return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2)); } // fold (truncate (extract_subvector(ext x))) -> // (extract_subvector x) // TODO: This can be generalized to cover cases where the truncate and extract // do not fully cancel each other out. if (!LegalTypes && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::SIGN_EXTEND || N00.getOpcode() == ISD::ZERO_EXTEND || N00.getOpcode() == ISD::ANY_EXTEND) { if (N00.getOperand(0)->getValueType(0).getVectorElementType() == VT.getVectorElementType()) return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT, N00.getOperand(0), N0.getOperand(1)); } } if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; return SDValue(); } static SDNode *getBuildPairElt(SDNode *N, unsigned i) { SDValue Elt = N->getOperand(i); if (Elt.getOpcode() != ISD::MERGE_VALUES) return Elt.getNode(); return Elt.getOperand(Elt.getResNo()).getNode(); } /// build_pair (load, load) -> load /// if load locations are consecutive. SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) { assert(N->getOpcode() == ISD::BUILD_PAIR); LoadSDNode *LD1 = dyn_cast(getBuildPairElt(N, 0)); LoadSDNode *LD2 = dyn_cast(getBuildPairElt(N, 1)); // A BUILD_PAIR is always having the least significant part in elt 0 and the // most significant part in elt 1. So when combining into one large load, we // need to consider the endianness. if (DAG.getDataLayout().isBigEndian()) std::swap(LD1, LD2); if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() || LD1->getAddressSpace() != LD2->getAddressSpace()) return SDValue(); EVT LD1VT = LD1->getValueType(0); unsigned LD1Bytes = LD1VT.getStoreSize(); if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() && DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) { unsigned Align = LD1->getAlignment(); unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( VT.getTypeForEVT(*DAG.getContext())); if (NewAlign <= Align && (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(), LD1->getPointerInfo(), Align); } return SDValue(); } static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) { // On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi // and Lo parts; on big-endian machines it doesn't. return DAG.getDataLayout().isBigEndian() ? 1 : 0; } static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { // If this is not a bitcast to an FP type or if the target doesn't have // IEEE754-compliant FP logic, we're done. EVT VT = N->getValueType(0); if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT)) return SDValue(); // TODO: Use splat values for the constant-checking below and remove this // restriction. SDValue N0 = N->getOperand(0); EVT SourceVT = N0.getValueType(); if (SourceVT.isVector()) return SDValue(); unsigned FPOpcode; APInt SignMask; switch (N0.getOpcode()) { case ISD::AND: FPOpcode = ISD::FABS; SignMask = ~APInt::getSignMask(SourceVT.getSizeInBits()); break; case ISD::XOR: FPOpcode = ISD::FNEG; SignMask = APInt::getSignMask(SourceVT.getSizeInBits()); break; // TODO: ISD::OR --> ISD::FNABS? default: return SDValue(); } // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X SDValue LogicOp0 = N0.getOperand(0); ConstantSDNode *LogicOp1 = dyn_cast(N0.getOperand(1)); if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask && LogicOp0.getOpcode() == ISD::BITCAST && LogicOp0->getOperand(0).getValueType() == VT) return DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0->getOperand(0)); return SDValue(); } SDValue DAGCombiner::visitBITCAST(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.isUndef()) return DAG.getUNDEF(VT); // If the input is a BUILD_VECTOR with all constant elements, fold this now. // Only do this before legalize, since afterward the target may be depending // on the bitconvert. // First check to see if this is all constant. if (!LegalTypes && N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() && VT.isVector()) { bool isSimple = cast(N0)->isConstant(); EVT DestEltVT = N->getValueType(0).getVectorElementType(); assert(!DestEltVT.isVector() && "Element type of vector ValueType must not be vector!"); if (isSimple) return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(), DestEltVT); } // If the input is a constant, let getNode fold it. // We always need to check that this is just a fp -> int or int -> conversion // otherwise we will get back N which will confuse the caller into thinking // we used CombineTo. This can block target combines from running. If we can't // allowed legal operations, we need to ensure the resulting operation will be // legal. // TODO: Maybe we should check that the return value isn't N explicitly? if ((isa(N0) && VT.isFloatingPoint() && !VT.isVector() && (!LegalOperations || TLI.isOperationLegal(ISD::ConstantFP, VT))) || (isa(N0) && VT.isInteger() && !VT.isVector() && (!LegalOperations || TLI.isOperationLegal(ISD::Constant, VT)))) return DAG.getBitcast(VT, N0); // (conv (conv x, t1), t2) -> (conv x, t2) if (N0.getOpcode() == ISD::BITCAST) return DAG.getBitcast(VT, N0.getOperand(0)); // fold (conv (load x)) -> (load (conv*)x) // If the resultant load doesn't need a higher alignment than the original! if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. !cast(N0)->isVolatile() && // Do not remove the cast if the types differ in endian layout. TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) == TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) && (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) && TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) { LoadSDNode *LN0 = cast(N0); unsigned OrigAlign = LN0->getAlignment(); bool Fast = false; if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, LN0->getAddressSpace(), OrigAlign, &Fast) && Fast) { SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), LN0->getPointerInfo(), OrigAlign, LN0->getMemOperand()->getFlags(), LN0->getAAInfo()); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); return Load; } } if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI)) return V; // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit) // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit)) // // For ppc_fp128: // fold (bitcast (fneg x)) -> // flipbit = signbit // (xor (bitcast x) (build_pair flipbit, flipbit)) // // fold (bitcast (fabs x)) -> // flipbit = (and (extract_element (bitcast x), 0), signbit) // (xor (bitcast x) (build_pair flipbit, flipbit)) // This often reduces constant pool loads. if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) || (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) && N0.getNode()->hasOneUse() && VT.isInteger() && !VT.isVector() && !N0.getValueType().isVector()) { SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0)); AddToWorklist(NewConv.getNode()); SDLoc DL(N); if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { assert(VT.getSizeInBits() == 128); SDValue SignBit = DAG.getConstant( APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64); SDValue FlipBit; if (N0.getOpcode() == ISD::FNEG) { FlipBit = SignBit; AddToWorklist(FlipBit.getNode()); } else { assert(N0.getOpcode() == ISD::FABS); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv, DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), SDLoc(NewConv))); AddToWorklist(Hi.getNode()); FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit); AddToWorklist(FlipBit.getNode()); } SDValue FlipBits = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); AddToWorklist(FlipBits.getNode()); return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits); } APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); if (N0.getOpcode() == ISD::FNEG) return DAG.getNode(ISD::XOR, DL, VT, NewConv, DAG.getConstant(SignBit, DL, VT)); assert(N0.getOpcode() == ISD::FABS); return DAG.getNode(ISD::AND, DL, VT, NewConv, DAG.getConstant(~SignBit, DL, VT)); } // fold (bitconvert (fcopysign cst, x)) -> // (or (and (bitconvert x), sign), (and cst, (not sign))) // Note that we don't handle (copysign x, cst) because this can always be // folded to an fneg or fabs. // // For ppc_fp128: // fold (bitcast (fcopysign cst, x)) -> // flipbit = (and (extract_element // (xor (bitcast cst), (bitcast x)), 0), // signbit) // (xor (bitcast cst) (build_pair flipbit, flipbit)) if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() && isa(N0.getOperand(0)) && VT.isInteger() && !VT.isVector()) { unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits(); EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth); if (isTypeLegal(IntXVT)) { SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1)); AddToWorklist(X.getNode()); // If X has a different width than the result/lhs, sext it or truncate it. unsigned VTWidth = VT.getSizeInBits(); if (OrigXWidth < VTWidth) { X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X); AddToWorklist(X.getNode()); } else if (OrigXWidth > VTWidth) { // To get the sign bit in the right place, we have to shift it right // before truncating. SDLoc DL(X); X = DAG.getNode(ISD::SRL, DL, X.getValueType(), X, DAG.getConstant(OrigXWidth-VTWidth, DL, X.getValueType())); AddToWorklist(X.getNode()); X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); AddToWorklist(X.getNode()); } if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) { APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2); SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0)); AddToWorklist(Cst.getNode()); SDValue X = DAG.getBitcast(VT, N0.getOperand(1)); AddToWorklist(X.getNode()); SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X); AddToWorklist(XorResult.getNode()); SDValue XorResult64 = DAG.getNode( ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult, DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG), SDLoc(XorResult))); AddToWorklist(XorResult64.getNode()); SDValue FlipBit = DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64, DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64)); AddToWorklist(FlipBit.getNode()); SDValue FlipBits = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit); AddToWorklist(FlipBits.getNode()); return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits); } APInt SignBit = APInt::getSignMask(VT.getSizeInBits()); X = DAG.getNode(ISD::AND, SDLoc(X), VT, X, DAG.getConstant(SignBit, SDLoc(X), VT)); AddToWorklist(X.getNode()); SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0)); Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT, Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT)); AddToWorklist(Cst.getNode()); return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst); } } // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive. if (N0.getOpcode() == ISD::BUILD_PAIR) if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT)) return CombineLD; // Remove double bitcasts from shuffles - this is often a legacy of // XformToShuffleWithZero being used to combine bitmaskings (of // float vectors bitcast to integer vectors) into shuffles. // bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1) if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() && N0->getOpcode() == ISD::VECTOR_SHUFFLE && VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() && !(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) { ShuffleVectorSDNode *SVN = cast(N0); // If operands are a bitcast, peek through if it casts the original VT. // If operands are a constant, just bitcast back to original VT. auto PeekThroughBitcast = [&](SDValue Op) { if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT) return SDValue(Op.getOperand(0)); if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode())) return DAG.getBitcast(VT, Op); return SDValue(); }; // FIXME: If either input vector is bitcast, try to convert the shuffle to // the result type of this bitcast. This would eliminate at least one // bitcast. See the transform in InstCombine. SDValue SV0 = PeekThroughBitcast(N0->getOperand(0)); SDValue SV1 = PeekThroughBitcast(N0->getOperand(1)); if (!(SV0 && SV1)) return SDValue(); int MaskScale = VT.getVectorNumElements() / N0.getValueType().getVectorNumElements(); SmallVector NewMask; for (int M : SVN->getMask()) for (int i = 0; i != MaskScale; ++i) NewMask.push_back(M < 0 ? -1 : M * MaskScale + i); bool LegalMask = TLI.isShuffleMaskLegal(NewMask, VT); if (!LegalMask) { std::swap(SV0, SV1); ShuffleVectorSDNode::commuteMask(NewMask); LegalMask = TLI.isShuffleMaskLegal(NewMask, VT); } if (LegalMask) return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask); } return SDValue(); } SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) { EVT VT = N->getValueType(0); return CombineConsecutiveLoads(N, VT); } /// We know that BV is a build_vector node with Constant, ConstantFP or Undef /// operands. DstEltVT indicates the destination element value type. SDValue DAGCombiner:: ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) { EVT SrcEltVT = BV->getValueType(0).getVectorElementType(); // If this is already the right type, we're done. if (SrcEltVT == DstEltVT) return SDValue(BV, 0); unsigned SrcBitSize = SrcEltVT.getSizeInBits(); unsigned DstBitSize = DstEltVT.getSizeInBits(); // If this is a conversion of N elements of one type to N elements of another // type, convert each element. This handles FP<->INT cases. if (SrcBitSize == DstBitSize) { EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, BV->getValueType(0).getVectorNumElements()); // Due to the FP element handling below calling this routine recursively, // we can end up with a scalar-to-vector node here. if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BV), VT, DAG.getBitcast(DstEltVT, BV->getOperand(0))); SmallVector Ops; for (SDValue Op : BV->op_values()) { // If the vector element type is not legal, the BUILD_VECTOR operands // are promoted and implicitly truncated. Make that explicit here. if (Op.getValueType() != SrcEltVT) Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op); Ops.push_back(DAG.getBitcast(DstEltVT, Op)); AddToWorklist(Ops.back().getNode()); } return DAG.getBuildVector(VT, SDLoc(BV), Ops); } // Otherwise, we're growing or shrinking the elements. To avoid having to // handle annoying details of growing/shrinking FP values, we convert them to // int first. if (SrcEltVT.isFloatingPoint()) { // Convert the input float vector to a int vector where the elements are the // same sizes. EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits()); BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode(); SrcEltVT = IntVT; } // Now we know the input is an integer vector. If the output is a FP type, // convert to integer first, then to FP of the right size. if (DstEltVT.isFloatingPoint()) { EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits()); SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode(); // Next, convert to FP elements of the same size. return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT); } SDLoc DL(BV); // Okay, we know the src/dst types are both integers of differing types. // Handling growing first. assert(SrcEltVT.isInteger() && DstEltVT.isInteger()); if (SrcBitSize < DstBitSize) { unsigned NumInputsPerOutput = DstBitSize/SrcBitSize; SmallVector Ops; for (unsigned i = 0, e = BV->getNumOperands(); i != e; i += NumInputsPerOutput) { bool isLE = DAG.getDataLayout().isLittleEndian(); APInt NewBits = APInt(DstBitSize, 0); bool EltIsUndef = true; for (unsigned j = 0; j != NumInputsPerOutput; ++j) { // Shift the previously computed bits over. NewBits <<= SrcBitSize; SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j)); if (Op.isUndef()) continue; EltIsUndef = false; NewBits |= cast(Op)->getAPIntValue(). zextOrTrunc(SrcBitSize).zext(DstBitSize); } if (EltIsUndef) Ops.push_back(DAG.getUNDEF(DstEltVT)); else Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT)); } EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size()); return DAG.getBuildVector(VT, DL, Ops); } // Finally, this must be the case where we are shrinking elements: each input // turns into multiple outputs. unsigned NumOutputsPerInput = SrcBitSize/DstBitSize; EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, NumOutputsPerInput*BV->getNumOperands()); SmallVector Ops; for (const SDValue &Op : BV->op_values()) { if (Op.isUndef()) { Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT)); continue; } APInt OpVal = cast(Op)-> getAPIntValue().zextOrTrunc(SrcBitSize); for (unsigned j = 0; j != NumOutputsPerInput; ++j) { APInt ThisVal = OpVal.trunc(DstBitSize); Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT)); OpVal.lshrInPlace(DstBitSize); } // For big endian targets, swap the order of the pieces of each element. if (DAG.getDataLayout().isBigEndian()) std::reverse(Ops.end()-NumOutputsPerInput, Ops.end()); } return DAG.getBuildVector(VT, DL, Ops); } static bool isContractable(SDNode *N) { SDNodeFlags F = N->getFlags(); return F.hasAllowContract() || F.hasAllowReassociation(); } /// Try to perform FMA combining on a given FADD node. SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; // Floating-point multiply-add with intermediate rounding. bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); SDNodeFlags Flags = N->getFlags(); bool CanFuse = Options.UnsafeFPMath || isContractable(N); bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || CanFuse || HasFMAD); // If the addition is not contractable, do not combine. if (!AllowFusionGlobally && !isContractable(N)) return SDValue(); const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { if (N.getOpcode() != ISD::FMUL) return false; return AllowFusionGlobally || isContractable(N.getNode()); }; // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)), // prefer to fold the multiply with fewer uses. if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) { if (N0.getNode()->use_size() > N1.getNode()->use_size()) std::swap(N0, N1); } // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), N1, Flags); } // fold (fadd x, (fmul y, z)) -> (fma y, z, x) // Note: Commutes FADD operands. if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), N0, Flags); } // Look through FP_EXTEND nodes to do more combining. // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), N1, Flags); } } // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x) // Note: Commutes FADD operands. if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0, Flags); } } // More folding opportunities when target permits. if (Aggressive) { // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z)) if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && N0.getOperand(2).getOpcode() == ISD::FMUL && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(2).getOperand(0), N0.getOperand(2).getOperand(1), N1, Flags), Flags); } // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x)) if (CanFuse && N1->getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FMUL && N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0), N1.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(2).getOperand(0), N1.getOperand(2).getOperand(1), N0, Flags), Flags); } // fold (fadd (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y, (fma (fpext u), (fpext v), z)) auto FoldFAddFMAFPExtFMul = [&] ( SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z, SDNodeFlags Flags) { return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, U), DAG.getNode(ISD::FP_EXTEND, SL, VT, V), Z, Flags), Flags); }; if (N0.getOpcode() == PreferredFusedOpcode) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); if (isContractableFMUL(N020) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1), N020.getOperand(0), N020.getOperand(1), N1, Flags); } } } // fold (fadd (fpext (fma x, y, (fmul u, v))), z) // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z)) // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. auto FoldFAddFPExtFMAFMul = [&] ( SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z, SDNodeFlags Flags) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, X), DAG.getNode(ISD::FP_EXTEND, SL, VT, Y), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, U), DAG.getNode(ISD::FP_EXTEND, SL, VT, V), Z, Flags), Flags); }; if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); if (isContractableFMUL(N002) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1), N002.getOperand(0), N002.getOperand(1), N1, Flags); } } } // fold (fadd x, (fma y, z, (fpext (fmul u, v))) // -> (fma y, z, (fma (fpext u), (fpext v), x)) if (N1.getOpcode() == PreferredFusedOpcode) { SDValue N12 = N1.getOperand(2); if (N12.getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N12.getOperand(0); if (isContractableFMUL(N120) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) { return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1), N120.getOperand(0), N120.getOperand(1), N0, Flags); } } } // fold (fadd x, (fpext (fma y, z, (fmul u, v))) // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x)) // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); if (N10.getOpcode() == PreferredFusedOpcode) { SDValue N102 = N10.getOperand(2); if (isContractableFMUL(N102) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1), N102.getOperand(0), N102.getOperand(1), N0, Flags); } } } } return SDValue(); } /// Try to perform FMA combining on a given FSUB node. SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc SL(N); const TargetOptions &Options = DAG.getTarget().Options; // Floating-point multiply-add with intermediate rounding. bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // Floating-point multiply-add without intermediate rounding. bool HasFMA = TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); const SDNodeFlags Flags = N->getFlags(); bool CanFuse = Options.UnsafeFPMath || isContractable(N); bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast || CanFuse || HasFMAD); // If the subtraction is not contractable, do not combine. if (!AllowFusionGlobally && !isContractable(N)) return SDValue(); const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo(); if (STI && STI->generateFMAsInMachineCombiner(OptLevel)) return SDValue(); // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. auto isContractableFMUL = [AllowFusionGlobally](SDValue N) { if (N.getOpcode() != ISD::FMUL) return false; return AllowFusionGlobally || isContractable(N.getNode()); }; // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, N1), Flags); } // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), N0, Flags); } // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) && (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) { SDValue N00 = N0.getOperand(0).getOperand(0); SDValue N01 = N0.getOperand(0).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N00), N01, DAG.getNode(ISD::FNEG, SL, VT, N1), Flags); } // Look through FP_EXTEND nodes to do more combining. // fold (fsub (fpext (fmul x, y)), z) // -> (fma (fpext x), (fpext y), (fneg z)) if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (isContractableFMUL(N00) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), DAG.getNode(ISD::FNEG, SL, VT, N1), Flags); } } // fold (fsub x, (fpext (fmul y, z))) // -> (fma (fneg (fpext y)), (fpext z), x) // Note: Commutes FSUB operands. if (N1.getOpcode() == ISD::FP_EXTEND) { SDValue N10 = N1.getOperand(0); if (isContractableFMUL(N10) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0))), DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0, Flags); } } // fold (fsub (fpext (fneg (fmul, x, y))), z) // -> (fneg (fma (fpext x), (fpext y), z)) // Note: This could be removed with appropriate canonicalization of the // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent // from implementing the canonicalization in visitFSUB. if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FNEG) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), N1, Flags)); } } } // fold (fsub (fneg (fpext (fmul, x, y))), z) // -> (fneg (fma (fpext x)), (fpext y), z) // Note: This could be removed with appropriate canonicalization of the // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent // from implementing the canonicalization in visitFSUB. if (N0.getOpcode() == ISD::FNEG) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == ISD::FP_EXTEND) { SDValue N000 = N00.getOperand(0); if (isContractableFMUL(N000) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N000.getValueType())) { return DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)), N1, Flags)); } } } // More folding opportunities when target permits. if (Aggressive) { // fold (fsub (fma x, y, (fmul u, v)), z) // -> (fma x, y (fma u, v, (fneg z))) if (CanFuse && N0.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(2).getOperand(0), N0.getOperand(2).getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, N1), Flags), Flags); } // fold (fsub x, (fma y, z, (fmul u, v))) // -> (fma (fneg y), z, (fma (fneg u), v, x)) if (CanFuse && N1.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N1.getOperand(2))) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N20), N21, N0, Flags), Flags); } // fold (fsub (fma x, y, (fpext (fmul u, v))), z) // -> (fma x, y (fma (fpext u), (fpext v), (fneg z))) if (N0.getOpcode() == PreferredFusedOpcode) { SDValue N02 = N0.getOperand(2); if (N02.getOpcode() == ISD::FP_EXTEND) { SDValue N020 = N02.getOperand(0); if (isContractableFMUL(N020) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(1)), DAG.getNode(ISD::FNEG, SL, VT, N1), Flags), Flags); } } } // fold (fsub (fpext (fma x, y, (fmul u, v))), z) // -> (fma (fpext x), (fpext y), // (fma (fpext u), (fpext v), (fneg z))) // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. if (N0.getOpcode() == ISD::FP_EXTEND) { SDValue N00 = N0.getOperand(0); if (N00.getOpcode() == PreferredFusedOpcode) { SDValue N002 = N00.getOperand(2); if (isContractableFMUL(N002) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) { return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(0)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(1)), DAG.getNode(ISD::FNEG, SL, VT, N1), Flags), Flags); } } } // fold (fsub x, (fma y, z, (fpext (fmul u, v)))) // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x)) if (N1.getOpcode() == PreferredFusedOpcode && N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) { SDValue N120 = N1.getOperand(2).getOperand(0); if (isContractableFMUL(N120) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) { SDValue N1200 = N120.getOperand(0); SDValue N1201 = N120.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N1200)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N1201), N0, Flags), Flags); } } // fold (fsub x, (fpext (fma y, z, (fmul u, v)))) // -> (fma (fneg (fpext y)), (fpext z), // (fma (fneg (fpext u)), (fpext v), x)) // FIXME: This turns two single-precision and one double-precision // operation into two double-precision operations, which might not be // interesting for all targets, especially GPUs. if (N1.getOpcode() == ISD::FP_EXTEND && N1.getOperand(0).getOpcode() == PreferredFusedOpcode) { SDValue CvtSrc = N1.getOperand(0); SDValue N100 = CvtSrc.getOperand(0); SDValue N101 = CvtSrc.getOperand(1); SDValue N102 = CvtSrc.getOperand(2); if (isContractableFMUL(N102) && TLI.isFPExtFoldable(PreferredFusedOpcode, VT, CvtSrc.getValueType())) { SDValue N1020 = N102.getOperand(0); SDValue N1021 = N102.getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N100)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N101), DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, N1020)), DAG.getNode(ISD::FP_EXTEND, SL, VT, N1021), N0, Flags), Flags); } } } return SDValue(); } /// Try to perform FMA combining on a given FMUL node based on the distributive /// law x * (y + 1) = x * y + x and variants thereof (commuted versions, /// subtraction instead of addition). SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); SDLoc SL(N); const SDNodeFlags Flags = N->getFlags(); assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation"); const TargetOptions &Options = DAG.getTarget().Options; // The transforms below are incorrect when x == 0 and y == inf, because the // intermediate multiplication produces a nan. if (!Options.NoInfsFPMath) return SDValue(); // Floating-point multiply-add without intermediate rounding. bool HasFMA = (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) && TLI.isFMAFasterThanFMulAndFAdd(VT) && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT)); // Floating-point multiply-add with intermediate rounding. This can result // in a less precise result due to the changed rounding order. bool HasFMAD = Options.UnsafeFPMath && (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)); // No valid opcode, do not combine. if (!HasFMAD && !HasFMA) return SDValue(); // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y) // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y)) auto FuseFADD = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) { if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) { auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); if (XC1 && XC1->isExactlyValue(+1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y, Flags); if (XC1 && XC1->isExactlyValue(-1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); } return SDValue(); }; if (SDValue FMA = FuseFADD(N0, N1, Flags)) return FMA; if (SDValue FMA = FuseFADD(N1, N0, Flags)) return FMA; // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y) // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y)) // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y)) // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y) auto FuseFSUB = [&](SDValue X, SDValue Y, const SDNodeFlags Flags) { if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) { auto XC0 = isConstOrConstSplatFP(X.getOperand(0)); if (XC0 && XC0->isExactlyValue(+1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, Y, Flags); if (XC0 && XC0->isExactlyValue(-1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y, DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); auto XC1 = isConstOrConstSplatFP(X.getOperand(1)); if (XC1 && XC1->isExactlyValue(+1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, DAG.getNode(ISD::FNEG, SL, VT, Y), Flags); if (XC1 && XC1->isExactlyValue(-1.0)) return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y, Flags); } return SDValue(); }; if (SDValue FMA = FuseFSUB(N0, N1, Flags)) return FMA; if (SDValue FMA = FuseFSUB(N1, N0, Flags)) return FMA; return SDValue(); } static bool isFMulNegTwo(SDValue &N) { if (N.getOpcode() != ISD::FMUL) return false; if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N.getOperand(1))) return CFP->isExactlyValue(-2.0); return false; } SDValue DAGCombiner::visitFADD(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0); bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (fadd c1, c2) -> c1 + c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags); // canonicalize constant to RHS if (N0CFP && !N1CFP) return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // fold (fadd A, (fneg B)) -> (fsub A, B) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, DL, VT, N0, GetNegatedExpression(N1, DAG, LegalOperations), Flags); // fold (fadd (fneg A), B) -> (fsub B, A) if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) && isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2) return DAG.getNode(ISD::FSUB, DL, VT, N1, GetNegatedExpression(N0, DAG, LegalOperations), Flags); // fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B)) // fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B)) if ((isFMulNegTwo(N0) && N0.hasOneUse()) || (isFMulNegTwo(N1) && N1.hasOneUse())) { bool N1IsFMul = isFMulNegTwo(N1); SDValue AddOp = N1IsFMul ? N1.getOperand(0) : N0.getOperand(0); SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags); return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? N0 : N1, Add, Flags); } ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1); if (N1C && N1C->isZero()) { if (N1C->isNegative() || Options.UnsafeFPMath || Flags.hasNoSignedZeros()) { // fold (fadd A, 0) -> A return N0; } } // No FP constant should be created after legalization as Instruction // Selection pass has a hard time dealing with FP constants. bool AllowNewConst = (Level < AfterLegalizeDAG); // If 'unsafe math' or nnan is enabled, fold lots of things. if ((Options.UnsafeFPMath || Flags.hasNoNaNs()) && AllowNewConst) { // If allowed, fold (fadd (fneg x), x) -> 0.0 if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) return DAG.getConstantFP(0.0, DL, VT); // If allowed, fold (fadd x, (fneg x)) -> 0.0 if (N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0) return DAG.getConstantFP(0.0, DL, VT); } // If 'unsafe math' or reassoc and nsz, fold lots of things. // TODO: break out portions of the transformations below for which Unsafe is // considered and which do not require both nsz and reassoc if ((Options.UnsafeFPMath || (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) && AllowNewConst) { // fadd (fadd x, c1), c2 -> fadd x, c1 + c2 if (N1CFP && N0.getOpcode() == ISD::FADD && isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1, Flags); return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC, Flags); } // We can fold chains of FADD's of the same value into multiplications. // This transform is not safe in general because we are reducing the number // of rounding steps. if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) { if (N0.getOpcode() == ISD::FMUL) { bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)); // (fadd (fmul x, c), x) -> (fmul x, c+1) if (CFP01 && !CFP00 && N0.getOperand(0) == N1) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), DAG.getConstantFP(1.0, DL, VT), Flags); return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags); } // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2) if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD && N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), DAG.getConstantFP(2.0, DL, VT), Flags); return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags); } } if (N1.getOpcode() == ISD::FMUL) { bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1)); // (fadd x, (fmul x, c)) -> (fmul x, c+1) if (CFP11 && !CFP10 && N1.getOperand(0) == N0) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), DAG.getConstantFP(1.0, DL, VT), Flags); return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags); } // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2) if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1) && N1.getOperand(0) == N0.getOperand(0)) { SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1), DAG.getConstantFP(2.0, DL, VT), Flags); return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags); } } if (N0.getOpcode() == ISD::FADD) { bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0)); // (fadd (fadd x, x), x) -> (fmul x, 3.0) if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) && (N0.getOperand(0) == N1)) { return DAG.getNode(ISD::FMUL, DL, VT, N1, DAG.getConstantFP(3.0, DL, VT), Flags); } } if (N1.getOpcode() == ISD::FADD) { bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0)); // (fadd x, (fadd x, x)) -> (fmul x, 3.0) if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) && N1.getOperand(0) == N0) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getConstantFP(3.0, DL, VT), Flags); } } // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0) if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD && N0.getOperand(0) == N0.getOperand(1) && N1.getOperand(0) == N1.getOperand(1) && N0.getOperand(0) == N1.getOperand(0)) { return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), DAG.getConstantFP(4.0, DL, VT), Flags); } } } // enable-unsafe-fp-math // FADD -> FMA combines: if (SDValue Fused = visitFADDForFMACombine(N)) { AddToWorklist(Fused.getNode()); return Fused; } return SDValue(); } SDValue DAGCombiner::visitFSUB(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (fsub c1, c2) -> c1-c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; // (fsub A, 0) -> A if (N1CFP && N1CFP->isZero()) { if (!N1CFP->isNegative() || Options.UnsafeFPMath || Flags.hasNoSignedZeros()) { return N0; } } if (N0 == N1) { // (fsub x, x) -> 0.0 if (Options.UnsafeFPMath || Flags.hasNoNaNs()) return DAG.getConstantFP(0.0f, DL, VT); } // (fsub 0, B) -> -B if (N0CFP && N0CFP->isZero()) { if (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) { if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return GetNegatedExpression(N1, DAG, LegalOperations); if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags); } } // fold (fsub A, (fneg B)) -> (fadd A, B) if (isNegatibleForFree(N1, LegalOperations, TLI, &Options)) return DAG.getNode(ISD::FADD, DL, VT, N0, GetNegatedExpression(N1, DAG, LegalOperations), Flags); // If 'unsafe math' is enabled, fold lots of things. if (Options.UnsafeFPMath) { // (fsub x, (fadd x, y)) -> (fneg y) // (fsub x, (fadd y, x)) -> (fneg y) if (N1.getOpcode() == ISD::FADD) { SDValue N10 = N1->getOperand(0); SDValue N11 = N1->getOperand(1); if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI, &Options)) return GetNegatedExpression(N11, DAG, LegalOperations); if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI, &Options)) return GetNegatedExpression(N10, DAG, LegalOperations); } } // FSUB -> FMA combines: if (SDValue Fused = visitFSUBForFMACombine(N)) { AddToWorklist(Fused.getNode()); return Fused; } return SDValue(); } SDValue DAGCombiner::visitFMUL(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; const SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) { // This just handles C1 * C2 for vectors. Other vector folds are below. if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; } // fold (fmul c1, c2) -> c1*c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags); // canonicalize constant to RHS if (isConstantFPBuildVectorOrConstantFP(N0) && !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags); // fold (fmul A, 1.0) -> A if (N1CFP && N1CFP->isExactlyValue(1.0)) return N0; if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; if (Options.UnsafeFPMath || (Flags.hasNoNaNs() && Flags.hasNoSignedZeros())) { // fold (fmul A, 0) -> 0 if (N1CFP && N1CFP->isZero()) return N1; } if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) { // fmul (fmul X, C1), C2 -> fmul X, C1 * C2 if (N0.getOpcode() == ISD::FMUL) { // Fold scalars or any vector constants (not just splats). // This fold is done in general by InstCombine, but extra fmul insts // may have been generated during lowering. SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); auto *BV1 = dyn_cast(N1); auto *BV00 = dyn_cast(N00); auto *BV01 = dyn_cast(N01); // Check 1: Make sure that the first operand of the inner multiply is NOT // a constant. Otherwise, we may induce infinite looping. if (!(isConstOrConstSplatFP(N00) || (BV00 && BV00->isConstant()))) { // Check 2: Make sure that the second operand of the inner multiply and // the second operand of the outer multiply are constants. if ((N1CFP && isConstOrConstSplatFP(N01)) || (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) { SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags); return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags); } } } // Match a special-case: we convert X * 2.0 into fadd. // fmul (fadd X, X), C -> fmul X, 2.0 * C if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() && N0.getOperand(0) == N0.getOperand(1)) { const SDValue Two = DAG.getConstantFP(2.0, DL, VT); SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags); return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags); } } // fold (fmul X, 2.0) -> (fadd X, X) if (N1CFP && N1CFP->isExactlyValue(+2.0)) return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags); // fold (fmul X, -1.0) -> (fneg X) if (N1CFP && N1CFP->isExactlyValue(-1.0)) if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, DL, VT, N0); // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y) if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) { if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) { // Both can be negated for free, check to see if at least one is cheaper // negated. if (LHSNeg == 2 || RHSNeg == 2) return DAG.getNode(ISD::FMUL, DL, VT, GetNegatedExpression(N0, DAG, LegalOperations), GetNegatedExpression(N1, DAG, LegalOperations), Flags); } } // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X)) // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X) if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() && (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) && TLI.isOperationLegal(ISD::FABS, VT)) { SDValue Select = N0, X = N1; if (Select.getOpcode() != ISD::SELECT) std::swap(Select, X); SDValue Cond = Select.getOperand(0); auto TrueOpnd = dyn_cast(Select.getOperand(1)); auto FalseOpnd = dyn_cast(Select.getOperand(2)); if (TrueOpnd && FalseOpnd && Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X && isa(Cond.getOperand(1)) && cast(Cond.getOperand(1))->isExactlyValue(0.0)) { ISD::CondCode CC = cast(Cond.getOperand(2))->get(); switch (CC) { default: break; case ISD::SETOLT: case ISD::SETULT: case ISD::SETOLE: case ISD::SETULE: case ISD::SETLT: case ISD::SETLE: std::swap(TrueOpnd, FalseOpnd); LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETUGT: case ISD::SETOGE: case ISD::SETUGE: case ISD::SETGT: case ISD::SETGE: if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) && TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, DL, VT, DAG.getNode(ISD::FABS, DL, VT, X)); if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0)) return DAG.getNode(ISD::FABS, DL, VT, X); break; } } } // FMUL -> FMA combines: if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) { AddToWorklist(Fused.getNode()); return Fused; } return SDValue(); } SDValue DAGCombiner::visitFMA(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; // FMA nodes have flags that propagate to the created nodes. const SDNodeFlags Flags = N->getFlags(); bool UnsafeFPMath = Options.UnsafeFPMath || isContractable(N); // Constant fold FMA. if (isa(N0) && isa(N1) && isa(N2)) { return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2); } if (UnsafeFPMath) { if (N0CFP && N0CFP->isZero()) return N2; if (N1CFP && N1CFP->isZero()) return N2; } // TODO: The FMA node should have flags that propagate to these nodes. if (N0CFP && N0CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2); if (N1CFP && N1CFP->isExactlyValue(1.0)) return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2); // Canonicalize (fma c, x, y) -> (fma x, c, y) if (isConstantFPBuildVectorOrConstantFP(N0) && !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2); if (UnsafeFPMath) { // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2) if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) && isConstantFPBuildVectorOrConstantFP(N1) && isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1), Flags), Flags); } // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y) if (N0.getOpcode() == ISD::FMUL && isConstantFPBuildVectorOrConstantFP(N1) && isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) { return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1), Flags), N2); } } // (fma x, 1, y) -> (fadd x, y) // (fma x, -1, y) -> (fadd (fneg x), y) if (N1CFP) { if (N1CFP->isExactlyValue(1.0)) // TODO: The FMA node should have flags that propagate to this node. return DAG.getNode(ISD::FADD, DL, VT, N0, N2); if (N1CFP->isExactlyValue(-1.0) && (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) { SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0); AddToWorklist(RHSNeg.getNode()); // TODO: The FMA node should have flags that propagate to this node. return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg); } // fma (fneg x), K, y -> fma x -K, y if (N0.getOpcode() == ISD::FNEG && (TLI.isOperationLegal(ISD::ConstantFP, VT) || (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT)))) { return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0), DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2); } } if (UnsafeFPMath) { // (fma x, c, x) -> (fmul x, (c+1)) if (N1CFP && N0 == N2) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(1.0, DL, VT), Flags), Flags); } // (fma x, c, (fneg x)) -> (fmul x, (c-1)) if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) { return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(-1.0, DL, VT), Flags), Flags); } } return SDValue(); } // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal. // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip) // Notice that this is not always beneficial. One reason is different targets // may have different costs for FDIV and FMUL, so sometimes the cost of two // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL". SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) { bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath; const SDNodeFlags Flags = N->getFlags(); if (!UnsafeMath && !Flags.hasAllowReciprocal()) return SDValue(); // Skip if current node is a reciprocal. SDValue N0 = N->getOperand(0); ConstantFPSDNode *N0CFP = dyn_cast(N0); if (N0CFP && N0CFP->isExactlyValue(1.0)) return SDValue(); // Exit early if the target does not want this transform or if there can't // possibly be enough uses of the divisor to make the transform worthwhile. SDValue N1 = N->getOperand(1); unsigned MinUses = TLI.combineRepeatedFPDivisors(); if (!MinUses || N1->use_size() < MinUses) return SDValue(); // Find all FDIV users of the same divisor. // Use a set because duplicates may be present in the user list. SetVector Users; for (auto *U : N1->uses()) { if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) { // This division is eligible for optimization only if global unsafe math // is enabled or if this division allows reciprocal formation. if (UnsafeMath || U->getFlags().hasAllowReciprocal()) Users.insert(U); } } // Now that we have the actual number of divisor uses, make sure it meets // the minimum threshold specified by the target. if (Users.size() < MinUses) return SDValue(); EVT VT = N->getValueType(0); SDLoc DL(N); SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags); // Dividend / Divisor -> Dividend * Reciprocal for (auto *U : Users) { SDValue Dividend = U->getOperand(0); if (Dividend != FPOne) { SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend, Reciprocal, Flags); CombineTo(U, NewNode); } else if (U != Reciprocal.getNode()) { // In the absence of fast-math-flags, this user node is always the // same node as Reciprocal, but with FMF they may be different nodes. CombineTo(U, Reciprocal); } } return SDValue(N, 0); // N was replaced. } SDValue DAGCombiner::visitFDIV(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); SDLoc DL(N); const TargetOptions &Options = DAG.getTarget().Options; SDNodeFlags Flags = N->getFlags(); // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) return FoldedVOp; // fold (fdiv c1, c2) -> c1/c2 if (N0CFP && N1CFP) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) { // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable. if (N1CFP) { // Compute the reciprocal 1.0 / c2. const APFloat &N1APF = N1CFP->getValueAPF(); APFloat Recip(N1APF.getSemantics(), 1); // 1.0 APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven); // Only do the transform if the reciprocal is a legal fp immediate that // isn't too nasty (eg NaN, denormal, ...). if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty (!LegalOperations || // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM // backend)... we should handle this gracefully after Legalize. // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) || TLI.isOperationLegal(ISD::ConstantFP, VT) || TLI.isFPImmLegal(Recip, VT))) return DAG.getNode(ISD::FMUL, DL, VT, N0, DAG.getConstantFP(Recip, DL, VT), Flags); } // If this FDIV is part of a reciprocal square root, it may be folded // into a target-specific square root estimate instruction. if (N1.getOpcode() == ISD::FSQRT) { if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) { return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FP_EXTEND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) { RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FP_ROUND && N1.getOperand(0).getOpcode() == ISD::FSQRT) { if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) { RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1)); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } else if (N1.getOpcode() == ISD::FMUL) { // Look through an FMUL. Even though this won't remove the FDIV directly, // it's still worthwhile to get rid of the FSQRT if possible. SDValue SqrtOp; SDValue OtherOp; if (N1.getOperand(0).getOpcode() == ISD::FSQRT) { SqrtOp = N1.getOperand(0); OtherOp = N1.getOperand(1); } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) { SqrtOp = N1.getOperand(1); OtherOp = N1.getOperand(0); } if (SqrtOp.getNode()) { // We found a FSQRT, so try to make this fold: // x / (y * sqrt(z)) -> x * (rsqrt(z) / y) if (SDValue RV = buildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) { RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags); AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } } // Fold into a reciprocal estimate and multiply instead of a real divide. if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) { AddToWorklist(RV.getNode()); return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags); } } // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) { if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) { // Both can be negated for free, check to see if at least one is cheaper // negated. if (LHSNeg == 2 || RHSNeg == 2) return DAG.getNode(ISD::FDIV, SDLoc(N), VT, GetNegatedExpression(N0, DAG, LegalOperations), GetNegatedExpression(N1, DAG, LegalOperations), Flags); } } if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N)) return CombineRepeatedDivisors; return SDValue(); } SDValue DAGCombiner::visitFREM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); // fold (frem c1, c2) -> fmod(c1,c2) if (N0CFP && N1CFP) return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1, N->getFlags()); if (SDValue NewSel = foldBinOpIntoSelect(N)) return NewSel; return SDValue(); } SDValue DAGCombiner::visitFSQRT(SDNode *N) { SDNodeFlags Flags = N->getFlags(); if (!DAG.getTarget().Options.UnsafeFPMath && !Flags.hasApproximateFuncs()) return SDValue(); SDValue N0 = N->getOperand(0); if (TLI.isFsqrtCheap(N0, DAG)) return SDValue(); // FSQRT nodes have flags that propagate to the created nodes. return buildSqrtEstimate(N0, Flags); } /// copysign(x, fp_extend(y)) -> copysign(x, y) /// copysign(x, fp_round(y)) -> copysign(x, y) static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) { SDValue N1 = N->getOperand(1); if ((N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND)) { // Do not optimize out type conversion of f128 type yet. // For some targets like x86_64, configuration is changed to keep one f128 // value in one SSE register, but instruction selection cannot handle // FCOPYSIGN on SSE registers yet. EVT N1VT = N1->getValueType(0); EVT N1Op0VT = N1->getOperand(0).getValueType(); return (N1VT == N1Op0VT || N1Op0VT != MVT::f128); } return false; } SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); ConstantFPSDNode *N1CFP = dyn_cast(N1); EVT VT = N->getValueType(0); if (N0CFP && N1CFP) // Constant fold return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1); if (N1CFP) { const APFloat &V = N1CFP->getValueAPF(); // copysign(x, c1) -> fabs(x) iff ispos(c1) // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1) if (!V.isNegative()) { if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT)) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); } else { if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0)); } } // copysign(fabs(x), y) -> copysign(x, y) // copysign(fneg(x), y) -> copysign(x, y) // copysign(copysign(x,z), y) -> copysign(x, y) if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1); // copysign(x, abs(y)) -> abs(x) if (N1.getOpcode() == ISD::FABS) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); // copysign(x, copysign(y,z)) -> copysign(x, z) if (N1.getOpcode() == ISD::FCOPYSIGN) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1)); // copysign(x, fp_extend(y)) -> copysign(x, y) // copysign(x, fp_round(y)) -> copysign(x, y) if (CanCombineFCOPYSIGN_EXTEND_ROUND(N)) return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0)); return SDValue(); } static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI) { // This optimization is guarded by a function attribute because it may produce // unexpected results. Ie, programs may be relying on the platform-specific // undefined behavior when the float-to-int conversion overflows. const Function &F = DAG.getMachineFunction().getFunction(); Attribute StrictOverflow = F.getFnAttribute("strict-float-cast-overflow"); if (StrictOverflow.getValueAsString().equals("false")) return SDValue(); // We only do this if the target has legal ftrunc. Otherwise, we'd likely be // replacing casts with a libcall. We also must be allowed to ignore -0.0 // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer // conversions would return +0.0. // FIXME: We should be able to use node-level FMF here. // TODO: If strict math, should we use FABS (+ range check for signed cast)? EVT VT = N->getValueType(0); if (!TLI.isOperationLegal(ISD::FTRUNC, VT) || !DAG.getTarget().Options.NoSignedZerosFPMath) return SDValue(); // fptosi/fptoui round towards zero, so converting from FP to integer and // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X SDValue N0 = N->getOperand(0); if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT && N0.getOperand(0).getValueType() == VT) return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0)); if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT && N0.getOperand(0).getValueType() == VT) return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0)); return SDValue(); } SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT OpVT = N0.getValueType(); // fold (sint_to_fp c1) -> c1fp if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && // ...but only if the target supports immediate floating-point values (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); // If the input is a legal type, and SINT_TO_FP is not legal on this target, // but UINT_TO_FP is legal on this target, try to convert. if (!TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT) && TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT)) { // If the sign bit is known to be zero, we can change this to UINT_TO_FP. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); } // The next optimizations are desirable only if SELECT_CC can be lowered. if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && !VT.isVector() && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { SDLoc DL(N); SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1), DAG.getConstantFP(-1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), N0.getOperand(2) }; return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); } // fold (sint_to_fp (zext (setcc x, y, cc))) -> // (select_cc x, y, 1.0, 0.0,, cc) if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.getOperand(0).getOpcode() == ISD::SETCC &&!VT.isVector() && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { SDLoc DL(N); SDValue Ops[] = { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1), DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), N0.getOperand(0).getOperand(2) }; return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); } } if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) return FTrunc; return SDValue(); } SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT OpVT = N0.getValueType(); // fold (uint_to_fp c1) -> c1fp if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && // ...but only if the target supports immediate floating-point values (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); // If the input is a legal type, and UINT_TO_FP is not legal on this target, // but SINT_TO_FP is legal on this target, try to convert. if (!TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT) && TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT)) { // If the sign bit is known to be zero, we can change this to SINT_TO_FP. if (DAG.SignBitIsZero(N0)) return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); } // The next optimizations are desirable only if SELECT_CC can be lowered. if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) { // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { SDLoc DL(N); SDValue Ops[] = { N0.getOperand(0), N0.getOperand(1), DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT), N0.getOperand(2) }; return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops); } } if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) return FTrunc; return SDValue(); } // Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP) return SDValue(); SDValue Src = N0.getOperand(0); EVT SrcVT = Src.getValueType(); bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP; bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT; // We can safely assume the conversion won't overflow the output range, // because (for example) (uint8_t)18293.f is undefined behavior. // Since we can assume the conversion won't overflow, our decision as to // whether the input will fit in the float should depend on the minimum // of the input range and output range. // This means this is also safe for a signed input and unsigned output, since // a negative input would lead to undefined behavior. unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned; unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned; unsigned ActualSize = std::min(InputSize, OutputSize); const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType()); // We can only fold away the float conversion if the input range can be // represented exactly in the float range. if (APFloat::semanticsPrecision(sem) >= ActualSize) { if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) { unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; return DAG.getNode(ExtOp, SDLoc(N), VT, Src); } if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits()) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src); return DAG.getBitcast(VT, Src); } return SDValue(); } SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (fp_to_sint c1fp) -> c1 if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0); return FoldIntToFPToInt(N, DAG); } SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (fp_to_uint c1fp) -> c1 if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0); return FoldIntToFPToInt(N, DAG); } SDValue DAGCombiner::visitFP_ROUND(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ConstantFPSDNode *N0CFP = dyn_cast(N0); EVT VT = N->getValueType(0); // fold (fp_round c1fp) -> c1fp if (N0CFP) return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1); // fold (fp_round (fp_extend x)) -> x if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType()) return N0.getOperand(0); // fold (fp_round (fp_round x)) -> (fp_round x) if (N0.getOpcode() == ISD::FP_ROUND) { const bool NIsTrunc = N->getConstantOperandVal(1) == 1; const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1; // Skip this folding if it results in an fp_round from f80 to f16. // // f80 to f16 always generates an expensive (and as yet, unimplemented) // libcall to __truncxfhf2 instead of selecting native f16 conversion // instructions from f32 or f64. Moreover, the first (value-preserving) // fp_round from f80 to either f32 or f64 may become a NOP in platforms like // x86. if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16) return SDValue(); // If the first fp_round isn't a value preserving truncation, it might // introduce a tie in the second fp_round, that wouldn't occur in the // single-step fp_round we want to fold to. // In other words, double rounding isn't the same as rounding. // Also, this is a value preserving truncation iff both fp_round's are. if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) { SDLoc DL(N); return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0), DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL)); } } // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y) if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) { SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT, N0.getOperand(0), N1); AddToWorklist(Tmp.getNode()); return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, Tmp, N0.getOperand(1)); } if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; return SDValue(); } SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT EVT = cast(N->getOperand(1))->getVT(); ConstantFPSDNode *N0CFP = dyn_cast(N0); // fold (fp_round_inreg c1fp) -> c1fp if (N0CFP && isTypeLegal(EVT)) { SDLoc DL(N); SDValue Round = DAG.getConstantFP(*N0CFP->getConstantFPValue(), DL, EVT); return DAG.getNode(ISD::FP_EXTEND, DL, VT, Round); } return SDValue(); } SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded. if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND) return SDValue(); // fold (fp_extend c1fp) -> c1fp if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0); // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op) if (N0.getOpcode() == ISD::FP16_TO_FP && TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal) return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0)); // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the // value of X. if (N0.getOpcode() == ISD::FP_ROUND && N0.getConstantOperandVal(1) == 1) { SDValue In = N0.getOperand(0); if (In.getValueType() == VT) return In; if (VT.bitsLT(In.getValueType())) return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, In, N0.getOperand(1)); return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In); } // fold (fpext (load x)) -> (fpext (fptrunc (extload x))) if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) { LoadSDNode *LN0 = cast(N0); SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT, LN0->getChain(), LN0->getBasePtr(), N0.getValueType(), LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad, DAG.getIntPtrConstant(1, SDLoc(N0))), ExtLoad.getValue(1)); return SDValue(N, 0); // Return N so it doesn't get rechecked! } if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N)) return NewVSel; return SDValue(); } SDValue DAGCombiner::visitFCEIL(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (fceil c1) -> fceil(c1) if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0); return SDValue(); } SDValue DAGCombiner::visitFTRUNC(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ftrunc c1) -> ftrunc(c1) if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0); // fold ftrunc (known rounded int x) -> x // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is // likely to be generated to extract integer from a rounded floating value. switch (N0.getOpcode()) { default: break; case ISD::FRINT: case ISD::FTRUNC: case ISD::FNEARBYINT: case ISD::FFLOOR: case ISD::FCEIL: return N0; } return SDValue(); } SDValue DAGCombiner::visitFFLOOR(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (ffloor c1) -> ffloor(c1) if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0); return SDValue(); } // FIXME: FNEG and FABS have a lot in common; refactor. SDValue DAGCombiner::visitFNEG(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // Constant fold FNEG. if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0); if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(), &DAG.getTarget().Options)) return GetNegatedExpression(N0, DAG, LegalOperations); // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading // constant pool values. if (!TLI.isFNegFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse()) { SDValue Int = N0.getOperand(0); EVT IntVT = Int.getValueType(); if (IntVT.isInteger() && !IntVT.isVector()) { APInt SignMask; if (N0.getValueType().isVector()) { // For a vector, get a mask such as 0x80... per scalar element // and splat it. SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits()); SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); } else { // For a scalar, just generate 0x80... SignMask = APInt::getSignMask(IntVT.getSizeInBits()); } SDLoc DL0(N0); Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int, DAG.getConstant(SignMask, DL0, IntVT)); AddToWorklist(Int.getNode()); return DAG.getBitcast(VT, Int); } } // (fneg (fmul c, x)) -> (fmul -c, x) if (N0.getOpcode() == ISD::FMUL && (N0.getNode()->hasOneUse() || !TLI.isFNegFree(VT))) { ConstantFPSDNode *CFP1 = dyn_cast(N0.getOperand(1)); if (CFP1) { APFloat CVal = CFP1->getValueAPF(); CVal.changeSign(); if (Level >= AfterLegalizeDAG && (TLI.isFPImmLegal(CVal, VT) || TLI.isOperationLegal(ISD::ConstantFP, VT))) return DAG.getNode( ISD::FMUL, SDLoc(N), VT, N0.getOperand(0), DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0.getOperand(1)), N0->getFlags()); } } return SDValue(); } SDValue DAGCombiner::visitFMINNUM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); const APFloat &C1 = N1CFP->getValueAPF(); return DAG.getConstantFP(minnum(C0, C1), SDLoc(N), VT); } // Canonicalize to constant on RHS. if (isConstantFPBuildVectorOrConstantFP(N0) && !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, N1, N0); return SDValue(); } SDValue DAGCombiner::visitFMAXNUM(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getValueType(0); const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0); const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1); if (N0CFP && N1CFP) { const APFloat &C0 = N0CFP->getValueAPF(); const APFloat &C1 = N1CFP->getValueAPF(); return DAG.getConstantFP(maxnum(C0, C1), SDLoc(N), VT); } // Canonicalize to constant on RHS. if (isConstantFPBuildVectorOrConstantFP(N0) && !isConstantFPBuildVectorOrConstantFP(N1)) return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, N1, N0); return SDValue(); } SDValue DAGCombiner::visitFABS(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); // fold (fabs c1) -> fabs(c1) if (isConstantFPBuildVectorOrConstantFP(N0)) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0); // fold (fabs (fabs x)) -> (fabs x) if (N0.getOpcode() == ISD::FABS) return N->getOperand(0); // fold (fabs (fneg x)) -> (fabs x) // fold (fabs (fcopysign x, y)) -> (fabs x) if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN) return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0)); // Transform fabs(bitconvert(x)) -> bitconvert(x & ~sign) to avoid loading // constant pool values. if (!TLI.isFAbsFree(VT) && N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse()) { SDValue Int = N0.getOperand(0); EVT IntVT = Int.getValueType(); if (IntVT.isInteger() && !IntVT.isVector()) { APInt SignMask; if (N0.getValueType().isVector()) { // For a vector, get a mask such as 0x7f... per scalar element // and splat it. SignMask = ~APInt::getSignMask(N0.getScalarValueSizeInBits()); SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); } else { // For a scalar, just generate 0x7f... SignMask = ~APInt::getSignMask(IntVT.getSizeInBits()); } SDLoc DL(N0); Int = DAG.getNode(ISD::AND, DL, IntVT, Int, DAG.getConstant(SignMask, DL, IntVT)); AddToWorklist(Int.getNode()); return DAG.getBitcast(N->getValueType(0), Int); } } return SDValue(); } SDValue DAGCombiner::visitBRCOND(SDNode *N) { SDValue Chain = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); // If N is a constant we could fold this into a fallthrough or unconditional // branch. However that doesn't happen very often in normal code, because // Instcombine/SimplifyCFG should have handled the available opportunities. // If we did this folding here, it would be necessary to update the // MachineBasicBlock CFG, which is awkward. // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal // on the target. if (N1.getOpcode() == ISD::SETCC && TLI.isOperationLegalOrCustom(ISD::BR_CC, N1.getOperand(0).getValueType())) { return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, Chain, N1.getOperand(2), N1.getOperand(0), N1.getOperand(1), N2); } if (N1.hasOneUse()) { if (SDValue NewN1 = rebuildSetCC(N1)) return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, NewN1, N2); } return SDValue(); } SDValue DAGCombiner::rebuildSetCC(SDValue N) { if (N.getOpcode() == ISD::SRL || (N.getOpcode() == ISD::TRUNCATE && (N.getOperand(0).hasOneUse() && N.getOperand(0).getOpcode() == ISD::SRL))) { // Look pass the truncate. if (N.getOpcode() == ISD::TRUNCATE) N = N.getOperand(0); // Match this pattern so that we can generate simpler code: // // %a = ... // %b = and i32 %a, 2 // %c = srl i32 %b, 1 // brcond i32 %c ... // // into // // %a = ... // %b = and i32 %a, 2 // %c = setcc eq %b, 0 // brcond %c ... // // This applies only when the AND constant value has one bit set and the // SRL constant is equal to the log2 of the AND constant. The back-end is // smart enough to convert the result into a TEST/JMP sequence. SDValue Op0 = N.getOperand(0); SDValue Op1 = N.getOperand(1); if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) { SDValue AndOp1 = Op0.getOperand(1); if (AndOp1.getOpcode() == ISD::Constant) { const APInt &AndConst = cast(AndOp1)->getAPIntValue(); if (AndConst.isPowerOf2() && cast(Op1)->getAPIntValue() == AndConst.logBase2()) { SDLoc DL(N); return DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()), Op0, DAG.getConstant(0, DL, Op0.getValueType()), ISD::SETNE); } } } } // Transform br(xor(x, y)) -> br(x != y) // Transform br(xor(xor(x,y), 1)) -> br (x == y) if (N.getOpcode() == ISD::XOR) { // Because we may call this on a speculatively constructed // SimplifiedSetCC Node, we need to simplify this node first. // Ideally this should be folded into SimplifySetCC and not // here. For now, grab a handle to N so we don't lose it from // replacements interal to the visit. HandleSDNode XORHandle(N); while (N.getOpcode() == ISD::XOR) { SDValue Tmp = visitXOR(N.getNode()); // No simplification done. if (!Tmp.getNode()) break; // Returning N is form in-visit replacement that may invalidated // N. Grab value from Handle. if (Tmp.getNode() == N.getNode()) N = XORHandle.getValue(); else // Node simplified. Try simplifying again. N = Tmp; } if (N.getOpcode() != ISD::XOR) return N; SDNode *TheXor = N.getNode(); SDValue Op0 = TheXor->getOperand(0); SDValue Op1 = TheXor->getOperand(1); if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) { bool Equal = false; if (isOneConstant(Op0) && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR) { TheXor = Op0.getNode(); Equal = true; } EVT SetCCVT = N.getValueType(); if (LegalTypes) SetCCVT = getSetCCResultType(SetCCVT); // Replace the uses of XOR with SETCC return DAG.getSetCC(SDLoc(TheXor), SetCCVT, Op0, Op1, Equal ? ISD::SETEQ : ISD::SETNE); } } return SDValue(); } // Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB. // SDValue DAGCombiner::visitBR_CC(SDNode *N) { CondCodeSDNode *CC = cast(N->getOperand(1)); SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3); // If N is a constant we could fold this into a fallthrough or unconditional // branch. However that doesn't happen very often in normal code, because // Instcombine/SimplifyCFG should have handled the available opportunities. // If we did this folding here, it would be necessary to update the // MachineBasicBlock CFG, which is awkward. // Use SimplifySetCC to simplify SETCC's. SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()), CondLHS, CondRHS, CC->get(), SDLoc(N), false); if (Simp.getNode()) AddToWorklist(Simp.getNode()); // fold to a simpler setcc if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC) return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, N->getOperand(0), Simp.getOperand(2), Simp.getOperand(0), Simp.getOperand(1), N->getOperand(4)); return SDValue(); } /// Return true if 'Use' is a load or a store that uses N as its base pointer /// and that N may be folded in the load / store addressing mode. static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG, const TargetLowering &TLI) { EVT VT; unsigned AS; if (LoadSDNode *LD = dyn_cast(Use)) { if (LD->isIndexed() || LD->getBasePtr().getNode() != N) return false; VT = LD->getMemoryVT(); AS = LD->getAddressSpace(); } else if (StoreSDNode *ST = dyn_cast(Use)) { if (ST->isIndexed() || ST->getBasePtr().getNode() != N) return false; VT = ST->getMemoryVT(); AS = ST->getAddressSpace(); } else return false; TargetLowering::AddrMode AM; if (N->getOpcode() == ISD::ADD) { ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); if (Offset) // [reg +/- imm] AM.BaseOffs = Offset->getSExtValue(); else // [reg +/- reg] AM.Scale = 1; } else if (N->getOpcode() == ISD::SUB) { ConstantSDNode *Offset = dyn_cast(N->getOperand(1)); if (Offset) // [reg +/- imm] AM.BaseOffs = -Offset->getSExtValue(); else // [reg +/- reg] AM.Scale = 1; } else return false; return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, VT.getTypeForEVT(*DAG.getContext()), AS); } /// Try turning a load/store into a pre-indexed load/store when the base /// pointer is an add or subtract and it has other uses besides the load/store. /// After the transformation, the new indexed load/store has effectively folded /// the add/subtract in and all of its other uses are redirected to the /// new load/store. bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { if (Level < AfterLegalizeDAG) return false; bool isLoad = true; SDValue Ptr; EVT VT; if (LoadSDNode *LD = dyn_cast(N)) { if (LD->isIndexed()) return false; VT = LD->getMemoryVT(); if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) && !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT)) return false; Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { if (ST->isIndexed()) return false; VT = ST->getMemoryVT(); if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) && !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT)) return false; Ptr = ST->getBasePtr(); isLoad = false; } else { return false; } // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail // out. There is no reason to make this a preinc/predec. if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) || Ptr.getNode()->hasOneUse()) return false; // Ask the target to do addressing mode selection. SDValue BasePtr; SDValue Offset; ISD::MemIndexedMode AM = ISD::UNINDEXED; if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG)) return false; // Backends without true r+i pre-indexed forms may need to pass a // constant base with a variable offset so that constant coercion // will work with the patterns in canonical form. bool Swapped = false; if (isa(BasePtr)) { std::swap(BasePtr, Offset); Swapped = true; } // Don't create a indexed load / store with zero offset. if (isNullConstant(Offset)) return false; // Try turning it into a pre-indexed load / store except when: // 1) The new base ptr is a frame index. // 2) If N is a store and the new base ptr is either the same as or is a // predecessor of the value being stored. // 3) Another use of old base ptr is a predecessor of N. If ptr is folded // that would create a cycle. // 4) All uses are load / store ops that use it as old base ptr. // Check #1. Preinc'ing a frame index would require copying the stack pointer // (plus the implicit offset) to a register to preinc anyway. if (isa(BasePtr) || isa(BasePtr)) return false; // Check #2. if (!isLoad) { SDValue Val = cast(N)->getValue(); if (Val == BasePtr || BasePtr.getNode()->isPredecessorOf(Val.getNode())) return false; } // Caches for hasPredecessorHelper. SmallPtrSet Visited; SmallVector Worklist; Worklist.push_back(N); // If the offset is a constant, there may be other adds of constants that // can be folded with this one. We should do this to avoid having to keep // a copy of the original base pointer. SmallVector OtherUses; if (isa(Offset)) for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(), UE = BasePtr.getNode()->use_end(); UI != UE; ++UI) { SDUse &Use = UI.getUse(); // Skip the use that is Ptr and uses of other results from BasePtr's // node (important for nodes that return multiple results). if (Use.getUser() == Ptr.getNode() || Use != BasePtr) continue; if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist)) continue; if (Use.getUser()->getOpcode() != ISD::ADD && Use.getUser()->getOpcode() != ISD::SUB) { OtherUses.clear(); break; } SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1); if (!isa(Op1)) { OtherUses.clear(); break; } // FIXME: In some cases, we can be smarter about this. if (Op1.getValueType() != Offset.getValueType()) { OtherUses.clear(); break; } OtherUses.push_back(Use.getUser()); } if (Swapped) std::swap(BasePtr, Offset); // Now check for #3 and #4. bool RealUse = false; for (SDNode *Use : Ptr.getNode()->uses()) { if (Use == N) continue; if (SDNode::hasPredecessorHelper(Use, Visited, Worklist)) return false; // If Ptr may be folded in addressing mode of other use, then it's // not profitable to do this transformation. if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI)) RealUse = true; } if (!RealUse) return false; SDValue Result; if (isLoad) Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM); else Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM); ++PreIndexedNodes; ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); if (isLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); } else { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); } // Finally, since the node is now dead, remove it from the graph. deleteAndRecombine(N); if (Swapped) std::swap(BasePtr, Offset); // Replace other uses of BasePtr that can be updated to use Ptr for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) { unsigned OffsetIdx = 1; if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode()) OffsetIdx = 0; assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() == BasePtr.getNode() && "Expected BasePtr operand"); // We need to replace ptr0 in the following expression: // x0 * offset0 + y0 * ptr0 = t0 // knowing that // x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store) // // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the // indexed load/store and the expression that needs to be re-written. // // Therefore, we have: // t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1 ConstantSDNode *CN = cast(OtherUses[i]->getOperand(OffsetIdx)); int X0, X1, Y0, Y1; const APInt &Offset0 = CN->getAPIntValue(); APInt Offset1 = cast(Offset)->getAPIntValue(); X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1; Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1; X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1; Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1; unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD; APInt CNV = Offset0; if (X0 < 0) CNV = -CNV; if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1; else CNV = CNV - Offset1; SDLoc DL(OtherUses[i]); // We can now generate the new expression. SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0)); SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0); SDValue NewUse = DAG.getNode(Opcode, DL, OtherUses[i]->getValueType(0), NewOp1, NewOp2); DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse); deleteAndRecombine(OtherUses[i]); } // Replace the uses of Ptr with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0)); deleteAndRecombine(Ptr.getNode()); AddToWorklist(Result.getNode()); return true; } /// Try to combine a load/store with a add/sub of the base pointer node into a /// post-indexed load/store. The transformation folded the add/subtract into the /// new indexed load/store effectively and all of its uses are redirected to the /// new load/store. bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { if (Level < AfterLegalizeDAG) return false; bool isLoad = true; SDValue Ptr; EVT VT; if (LoadSDNode *LD = dyn_cast(N)) { if (LD->isIndexed()) return false; VT = LD->getMemoryVT(); if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) && !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT)) return false; Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { if (ST->isIndexed()) return false; VT = ST->getMemoryVT(); if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) && !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT)) return false; Ptr = ST->getBasePtr(); isLoad = false; } else { return false; } if (Ptr.getNode()->hasOneUse()) return false; for (SDNode *Op : Ptr.getNode()->uses()) { if (Op == N || (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)) continue; SDValue BasePtr; SDValue Offset; ISD::MemIndexedMode AM = ISD::UNINDEXED; if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) { // Don't create a indexed load / store with zero offset. if (isNullConstant(Offset)) continue; // Try turning it into a post-indexed load / store except when // 1) All uses are load / store ops that use it as base ptr (and // it may be folded as addressing mmode). // 2) Op must be independent of N, i.e. Op is neither a predecessor // nor a successor of N. Otherwise, if Op is folded that would // create a cycle. if (isa(BasePtr) || isa(BasePtr)) continue; // Check for #1. bool TryNext = false; for (SDNode *Use : BasePtr.getNode()->uses()) { if (Use == Ptr.getNode()) continue; // If all the uses are load / store addresses, then don't do the // transformation. if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){ bool RealUse = false; for (SDNode *UseUse : Use->uses()) { if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) RealUse = true; } if (!RealUse) { TryNext = true; break; } } } if (TryNext) continue; // Check for #2 if (!Op->isPredecessorOf(N) && !N->isPredecessorOf(Op)) { SDValue Result = isLoad ? DAG.getIndexedLoad(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM) : DAG.getIndexedStore(SDValue(N,0), SDLoc(N), BasePtr, Offset, AM); ++PostIndexedNodes; ++NodesCombined; LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); dbgs() << '\n'); WorklistRemover DeadNodes(*this); if (isLoad) { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); } else { DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); } // Finally, since the node is now dead, remove it from the graph. deleteAndRecombine(N); // Replace the uses of Use with uses of the updated base value. DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), Result.getValue(isLoad ? 1 : 0)); deleteAndRecombine(Op); return true; } } } return false; } /// Return the base-pointer arithmetic from an indexed \p LD. SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) { ISD::MemIndexedMode AM = LD->getAddressingMode(); assert(AM != ISD::UNINDEXED); SDValue BP = LD->getOperand(1); SDValue Inc = LD->getOperand(2); // Some backends use TargetConstants for load offsets, but don't expect // TargetConstants in general ADD nodes. We can convert these constants into // regular Constants (if the constant is not opaque). assert((Inc.getOpcode() != ISD::TargetConstant || !cast(Inc)->isOpaque()) && "Cannot split out indexing using opaque target constants"); if (Inc.getOpcode() == ISD::TargetConstant) { ConstantSDNode *ConstInc = cast(Inc); Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc), ConstInc->getValueType(0)); } unsigned Opc = (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB); return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc); } SDValue DAGCombiner::visitLOAD(SDNode *N) { LoadSDNode *LD = cast(N); SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); // If load is not volatile and there are no uses of the loaded value (and // the updated indexed value in case of indexed loads), change uses of the // chain value into uses of the chain input (i.e. delete the dead load). if (!LD->isVolatile()) { if (N->getValueType(1) == MVT::Other) { // Unindexed loads. if (!N->hasAnyUseOfValue(0)) { // It's not safe to use the two value CombineTo variant here. e.g. // v1, chain2 = load chain1, loc // v2, chain3 = load chain2, loc // v3 = add v2, c // Now we replace use of chain2 with chain1. This makes the second load // isomorphic to the one we are deleting, and thus makes this load live. LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG); dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG); dbgs() << "\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); AddUsersToWorklist(Chain.getNode()); if (N->use_empty()) deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } else { // Indexed loads. assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?"); // If this load has an opaque TargetConstant offset, then we cannot split // the indexing into an add/sub directly (that TargetConstant may not be // valid for a different type of node, and we cannot convert an opaque // target constant into a regular constant). bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant && cast(LD->getOperand(2))->isOpaque(); if (!N->hasAnyUseOfValue(0) && ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) { SDValue Undef = DAG.getUNDEF(N->getValueType(0)); SDValue Index; if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) { Index = SplitIndexingFromLoad(LD); // Try to fold the base pointer arithmetic into subsequent loads and // stores. AddUsersToWorklist(N); } else Index = DAG.getUNDEF(N->getValueType(1)); LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG); dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG); dbgs() << " and 2 other values\n"); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain); deleteAndRecombine(N); return SDValue(N, 0); // Return N so it doesn't get rechecked! } } } // If this load is directly stored, replace the load value with the stored // value. // TODO: Handle store large -> read small portion. // TODO: Handle TRUNCSTORE/LOADEXT if (OptLevel != CodeGenOpt::None && ISD::isNormalLoad(N) && !LD->isVolatile()) { if (ISD::isNON_TRUNCStore(Chain.getNode())) { StoreSDNode *PrevST = cast(Chain); if (PrevST->getBasePtr() == Ptr && PrevST->getValue().getValueType() == N->getValueType(0)) return CombineTo(N, PrevST->getOperand(1), Chain); } } // Try to infer better alignment information than the load already has. if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { if (Align > LD->getAlignment() && LD->getSrcValueOffset() % Align == 0) { SDValue NewLoad = DAG.getExtLoad( LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr, LD->getPointerInfo(), LD->getMemoryVT(), Align, LD->getMemOperand()->getFlags(), LD->getAAInfo()); // NewLoad will always be N as we are only refining the alignment assert(NewLoad.getNode() == N); (void)NewLoad; } } } if (LD->isUnindexed()) { // Walk up chain skipping non-aliasing memory nodes. SDValue BetterChain = FindBetterChain(N, Chain); // If there is a better chain. if (Chain != BetterChain) { SDValue ReplLoad; // Replace the chain to void dependency. if (LD->getExtensionType() == ISD::NON_EXTLOAD) { ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD), BetterChain, Ptr, LD->getMemOperand()); } else { ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), LD->getValueType(0), BetterChain, Ptr, LD->getMemoryVT(), LD->getMemOperand()); } // Create token factor to keep old chain connected. SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Chain, ReplLoad.getValue(1)); // Replace uses with load result and token factor return CombineTo(N, ReplLoad.getValue(0), Token); } } // Try transforming N to an indexed load. if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); // Try to slice up N to more direct loads if the slices are mapped to // different register banks or pairing can take place. if (SliceUpLoad(N)) return SDValue(N, 0); return SDValue(); } namespace { /// Helper structure used to slice a load in smaller loads. /// Basically a slice is obtained from the following sequence: /// Origin = load Ty1, Base /// Shift = srl Ty1 Origin, CstTy Amount /// Inst = trunc Shift to Ty2 /// /// Then, it will be rewritten into: /// Slice = load SliceTy, Base + SliceOffset /// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2 /// /// SliceTy is deduced from the number of bits that are actually used to /// build Inst. struct LoadedSlice { /// Helper structure used to compute the cost of a slice. struct Cost { /// Are we optimizing for code size. bool ForCodeSize; /// Various cost. unsigned Loads = 0; unsigned Truncates = 0; unsigned CrossRegisterBanksCopies = 0; unsigned ZExts = 0; unsigned Shift = 0; Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize) {} /// Get the cost of one isolated slice. Cost(const LoadedSlice &LS, bool ForCodeSize = false) : ForCodeSize(ForCodeSize), Loads(1) { EVT TruncType = LS.Inst->getValueType(0); EVT LoadedType = LS.getLoadedType(); if (TruncType != LoadedType && !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType)) ZExts = 1; } /// Account for slicing gain in the current cost. /// Slicing provide a few gains like removing a shift or a /// truncate. This method allows to grow the cost of the original /// load with the gain from this slice. void addSliceGain(const LoadedSlice &LS) { // Each slice saves a truncate. const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo(); if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(), LS.Inst->getValueType(0))) ++Truncates; // If there is a shift amount, this slice gets rid of it. if (LS.Shift) ++Shift; // If this slice can merge a cross register bank copy, account for it. if (LS.canMergeExpensiveCrossRegisterBankCopy()) ++CrossRegisterBanksCopies; } Cost &operator+=(const Cost &RHS) { Loads += RHS.Loads; Truncates += RHS.Truncates; CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies; ZExts += RHS.ZExts; Shift += RHS.Shift; return *this; } bool operator==(const Cost &RHS) const { return Loads == RHS.Loads && Truncates == RHS.Truncates && CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies && ZExts == RHS.ZExts && Shift == RHS.Shift; } bool operator!=(const Cost &RHS) const { return !(*this == RHS); } bool operator<(const Cost &RHS) const { // Assume cross register banks copies are as expensive as loads. // FIXME: Do we want some more target hooks? unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies; unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies; // Unless we are optimizing for code size, consider the // expensive operation first. if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS) return ExpensiveOpsLHS < ExpensiveOpsRHS; return (Truncates + ZExts + Shift + ExpensiveOpsLHS) < (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS); } bool operator>(const Cost &RHS) const { return RHS < *this; } bool operator<=(const Cost &RHS) const { return !(RHS < *this); } bool operator>=(const Cost &RHS) const { return !(*this < RHS); } }; // The last instruction that represent the slice. This should be a // truncate instruction. SDNode *Inst; // The original load instruction. LoadSDNode *Origin; // The right shift amount in bits from the original load. unsigned Shift; // The DAG from which Origin came from. // This is used to get some contextual information about legal types, etc. SelectionDAG *DAG; LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr, unsigned Shift = 0, SelectionDAG *DAG = nullptr) : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {} /// Get the bits used in a chunk of bits \p BitWidth large. /// \return Result is \p BitWidth and has used bits set to 1 and /// not used bits set to 0. APInt getUsedBits() const { // Reproduce the trunc(lshr) sequence: // - Start from the truncated value. // - Zero extend to the desired bit width. // - Shift left. assert(Origin && "No original load to compare against."); unsigned BitWidth = Origin->getValueSizeInBits(0); assert(Inst && "This slice is not bound to an instruction"); assert(Inst->getValueSizeInBits(0) <= BitWidth && "Extracted slice is bigger than the whole type!"); APInt UsedBits(Inst->getValueSizeInBits(0), 0); UsedBits.setAllBits(); UsedBits = UsedBits.zext(BitWidth); UsedBits <<= Shift; return UsedBits; } /// Get the size of the slice to be loaded in bytes. unsigned getLoadedSize() const { unsigned SliceSize = getUsedBits().countPopulation(); assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte."); return SliceSize / 8; } /// Get the type that will be loaded for this slice. /// Note: This may not be the final type for the slice. EVT getLoadedType() const { assert(DAG && "Missing context"); LLVMContext &Ctxt = *DAG->getContext(); return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8); } /// Get the alignment of the load used for this slice. unsigned getAlignment() const { unsigned Alignment = Origin->getAlignment(); unsigned Offset = getOffsetFromBase(); if (Offset != 0) Alignment = MinAlign(Alignment, Alignment + Offset); return Alignment; } /// Check if this slice can be rewritten with legal operations. bool isLegal() const { // An invalid slice is not legal. if (!Origin || !Inst || !DAG) return false; // Offsets are for indexed load only, we do not handle that. if (!Origin->getOffset().isUndef()) return false; const TargetLowering &TLI = DAG->getTargetLoweringInfo(); // Check that the type is legal. EVT SliceType = getLoadedType(); if (!TLI.isTypeLegal(SliceType)) return false; // Check that the load is legal for this type. if (!TLI.isOperationLegal(ISD::LOAD, SliceType)) return false; // Check that the offset can be computed. // 1. Check its type. EVT PtrType = Origin->getBasePtr().getValueType(); if (PtrType == MVT::Untyped || PtrType.isExtended()) return false; // 2. Check that it fits in the immediate. if (!TLI.isLegalAddImmediate(getOffsetFromBase())) return false; // 3. Check that the computation is legal. if (!TLI.isOperationLegal(ISD::ADD, PtrType)) return false; // Check that the zext is legal if it needs one. EVT TruncateType = Inst->getValueType(0); if (TruncateType != SliceType && !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType)) return false; return true; } /// Get the offset in bytes of this slice in the original chunk of /// bits. /// \pre DAG != nullptr. uint64_t getOffsetFromBase() const { assert(DAG && "Missing context."); bool IsBigEndian = DAG->getDataLayout().isBigEndian(); assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported."); uint64_t Offset = Shift / 8; unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8; assert(!(Origin->getValueSizeInBits(0) & 0x7) && "The size of the original loaded type is not a multiple of a" " byte."); // If Offset is bigger than TySizeInBytes, it means we are loading all // zeros. This should have been optimized before in the process. assert(TySizeInBytes > Offset && "Invalid shift amount for given loaded size"); if (IsBigEndian) Offset = TySizeInBytes - Offset - getLoadedSize(); return Offset; } /// Generate the sequence of instructions to load the slice /// represented by this object and redirect the uses of this slice to /// this new sequence of instructions. /// \pre this->Inst && this->Origin are valid Instructions and this /// object passed the legal check: LoadedSlice::isLegal returned true. /// \return The last instruction of the sequence used to load the slice. SDValue loadSlice() const { assert(Inst && Origin && "Unable to replace a non-existing slice."); const SDValue &OldBaseAddr = Origin->getBasePtr(); SDValue BaseAddr = OldBaseAddr; // Get the offset in that chunk of bytes w.r.t. the endianness. int64_t Offset = static_cast(getOffsetFromBase()); assert(Offset >= 0 && "Offset too big to fit in int64_t!"); if (Offset) { // BaseAddr = BaseAddr + Offset. EVT ArithType = BaseAddr.getValueType(); SDLoc DL(Origin); BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr, DAG->getConstant(Offset, DL, ArithType)); } // Create the type of the loaded slice according to its size. EVT SliceType = getLoadedType(); // Create the load for the slice. SDValue LastInst = DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr, Origin->getPointerInfo().getWithOffset(Offset), getAlignment(), Origin->getMemOperand()->getFlags()); // If the final type is not the same as the loaded type, this means that // we have to pad with zero. Create a zero extend for that. EVT FinalType = Inst->getValueType(0); if (SliceType != FinalType) LastInst = DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst); return LastInst; } /// Check if this slice can be merged with an expensive cross register /// bank copy. E.g., /// i = load i32 /// f = bitcast i32 i to float bool canMergeExpensiveCrossRegisterBankCopy() const { if (!Inst || !Inst->hasOneUse()) return false; SDNode *Use = *Inst->use_begin(); if (Use->getOpcode() != ISD::BITCAST) return false; assert(DAG && "Missing context"); const TargetLowering &TLI = DAG->getTargetLoweringInfo(); EVT ResVT = Use->getValueType(0); const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT()); const TargetRegisterClass *ArgRC = TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT()); if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT)) return false; // At this point, we know that we perform a cross-register-bank copy. // Check if it is expensive. const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo(); // Assume bitcasts are cheap, unless both register classes do not // explicitly share a common sub class. if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC)) return false; // Check if it will be merged with the load. // 1. Check the alignment constraint. unsigned RequiredAlignment = DAG->getDataLayout().getABITypeAlignment( ResVT.getTypeForEVT(*DAG->getContext())); if (RequiredAlignment > getAlignment()) return false; // 2. Check that the load is a legal operation for that type. if (!TLI.isOperationLegal(ISD::LOAD, ResVT)) return false; // 3. Check that we do not have a zext in the way. if (Inst->getValueType(0) != getLoadedType()) return false; return true; } }; } // end anonymous namespace /// Check that all bits set in \p UsedBits form a dense region, i.e., /// \p UsedBits looks like 0..0 1..1 0..0. static bool areUsedBitsDense(const APInt &UsedBits) { // If all the bits are one, this is dense! if (UsedBits.isAllOnesValue()) return true; // Get rid of the unused bits on the right. APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros()); // Get rid of the unused bits on the left. if (NarrowedUsedBits.countLeadingZeros()) NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits()); // Check that the chunk of bits is completely used. return NarrowedUsedBits.isAllOnesValue(); } /// Check whether or not \p First and \p Second are next to each other /// in memory. This means that there is no hole between the bits loaded /// by \p First and the bits loaded by \p Second. static bool areSlicesNextToEachOther(const LoadedSlice &First, const LoadedSlice &Second) { assert(First.Origin == Second.Origin && First.Origin && "Unable to match different memory origins."); APInt UsedBits = First.getUsedBits(); assert((UsedBits & Second.getUsedBits()) == 0 && "Slices are not supposed to overlap."); UsedBits |= Second.getUsedBits(); return areUsedBitsDense(UsedBits); } /// Adjust the \p GlobalLSCost according to the target /// paring capabilities and the layout of the slices. /// \pre \p GlobalLSCost should account for at least as many loads as /// there is in the slices in \p LoadedSlices. static void adjustCostForPairing(SmallVectorImpl &LoadedSlices, LoadedSlice::Cost &GlobalLSCost) { unsigned NumberOfSlices = LoadedSlices.size(); // If there is less than 2 elements, no pairing is possible. if (NumberOfSlices < 2) return; // Sort the slices so that elements that are likely to be next to each // other in memory are next to each other in the list. llvm::sort(LoadedSlices.begin(), LoadedSlices.end(), [](const LoadedSlice &LHS, const LoadedSlice &RHS) { assert(LHS.Origin == RHS.Origin && "Different bases not implemented."); return LHS.getOffsetFromBase() < RHS.getOffsetFromBase(); }); const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo(); // First (resp. Second) is the first (resp. Second) potentially candidate // to be placed in a paired load. const LoadedSlice *First = nullptr; const LoadedSlice *Second = nullptr; for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice, // Set the beginning of the pair. First = Second) { Second = &LoadedSlices[CurrSlice]; // If First is NULL, it means we start a new pair. // Get to the next slice. if (!First) continue; EVT LoadedType = First->getLoadedType(); // If the types of the slices are different, we cannot pair them. if (LoadedType != Second->getLoadedType()) continue; // Check if the target supplies paired loads for this type. unsigned RequiredAlignment = 0; if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) { // move to the next pair, this type is hopeless. Second = nullptr; continue; } // Check if we meet the alignment requirement. if (RequiredAlignment > First->getAlignment()) continue; // Check that both loads are next to each other in memory. if (!areSlicesNextToEachOther(*First, *Second)) continue; assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!"); --GlobalLSCost.Loads; // Move to the next pair. Second = nullptr; } } /// Check the profitability of all involved LoadedSlice. /// Currently, it is considered profitable if there is exactly two /// involved slices (1) which are (2) next to each other in memory, and /// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3). /// /// Note: The order of the elements in \p LoadedSlices may be modified, but not /// the elements themselves. /// /// FIXME: When the cost model will be mature enough, we can relax /// constraints (1) and (2). static bool isSlicingProfitable(SmallVectorImpl &LoadedSlices, const APInt &UsedBits, bool ForCodeSize) { unsigned NumberOfSlices = LoadedSlices.size(); if (StressLoadSlicing) return NumberOfSlices > 1; // Check (1). if (NumberOfSlices != 2) return false; // Check (2). if (!areUsedBitsDense(UsedBits)) return false; // Check (3). LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize); // The original code has one big load. OrigCost.Loads = 1; for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) { const LoadedSlice &LS = LoadedSlices[CurrSlice]; // Accumulate the cost of all the slices. LoadedSlice::Cost SliceCost(LS, ForCodeSize); GlobalSlicingCost += SliceCost; // Account as cost in the original configuration the gain obtained // with the current slices. OrigCost.addSliceGain(LS); } // If the target supports paired load, adjust the cost accordingly. adjustCostForPairing(LoadedSlices, GlobalSlicingCost); return OrigCost > GlobalSlicingCost; } /// If the given load, \p LI, is used only by trunc or trunc(lshr) /// operations, split it in the various pieces being extracted. /// /// This sort of thing is introduced by SROA. /// This slicing takes care not to insert overlapping loads. /// \pre LI is a simple load (i.e., not an atomic or volatile load). bool DAGCombiner::SliceUpLoad(SDNode *N) { if (Level < AfterLegalizeDAG) return false; LoadSDNode *LD = cast(N); if (LD->isVolatile() || !ISD::isNormalLoad(LD) || !LD->getValueType(0).isInteger()) return false; // Keep track of already used bits to detect overlapping values. // In that case, we will just abort the transformation. APInt UsedBits(LD->getValueSizeInBits(0), 0); SmallVector LoadedSlices; // Check if this load is used as several smaller chunks of bits. // Basically, look for uses in trunc or trunc(lshr) and record a new chain // of computation for each trunc. for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end(); UI != UIEnd; ++UI) { // Skip the uses of the chain. if (UI.getUse().getResNo() != 0) continue; SDNode *User = *UI; unsigned Shift = 0; // Check if this is a trunc(lshr). if (User->getOpcode() == ISD::SRL && User->hasOneUse() && isa(User->getOperand(1))) { Shift = User->getConstantOperandVal(1); User = *User->use_begin(); } // At this point, User is a Truncate, iff we encountered, trunc or // trunc(lshr). if (User->getOpcode() != ISD::TRUNCATE) return false; // The width of the type must be a power of 2 and greater than 8-bits. // Otherwise the load cannot be represented in LLVM IR. // Moreover, if we shifted with a non-8-bits multiple, the slice // will be across several bytes. We do not support that. unsigned Width = User->getValueSizeInBits(0); if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7)) return false; // Build the slice for this chain of computations. LoadedSlice LS(User, LD, Shift, &DAG); APInt CurrentUsedBits = LS.getUsedBits(); // Check if this slice overlaps with another. if ((CurrentUsedBits & UsedBits) != 0) return false; // Update the bits used globally. UsedBits |= CurrentUsedBits; // Check if the new slice would be legal. if (!LS.isLegal()) return false; // Record the slice. LoadedSlices.push_back(LS); } // Abort slicing if it does not seem to be profitable. if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize)) return false; ++SlicedLoads; // Rewrite each chain to use an independent load. // By construction, each chain can be represented by a unique load. // Prepare the argument for the new token factor for all the slices. SmallVector ArgChains; for (SmallVectorImpl::const_iterator LSIt = LoadedSlices.begin(), LSItEnd = LoadedSlices.end(); LSIt != LSItEnd; ++LSIt) { SDValue SliceInst = LSIt->loadSlice(); CombineTo(LSIt->Inst, SliceInst, true); if (SliceInst.getOpcode() != ISD::LOAD) SliceInst = SliceInst.getOperand(0); assert(SliceInst->getOpcode() == ISD::LOAD && "It takes more than a zext to get to the loaded slice!!"); ArgChains.push_back(SliceInst.getValue(1)); } SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, ArgChains); DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain); AddToWorklist(Chain.getNode()); return true; } /// Check to see if V is (and load (ptr), imm), where the load is having /// specific bytes cleared out. If so, return the byte size being masked out /// and the shift amount. static std::pair CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) { std::pair Result(0, 0); // Check for the structure we're looking for. if (V->getOpcode() != ISD::AND || !isa(V->getOperand(1)) || !ISD::isNormalLoad(V->getOperand(0).getNode())) return Result; // Check the chain and pointer. LoadSDNode *LD = cast(V->getOperand(0)); if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer. // This only handles simple types. if (V.getValueType() != MVT::i16 && V.getValueType() != MVT::i32 && V.getValueType() != MVT::i64) return Result; // Check the constant mask. Invert it so that the bits being masked out are // 0 and the bits being kept are 1. Use getSExtValue so that leading bits // follow the sign bit for uniformity. uint64_t NotMask = ~cast(V->getOperand(1))->getSExtValue(); unsigned NotMaskLZ = countLeadingZeros(NotMask); if (NotMaskLZ & 7) return Result; // Must be multiple of a byte. unsigned NotMaskTZ = countTrailingZeros(NotMask); if (NotMaskTZ & 7) return Result; // Must be multiple of a byte. if (NotMaskLZ == 64) return Result; // All zero mask. // See if we have a continuous run of bits. If so, we have 0*1+0* if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64) return Result; // Adjust NotMaskLZ down to be from the actual size of the int instead of i64. if (V.getValueType() != MVT::i64 && NotMaskLZ) NotMaskLZ -= 64-V.getValueSizeInBits(); unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8; switch (MaskedBytes) { case 1: case 2: case 4: break; default: return Result; // All one mask, or 5-byte mask. } // Verify that the first bit starts at a multiple of mask so that the access // is aligned the same as the access width. if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result; // For narrowing to be valid, it must be the case that the load the // immediately preceeding memory operation before the store. if (LD == Chain.getNode()) ; // ok. else if (Chain->getOpcode() == ISD::TokenFactor && SDValue(LD, 1).hasOneUse()) { // LD has only 1 chain use so they are no indirect dependencies. bool isOk = false; for (const SDValue &ChainOp : Chain->op_values()) if (ChainOp.getNode() == LD) { isOk = true; break; } if (!isOk) return Result; } else return Result; // Fail. Result.first = MaskedBytes; Result.second = NotMaskTZ/8; return Result; } /// Check to see if IVal is something that provides a value as specified by /// MaskInfo. If so, replace the specified store with a narrower store of /// truncated IVal. static SDNode * ShrinkLoadReplaceStoreWithStore(const std::pair &MaskInfo, SDValue IVal, StoreSDNode *St, DAGCombiner *DC) { unsigned NumBytes = MaskInfo.first; unsigned ByteShift = MaskInfo.second; SelectionDAG &DAG = DC->getDAG(); // Check to see if IVal is all zeros in the part being masked in by the 'or' // that uses this. If not, this is not a replacement. APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(), ByteShift*8, (ByteShift+NumBytes)*8); if (!DAG.MaskedValueIsZero(IVal, Mask)) return nullptr; // Check that it is legal on the target to do this. It is legal if the new // VT we're shrinking to (i8/i16/i32) is legal or we're still before type // legalization. MVT VT = MVT::getIntegerVT(NumBytes*8); if (!DC->isTypeLegal(VT)) return nullptr; // Okay, we can do this! Replace the 'St' store with a store of IVal that is // shifted by ByteShift and truncated down to NumBytes. if (ByteShift) { SDLoc DL(IVal); IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal, DAG.getConstant(ByteShift*8, DL, DC->getShiftAmountTy(IVal.getValueType()))); } // Figure out the offset for the store and the alignment of the access. unsigned StOffset; unsigned NewAlign = St->getAlignment(); if (DAG.getDataLayout().isLittleEndian()) StOffset = ByteShift; else StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes; SDValue Ptr = St->getBasePtr(); if (StOffset) { SDLoc DL(IVal); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(StOffset, DL, Ptr.getValueType())); NewAlign = MinAlign(NewAlign, StOffset); } // Truncate down to the new size. IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal); ++OpsNarrowed; return DAG .getStore(St->getChain(), SDLoc(St), IVal, Ptr, St->getPointerInfo().getWithOffset(StOffset), NewAlign) .getNode(); } /// Look for sequence of load / op / store where op is one of 'or', 'xor', and /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try /// narrowing the load and store if it would end up being a win for performance /// or code size. SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { StoreSDNode *ST = cast(N); if (ST->isVolatile()) return SDValue(); SDValue Chain = ST->getChain(); SDValue Value = ST->getValue(); SDValue Ptr = ST->getBasePtr(); EVT VT = Value.getValueType(); if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse()) return SDValue(); unsigned Opc = Value.getOpcode(); // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst // is a byte mask indicating a consecutive number of bytes, check to see if // Y is known to provide just those bytes. If so, we try to replace the // load + replace + store sequence with a single (narrower) store, which makes // the load dead. if (Opc == ISD::OR) { std::pair MaskedLoad; MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain); if (MaskedLoad.first) if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad, Value.getOperand(1), ST,this)) return SDValue(NewST, 0); // Or is commutative, so try swapping X and Y. MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain); if (MaskedLoad.first) if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad, Value.getOperand(0), ST,this)) return SDValue(NewST, 0); } if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) || Value.getOperand(1).getOpcode() != ISD::Constant) return SDValue(); SDValue N0 = Value.getOperand(0); if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && Chain == SDValue(N0.getNode(), 1)) { LoadSDNode *LD = cast(N0); if (LD->getBasePtr() != Ptr || LD->getPointerInfo().getAddrSpace() != ST->getPointerInfo().getAddrSpace()) return SDValue(); // Find the type to narrow it the load / op / store to. SDValue N1 = Value.getOperand(1); unsigned BitWidth = N1.getValueSizeInBits(); APInt Imm = cast(N1)->getAPIntValue(); if (Opc == ISD::AND) Imm ^= APInt::getAllOnesValue(BitWidth); if (Imm == 0 || Imm.isAllOnesValue()) return SDValue(); unsigned ShAmt = Imm.countTrailingZeros(); unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1; unsigned NewBW = NextPowerOf2(MSB - ShAmt); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); // The narrowing should be profitable, the load/store operation should be // legal (or custom) and the store size should be equal to the NewVT width. while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW || !TLI.isOperationLegalOrCustom(Opc, NewVT) || !TLI.isNarrowingProfitable(VT, NewVT))) { NewBW = NextPowerOf2(NewBW); NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); } if (NewBW >= BitWidth) return SDValue(); // If the lsb changed does not start at the type bitwidth boundary, // start at the previous one. if (ShAmt % NewBW) ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW; APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, std::min(BitWidth, ShAmt + NewBW)); if ((Imm & Mask) == Imm) { APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW); if (Opc == ISD::AND) NewImm ^= APInt::getAllOnesValue(NewBW); uint64_t PtrOff = ShAmt / 8; // For big endian targets, we need to adjust the offset to the pointer to // load the correct bytes. if (DAG.getDataLayout().isBigEndian()) PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff; unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff); Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext()); if (NewAlign < DAG.getDataLayout().getABITypeAlignment(NewVTTy)) return SDValue(); SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LD), Ptr.getValueType(), Ptr, DAG.getConstant(PtrOff, SDLoc(LD), Ptr.getValueType())); SDValue NewLD = DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr, LD->getPointerInfo().getWithOffset(PtrOff), NewAlign, LD->getMemOperand()->getFlags(), LD->getAAInfo()); SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD, DAG.getConstant(NewImm, SDLoc(Value), NewVT)); SDValue NewST = DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr, ST->getPointerInfo().getWithOffset(PtrOff), NewAlign); AddToWorklist(NewPtr.getNode()); AddToWorklist(NewLD.getNode()); AddToWorklist(NewVal.getNode()); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1)); ++OpsNarrowed; return NewST; } } return SDValue(); } /// For a given floating point load / store pair, if the load value isn't used /// by any other operations, then consider transforming the pair to integer /// load / store operations if the target deems the transformation profitable. SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) { StoreSDNode *ST = cast(N); SDValue Chain = ST->getChain(); SDValue Value = ST->getValue(); if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) && Value.hasOneUse() && Chain == SDValue(Value.getNode(), 1)) { LoadSDNode *LD = cast(Value); EVT VT = LD->getMemoryVT(); if (!VT.isFloatingPoint() || VT != ST->getMemoryVT() || LD->isNonTemporal() || ST->isNonTemporal() || LD->getPointerInfo().getAddrSpace() != 0 || ST->getPointerInfo().getAddrSpace() != 0) return SDValue(); EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); if (!TLI.isOperationLegal(ISD::LOAD, IntVT) || !TLI.isOperationLegal(ISD::STORE, IntVT) || !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) || !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT)) return SDValue(); unsigned LDAlign = LD->getAlignment(); unsigned STAlign = ST->getAlignment(); Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext()); unsigned ABIAlign = DAG.getDataLayout().getABITypeAlignment(IntVTTy); if (LDAlign < ABIAlign || STAlign < ABIAlign) return SDValue(); SDValue NewLD = DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LDAlign); SDValue NewST = DAG.getStore(NewLD.getValue(1), SDLoc(N), NewLD, ST->getBasePtr(), ST->getPointerInfo(), STAlign); AddToWorklist(NewLD.getNode()); AddToWorklist(NewST.getNode()); WorklistRemover DeadNodes(*this); DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1)); ++LdStFP2Int; return NewST; } return SDValue(); } // This is a helper function for visitMUL to check the profitability // of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2). // MulNode is the original multiply, AddNode is (add x, c1), // and ConstNode is c2. // // If the (add x, c1) has multiple uses, we could increase // the number of adds if we make this transformation. // It would only be worth doing this if we can remove a // multiply in the process. Check for that here. // To illustrate: // (A + c1) * c3 // (A + c2) * c3 // We're checking for cases where we have common "c3 * A" expressions. bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, SDValue &AddNode, SDValue &ConstNode) { APInt Val; // If the add only has one use, this would be OK to do. if (AddNode.getNode()->hasOneUse()) return true; // Walk all the users of the constant with which we're multiplying. for (SDNode *Use : ConstNode->uses()) { if (Use == MulNode) // This use is the one we're on right now. Skip it. continue; if (Use->getOpcode() == ISD::MUL) { // We have another multiply use. SDNode *OtherOp; SDNode *MulVar = AddNode.getOperand(0).getNode(); // OtherOp is what we're multiplying against the constant. if (Use->getOperand(0) == ConstNode) OtherOp = Use->getOperand(1).getNode(); else OtherOp = Use->getOperand(0).getNode(); // Check to see if multiply is with the same operand of our "add". // // ConstNode = CONST // Use = ConstNode * A <-- visiting Use. OtherOp is A. // ... // AddNode = (A + c1) <-- MulVar is A. // = AddNode * ConstNode <-- current visiting instruction. // // If we make this transformation, we will have a common // multiply (ConstNode * A) that we can save. if (OtherOp == MulVar) return true; // Now check to see if a future expansion will give us a common // multiply. // // ConstNode = CONST // AddNode = (A + c1) // ... = AddNode * ConstNode <-- current visiting instruction. // ... // OtherOp = (A + c2) // Use = OtherOp * ConstNode <-- visiting Use. // // If we make this transformation, we will have a common // multiply (CONST * A) after we also do the same transformation // to the "t2" instruction. if (OtherOp->getOpcode() == ISD::ADD && DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) && OtherOp->getOperand(0).getNode() == MulVar) return true; } } // Didn't find a case where this would be profitable. return false; } SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl &StoreNodes, unsigned NumStores) { SmallVector Chains; SmallPtrSet Visited; SDLoc StoreDL(StoreNodes[0].MemNode); for (unsigned i = 0; i < NumStores; ++i) { Visited.insert(StoreNodes[i].MemNode); } // don't include nodes that are children for (unsigned i = 0; i < NumStores; ++i) { if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0) Chains.push_back(StoreNodes[i].MemNode->getChain()); } assert(Chains.size() > 0 && "Chain should have generated a chain"); return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains); } bool DAGCombiner::MergeStoresOfConstantsOrVecElts( SmallVectorImpl &StoreNodes, EVT MemVT, unsigned NumStores, bool IsConstantSrc, bool UseVector, bool UseTrunc) { // Make sure we have something to merge. if (NumStores < 2) return false; // The latest Node in the DAG. SDLoc DL(StoreNodes[0].MemNode); int64_t ElementSizeBits = MemVT.getStoreSizeInBits(); unsigned SizeInBits = NumStores * ElementSizeBits; unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; EVT StoreTy; if (UseVector) { unsigned Elts = NumStores * NumMemElts; // Get the type for the merged vector store. StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); } else StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits); SDValue StoredVal; if (UseVector) { if (IsConstantSrc) { SmallVector BuildVector; for (unsigned I = 0; I != NumStores; ++I) { StoreSDNode *St = cast(StoreNodes[I].MemNode); SDValue Val = St->getValue(); // If constant is of the wrong type, convert it now. if (MemVT != Val.getValueType()) { Val = peekThroughBitcast(Val); // Deal with constants of wrong size. if (ElementSizeBits != Val.getValueSizeInBits()) { EVT IntMemVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits()); if (isa(Val)) { // Not clear how to truncate FP values. return false; } else if (auto *C = dyn_cast(Val)) Val = DAG.getConstant(C->getAPIntValue() .zextOrTrunc(Val.getValueSizeInBits()) .zextOrTrunc(ElementSizeBits), SDLoc(C), IntMemVT); } // Make sure correctly size type is the correct type. Val = DAG.getBitcast(MemVT, Val); } BuildVector.push_back(Val); } StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, DL, StoreTy, BuildVector); } else { SmallVector Ops; for (unsigned i = 0; i < NumStores; ++i) { StoreSDNode *St = cast(StoreNodes[i].MemNode); SDValue Val = peekThroughBitcast(St->getValue()); // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of // type MemVT. If the underlying value is not the correct // type, but it is an extraction of an appropriate vector we // can recast Val to be of the correct type. This may require // converting between EXTRACT_VECTOR_ELT and // EXTRACT_SUBVECTOR. if ((MemVT != Val.getValueType()) && (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT || Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) { SDValue Vec = Val.getOperand(0); EVT MemVTScalarTy = MemVT.getScalarType(); SDValue Idx = Val.getOperand(1); // We may need to add a bitcast here to get types to line up. if (MemVTScalarTy != Vec.getValueType()) { unsigned Elts = Vec.getValueType().getSizeInBits() / MemVTScalarTy.getSizeInBits(); if (Val.getValueType().isVector() && MemVT.isVector()) { unsigned IdxC = cast(Idx)->getZExtValue(); unsigned NewIdx = ((uint64_t)IdxC * MemVT.getVectorNumElements()) / Elts; Idx = DAG.getConstant(NewIdx, SDLoc(Val), Idx.getValueType()); } EVT NewVecTy = EVT::getVectorVT(*DAG.getContext(), MemVTScalarTy, Elts); Vec = DAG.getBitcast(NewVecTy, Vec); } auto OpC = (MemVT.isVector()) ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT; Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Idx); } Ops.push_back(Val); } // Build the extracted vector elements back into a vector. StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, DL, StoreTy, Ops); } } else { // We should always use a vector store when merging extracted vector // elements, so this path implies a store of constants. assert(IsConstantSrc && "Merged vector elements should use vector store"); APInt StoreInt(SizeInBits, 0); // Construct a single integer constant which is made of the smaller // constant inputs. bool IsLE = DAG.getDataLayout().isLittleEndian(); for (unsigned i = 0; i < NumStores; ++i) { unsigned Idx = IsLE ? (NumStores - 1 - i) : i; StoreSDNode *St = cast(StoreNodes[Idx].MemNode); SDValue Val = St->getValue(); Val = peekThroughBitcast(Val); StoreInt <<= ElementSizeBits; if (ConstantSDNode *C = dyn_cast(Val)) { StoreInt |= C->getAPIntValue() .zextOrTrunc(ElementSizeBits) .zextOrTrunc(SizeInBits); } else if (ConstantFPSDNode *C = dyn_cast(Val)) { StoreInt |= C->getValueAPF() .bitcastToAPInt() .zextOrTrunc(ElementSizeBits) .zextOrTrunc(SizeInBits); // If fp truncation is necessary give up for now. if (MemVT.getSizeInBits() != ElementSizeBits) return false; } else { llvm_unreachable("Invalid constant element type"); } } // Create the new Load and Store operations. StoredVal = DAG.getConstant(StoreInt, DL, StoreTy); } LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores); // make sure we use trunc store if it's necessary to be legal. SDValue NewStore; if (!UseTrunc) { NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), FirstInChain->getAlignment()); } else { // Must be realized as a trunc store EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType()); unsigned LegalizedStoreSize = LegalizedStoredValTy.getSizeInBits(); ConstantSDNode *C = cast(StoredVal); SDValue ExtendedStoreVal = DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL, LegalizedStoredValTy); NewStore = DAG.getTruncStore( NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/, FirstInChain->getAlignment(), FirstInChain->getMemOperand()->getFlags()); } // Replace all merged stores with the new store. for (unsigned i = 0; i < NumStores; ++i) CombineTo(StoreNodes[i].MemNode, NewStore); AddToWorklist(NewChain.getNode()); return true; } void DAGCombiner::getStoreMergeCandidates( StoreSDNode *St, SmallVectorImpl &StoreNodes, SDNode *&RootNode) { // This holds the base pointer, index, and the offset in bytes from the base // pointer. BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); EVT MemVT = St->getMemoryVT(); SDValue Val = peekThroughBitcast(St->getValue()); // We must have a base and an offset. if (!BasePtr.getBase().getNode()) return; // Do not handle stores to undef base pointers. if (BasePtr.getBase().isUndef()) return; bool IsConstantSrc = isa(Val) || isa(Val); bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT || Val.getOpcode() == ISD::EXTRACT_SUBVECTOR); bool IsLoadSrc = isa(Val); BaseIndexOffset LBasePtr; // Match on loadbaseptr if relevant. EVT LoadVT; if (IsLoadSrc) { auto *Ld = cast(Val); LBasePtr = BaseIndexOffset::match(Ld, DAG); LoadVT = Ld->getMemoryVT(); // Load and store should be the same type. if (MemVT != LoadVT) return; // Loads must only have one use. if (!Ld->hasNUsesOfValue(1, 0)) return; // The memory operands must not be volatile. if (Ld->isVolatile() || Ld->isIndexed()) return; } auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr, int64_t &Offset) -> bool { if (Other->isVolatile() || Other->isIndexed()) return false; SDValue Val = peekThroughBitcast(Other->getValue()); // Allow merging constants of different types as integers. bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT()) : Other->getMemoryVT() != MemVT; if (IsLoadSrc) { if (NoTypeMatch) return false; // The Load's Base Ptr must also match if (LoadSDNode *OtherLd = dyn_cast(Val)) { auto LPtr = BaseIndexOffset::match(OtherLd, DAG); if (LoadVT != OtherLd->getMemoryVT()) return false; // Loads must only have one use. if (!OtherLd->hasNUsesOfValue(1, 0)) return false; // The memory operands must not be volatile. if (OtherLd->isVolatile() || OtherLd->isIndexed()) return false; if (!(LBasePtr.equalBaseIndex(LPtr, DAG))) return false; } else return false; } if (IsConstantSrc) { if (NoTypeMatch) return false; if (!(isa(Val) || isa(Val))) return false; } if (IsExtractVecSrc) { // Do not merge truncated stores here. if (Other->isTruncatingStore()) return false; if (!MemVT.bitsEq(Val.getValueType())) return false; if (Val.getOpcode() != ISD::EXTRACT_VECTOR_ELT && Val.getOpcode() != ISD::EXTRACT_SUBVECTOR) return false; } Ptr = BaseIndexOffset::match(Other, DAG); return (BasePtr.equalBaseIndex(Ptr, DAG, Offset)); }; // We looking for a root node which is an ancestor to all mergable // stores. We search up through a load, to our root and then down // through all children. For instance we will find Store{1,2,3} if // St is Store1, Store2. or Store3 where the root is not a load // which always true for nonvolatile ops. TODO: Expand // the search to find all valid candidates through multiple layers of loads. // // Root // |-------|-------| // Load Load Store3 // | | // Store1 Store2 // // FIXME: We should be able to climb and // descend TokenFactors to find candidates as well. RootNode = St->getChain().getNode(); if (LoadSDNode *Ldn = dyn_cast(RootNode)) { RootNode = Ldn->getChain().getNode(); for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I) if (I.getOperandNo() == 0 && isa(*I)) // walk down chain for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2) if (I2.getOperandNo() == 0) if (StoreSDNode *OtherST = dyn_cast(*I2)) { BaseIndexOffset Ptr; int64_t PtrDiff; if (CandidateMatch(OtherST, Ptr, PtrDiff)) StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); } } else for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I) if (I.getOperandNo() == 0) if (StoreSDNode *OtherST = dyn_cast(*I)) { BaseIndexOffset Ptr; int64_t PtrDiff; if (CandidateMatch(OtherST, Ptr, PtrDiff)) StoreNodes.push_back(MemOpLink(OtherST, PtrDiff)); } } // We need to check that merging these stores does not cause a loop in // the DAG. Any store candidate may depend on another candidate // indirectly through its operand (we already consider dependencies // through the chain). Check in parallel by searching up from // non-chain operands of candidates. bool DAGCombiner::checkMergeStoreCandidatesForDependencies( SmallVectorImpl &StoreNodes, unsigned NumStores, SDNode *RootNode) { // FIXME: We should be able to truncate a full search of // predecessors by doing a BFS and keeping tabs the originating // stores from which worklist nodes come from in a similar way to // TokenFactor simplfication. SmallPtrSet Visited; SmallVector Worklist; // RootNode is a predecessor to all candidates so we need not search // past it. Add RootNode (peeking through TokenFactors). Do not count // these towards size check. Worklist.push_back(RootNode); while (!Worklist.empty()) { auto N = Worklist.pop_back_val(); if (N->getOpcode() == ISD::TokenFactor) { for (SDValue Op : N->ops()) Worklist.push_back(Op.getNode()); } Visited.insert(N); } // Don't count pruning nodes towards max. unsigned int Max = 1024 + Visited.size(); // Search Ops of store candidates. for (unsigned i = 0; i < NumStores; ++i) { SDNode *N = StoreNodes[i].MemNode; // Of the 4 Store Operands: // * Chain (Op 0) -> We have already considered these // in candidate selection and can be // safely ignored // * Value (Op 1) -> Cycles may happen (e.g. through load chains) // * Address (Op 2) -> Merged addresses may only vary by a fixed constant // and so no cycles are possible. // * (Op 3) -> appears to always be undef. Cannot be source of cycle. // // Thus we need only check predecessors of the value operands. auto *Op = N->getOperand(1).getNode(); if (Visited.insert(Op).second) Worklist.push_back(Op); } // Search through DAG. We can stop early if we find a store node. for (unsigned i = 0; i < NumStores; ++i) if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist, Max)) return false; return true; } bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { if (OptLevel == CodeGenOpt::None) return false; EVT MemVT = St->getMemoryVT(); int64_t ElementSizeBytes = MemVT.getStoreSize(); unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1; if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits) return false; bool NoVectors = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); // This function cannot currently deal with non-byte-sized memory sizes. if (ElementSizeBytes * 8 != MemVT.getSizeInBits()) return false; if (!MemVT.isSimple()) return false; // Perform an early exit check. Do not bother looking at stored values that // are not constants, loads, or extracted vector elements. SDValue StoredVal = peekThroughBitcast(St->getValue()); bool IsLoadSrc = isa(StoredVal); bool IsConstantSrc = isa(StoredVal) || isa(StoredVal); bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT || StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR); if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc) return false; SmallVector StoreNodes; SDNode *RootNode; // Find potential store merge candidates by searching through chain sub-DAG getStoreMergeCandidates(St, StoreNodes, RootNode); // Check if there is anything to merge. if (StoreNodes.size() < 2) return false; // Sort the memory operands according to their distance from the // base pointer. llvm::sort(StoreNodes.begin(), StoreNodes.end(), [](MemOpLink LHS, MemOpLink RHS) { return LHS.OffsetFromBase < RHS.OffsetFromBase; }); // Store Merge attempts to merge the lowest stores. This generally // works out as if successful, as the remaining stores are checked // after the first collection of stores is merged. However, in the // case that a non-mergeable store is found first, e.g., {p[-2], // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent // mergeable cases. To prevent this, we prune such stores from the // front of StoreNodes here. bool RV = false; while (StoreNodes.size() > 1) { unsigned StartIdx = 0; while ((StartIdx + 1 < StoreNodes.size()) && StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes != StoreNodes[StartIdx + 1].OffsetFromBase) ++StartIdx; // Bail if we don't have enough candidates to merge. if (StartIdx + 1 >= StoreNodes.size()) return RV; if (StartIdx) StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx); // Scan the memory operations on the chain and find the first // non-consecutive store memory address. unsigned NumConsecutiveStores = 1; int64_t StartAddress = StoreNodes[0].OffsetFromBase; // Check that the addresses are consecutive starting from the second // element in the list of stores. for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) { int64_t CurrAddress = StoreNodes[i].OffsetFromBase; if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; NumConsecutiveStores = i + 1; } if (NumConsecutiveStores < 2) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumConsecutiveStores); continue; } // The node with the lowest store address. LLVMContext &Context = *DAG.getContext(); const DataLayout &DL = DAG.getDataLayout(); // Store the constants into memory as one consecutive store. if (IsConstantSrc) { while (NumConsecutiveStores >= 2) { LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); unsigned FirstStoreAlign = FirstInChain->getAlignment(); unsigned LastLegalType = 1; unsigned LastLegalVectorType = 1; bool LastIntegerTrunc = false; bool NonZero = false; unsigned FirstZeroAfterNonZero = NumConsecutiveStores; for (unsigned i = 0; i < NumConsecutiveStores; ++i) { StoreSDNode *ST = cast(StoreNodes[i].MemNode); SDValue StoredVal = ST->getValue(); bool IsElementZero = false; if (ConstantSDNode *C = dyn_cast(StoredVal)) IsElementZero = C->isNullValue(); else if (ConstantFPSDNode *C = dyn_cast(StoredVal)) IsElementZero = C->getConstantFPValue()->isNullValue(); if (IsElementZero) { if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores) FirstZeroAfterNonZero = i; } NonZero |= !IsElementZero; // Find a legal type for the constant store. unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); bool IsFast = false; // Break early when size is too large to be legal. if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits) break; if (TLI.isTypeLegal(StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) { LastIntegerTrunc = false; LastLegalType = i + 1; // Or check whether a truncstore is legal. } else if (TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) { LastIntegerTrunc = true; LastLegalType = i + 1; } } // We only use vectors if the constant is known to be zero or the // target allows it and the function is not marked with the // noimplicitfloat attribute. if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) && !NoVectors) { // Find a legal type for the vector store. unsigned Elts = (i + 1) * NumMemElts; EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) LastLegalVectorType = i + 1; } } bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors; unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType; // Check if we found a legal integer type that creates a meaningful // merge. if (NumElem < 2) { // We know that candidate stores are in order and of correct // shape. While there is no mergeable sequence from the // beginning one may start later in the sequence. The only // reason a merge of size N could have failed where another of // the same size would not have, is if the alignment has // improved or we've dropped a non-zero value. Drop as many // candidates as we can here. unsigned NumSkip = 1; while ( (NumSkip < NumConsecutiveStores) && (NumSkip < FirstZeroAfterNonZero) && (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) NumSkip++; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); NumConsecutiveStores -= NumSkip; continue; } // Check that we can merge these candidates without causing a cycle. if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem, RootNode)) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); NumConsecutiveStores -= NumElem; continue; } RV |= MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc); // Remove merged stores for next iteration. StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); NumConsecutiveStores -= NumElem; } continue; } // When extracting multiple vector elements, try to store them // in one vector store rather than a sequence of scalar stores. if (IsExtractVecSrc) { // Loop on Consecutive Stores on success. while (NumConsecutiveStores >= 2) { LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); unsigned FirstStoreAlign = FirstInChain->getAlignment(); unsigned NumStoresToMerge = 1; for (unsigned i = 0; i < NumConsecutiveStores; ++i) { // Find a legal type for the vector store. unsigned Elts = (i + 1) * NumMemElts; EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); bool IsFast; // Break early when size is too large to be legal. if (Ty.getSizeInBits() > MaximumLegalStoreInBits) break; if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) NumStoresToMerge = i + 1; } // Check if we found a legal integer type creating a meaningful // merge. if (NumStoresToMerge < 2) { // We know that candidate stores are in order and of correct // shape. While there is no mergeable sequence from the // beginning one may start later in the sequence. The only // reason a merge of size N could have failed where another of // the same size would not have, is if the alignment has // improved. Drop as many candidates as we can here. unsigned NumSkip = 1; while ( (NumSkip < NumConsecutiveStores) && (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) NumSkip++; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); NumConsecutiveStores -= NumSkip; continue; } // Check that we can merge these candidates without causing a cycle. if (!checkMergeStoreCandidatesForDependencies( StoreNodes, NumStoresToMerge, RootNode)) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge); NumConsecutiveStores -= NumStoresToMerge; continue; } RV |= MergeStoresOfConstantsOrVecElts( StoreNodes, MemVT, NumStoresToMerge, false, true, false); StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge); NumConsecutiveStores -= NumStoresToMerge; } continue; } // Below we handle the case of multiple consecutive stores that // come from multiple consecutive loads. We merge them into a single // wide load and a single wide store. // Look for load nodes which are used by the stored values. SmallVector LoadNodes; // Find acceptable loads. Loads need to have the same chain (token factor), // must not be zext, volatile, indexed, and they must be consecutive. BaseIndexOffset LdBasePtr; for (unsigned i = 0; i < NumConsecutiveStores; ++i) { StoreSDNode *St = cast(StoreNodes[i].MemNode); SDValue Val = peekThroughBitcast(St->getValue()); LoadSDNode *Ld = cast(Val); BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG); // If this is not the first ptr that we check. int64_t LdOffset = 0; if (LdBasePtr.getBase().getNode()) { // The base ptr must be the same. if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset)) break; } else { // Check that all other base pointers are the same as this one. LdBasePtr = LdPtr; } // We found a potential memory operand to merge. LoadNodes.push_back(MemOpLink(Ld, LdOffset)); } while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) { // If we have load/store pair instructions and we only have two values, // don't bother merging. unsigned RequiredAlignment; if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) && StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2); LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2); break; } LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode; unsigned FirstStoreAS = FirstInChain->getAddressSpace(); unsigned FirstStoreAlign = FirstInChain->getAlignment(); LoadSDNode *FirstLoad = cast(LoadNodes[0].MemNode); unsigned FirstLoadAS = FirstLoad->getAddressSpace(); unsigned FirstLoadAlign = FirstLoad->getAlignment(); // Scan the memory operations on the chain and find the first // non-consecutive load memory address. These variables hold the index in // the store node array. unsigned LastConsecutiveLoad = 1; // This variable refers to the size and not index in the array. unsigned LastLegalVectorType = 1; unsigned LastLegalIntegerType = 1; bool isDereferenceable = true; bool DoIntegerTruncate = false; StartAddress = LoadNodes[0].OffsetFromBase; SDValue FirstChain = FirstLoad->getChain(); for (unsigned i = 1; i < LoadNodes.size(); ++i) { // All loads must share the same chain. if (LoadNodes[i].MemNode->getChain() != FirstChain) break; int64_t CurrAddress = LoadNodes[i].OffsetFromBase; if (CurrAddress - StartAddress != (ElementSizeBytes * i)) break; LastConsecutiveLoad = i; if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable()) isDereferenceable = false; // Find a legal type for the vector store. unsigned Elts = (i + 1) * NumMemElts; EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); // Break early when size is too large to be legal. if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits) break; bool IsFastSt, IsFastLd; if (TLI.isTypeLegal(StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFastSt) && IsFastSt && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, FirstLoadAlign, &IsFastLd) && IsFastLd) { LastLegalVectorType = i + 1; } // Find a legal type for the integer store. unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; StoreTy = EVT::getIntegerVT(Context, SizeInBits); if (TLI.isTypeLegal(StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFastSt) && IsFastSt && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, FirstLoadAlign, &IsFastLd) && IsFastLd) { LastLegalIntegerType = i + 1; DoIntegerTruncate = false; // Or check whether a truncstore and extload is legal. } else if (TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy); if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) && TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) && TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) && TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) && TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFastSt) && IsFastSt && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS, FirstLoadAlign, &IsFastLd) && IsFastLd) { LastLegalIntegerType = i + 1; DoIntegerTruncate = true; } } } // Only use vector types if the vector type is larger than the integer // type. If they are the same, use integers. bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors; unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType); // We add +1 here because the LastXXX variables refer to location while // the NumElem refers to array/index size. unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1); NumElem = std::min(LastLegalType, NumElem); if (NumElem < 2) { // We know that candidate stores are in order and of correct // shape. While there is no mergeable sequence from the // beginning one may start later in the sequence. The only // reason a merge of size N could have failed where another of // the same size would not have is if the alignment or either // the load or store has improved. Drop as many candidates as we // can here. unsigned NumSkip = 1; while ((NumSkip < LoadNodes.size()) && (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) && (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign)) NumSkip++; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip); LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip); NumConsecutiveStores -= NumSkip; continue; } // Check that we can merge these candidates without causing a cycle. if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem, RootNode)) { StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem); NumConsecutiveStores -= NumElem; continue; } // Find if it is better to use vectors or integers to load and store // to memory. EVT JointMemOpVT; if (UseVectorTy) { // Find a legal type for the vector store. unsigned Elts = NumElem * NumMemElts; JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts); } else { unsigned SizeInBits = NumElem * ElementSizeBytes * 8; JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits); } SDLoc LoadDL(LoadNodes[0].MemNode); SDLoc StoreDL(StoreNodes[0].MemNode); // The merged loads are required to have the same incoming chain, so // using the first's chain is acceptable. SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem); AddToWorklist(NewStoreChain.getNode()); MachineMemOperand::Flags MMOFlags = isDereferenceable ? MachineMemOperand::MODereferenceable : MachineMemOperand::MONone; SDValue NewLoad, NewStore; if (UseVectorTy || !DoIntegerTruncate) { NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), FirstLoadAlign, MMOFlags); NewStore = DAG.getStore( NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), FirstStoreAlign); } else { // This must be the truncstore/extload case EVT ExtendedTy = TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT); NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy, FirstLoad->getChain(), FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), JointMemOpVT, FirstLoadAlign, MMOFlags); NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(), FirstInChain->getPointerInfo(), JointMemOpVT, FirstInChain->getAlignment(), FirstInChain->getMemOperand()->getFlags()); } // Transfer chain users from old loads to the new load. for (unsigned i = 0; i < NumElem; ++i) { LoadSDNode *Ld = cast(LoadNodes[i].MemNode); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1)); } // Replace the all stores with the new store. Recursively remove // corresponding value if its no longer used. for (unsigned i = 0; i < NumElem; ++i) { SDValue Val = StoreNodes[i].MemNode->getOperand(1); CombineTo(StoreNodes[i].MemNode, NewStore); if (Val.getNode()->use_empty()) recursivelyDeleteUnusedNodes(Val.getNode()); } RV = true; StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem); LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem); NumConsecutiveStores -= NumElem; } } return RV; } SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) { SDLoc SL(ST); SDValue ReplStore; // Replace the chain to avoid dependency. if (ST->isTruncatingStore()) { ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(), ST->getMemoryVT(), ST->getMemOperand()); } else { ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(), ST->getMemOperand()); } // Create token to keep both nodes around. SDValue Token = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, ST->getChain(), ReplStore); // Make sure the new and old chains are cleaned up. AddToWorklist(Token.getNode()); // Don't add users to work list. return CombineTo(ST, Token, false); } SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) { SDValue Value = ST->getValue(); if (Value.getOpcode() == ISD::TargetConstantFP) return SDValue(); SDLoc DL(ST); SDValue Chain = ST->getChain(); SDValue Ptr = ST->getBasePtr(); const ConstantFPSDNode *CFP = cast(Value); // NOTE: If the original store is volatile, this transform must not increase // the number of stores. For example, on x86-32 an f64 can be stored in one // processor operation but an i64 (which is not legal) requires two. So the // transform should not be done in this case. SDValue Tmp; switch (CFP->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unknown FP type"); case MVT::f16: // We don't do this for these yet. case MVT::f80: case MVT::f128: case MVT::ppcf128: return SDValue(); case MVT::f32: if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) || TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { ; Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF(). bitcastToAPInt().getZExtValue(), SDLoc(CFP), MVT::i32); return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand()); } return SDValue(); case MVT::f64: if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations && !ST->isVolatile()) || TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) { ; Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). getZExtValue(), SDLoc(CFP), MVT::i64); return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand()); } if (!ST->isVolatile() && TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) { // Many FP stores are not made apparent until after legalize, e.g. for // argument passing. Since this is so common, custom legalize the // 64-bit integer store into two 32-bit stores. uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue(); SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32); SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32); if (DAG.getDataLayout().isBigEndian()) std::swap(Lo, Hi); unsigned Alignment = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), ST->getAlignment(), MMOFlags, AAInfo); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(4, DL, Ptr.getValueType())); Alignment = MinAlign(Alignment, 4U); SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), Alignment, MMOFlags, AAInfo); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1); } return SDValue(); } } SDValue DAGCombiner::visitSTORE(SDNode *N) { StoreSDNode *ST = cast(N); SDValue Chain = ST->getChain(); SDValue Value = ST->getValue(); SDValue Ptr = ST->getBasePtr(); // If this is a store of a bit convert, store the input value if the // resultant store does not need a higher alignment than the original. if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() && ST->isUnindexed()) { EVT SVT = Value.getOperand(0).getValueType(); if (((!LegalOperations && !ST->isVolatile()) || TLI.isOperationLegalOrCustom(ISD::STORE, SVT)) && TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) { unsigned OrigAlign = ST->getAlignment(); bool Fast = false; if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT, ST->getAddressSpace(), OrigAlign, &Fast) && Fast) { return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, ST->getPointerInfo(), OrigAlign, ST->getMemOperand()->getFlags(), ST->getAAInfo()); } } } // Turn 'store undef, Ptr' -> nothing. if (Value.isUndef() && ST->isUnindexed()) return Chain; // Try to infer better alignment information than the store already has. if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) { if (unsigned Align = DAG.InferPtrAlignment(Ptr)) { if (Align > ST->getAlignment() && ST->getSrcValueOffset() % Align == 0) { SDValue NewStore = DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(), ST->getMemoryVT(), Align, ST->getMemOperand()->getFlags(), ST->getAAInfo()); // NewStore will always be N as we are only refining the alignment assert(NewStore.getNode() == N); (void)NewStore; } } } // Try transforming a pair floating point load / store ops to integer // load / store ops. if (SDValue NewST = TransformFPLoadStorePair(N)) return NewST; if (ST->isUnindexed()) { // Walk up chain skipping non-aliasing memory nodes, on this store and any // adjacent stores. if (findBetterNeighborChains(ST)) { // replaceStoreChain uses CombineTo, which handled all of the worklist // manipulation. Return the original node to not do anything else. return SDValue(ST, 0); } Chain = ST->getChain(); } // FIXME: is there such a thing as a truncating indexed store? if (ST->isTruncatingStore() && ST->isUnindexed() && Value.getValueType().isInteger()) { // See if we can simplify the input to this truncstore with knowledge that // only the low bits are being used. For example: // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8" SDValue Shorter = DAG.GetDemandedBits( Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), ST->getMemoryVT().getScalarSizeInBits())); AddToWorklist(Value.getNode()); if (Shorter.getNode()) return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(), ST->getMemOperand()); // Otherwise, see if we can simplify the operation with // SimplifyDemandedBits, which only works if the value has a single use. if (SimplifyDemandedBits( Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(), ST->getMemoryVT().getScalarSizeInBits()))) { // Re-visit the store if anything changed and the store hasn't been merged // with another node (N is deleted) SimplifyDemandedBits will add Value's // node back to the worklist if necessary, but we also need to re-visit // the Store node itself. if (N->getOpcode() != ISD::DELETED_NODE) AddToWorklist(N); return SDValue(N, 0); } } // If this is a load followed by a store to the same location, then the store // is dead/noop. if (LoadSDNode *Ld = dyn_cast(Value)) { if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() && ST->isUnindexed() && !ST->isVolatile() && // There can't be any side effects between the load and store, such as // a call or store. Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) { // The store is dead, remove it. return Chain; } } if (StoreSDNode *ST1 = dyn_cast(Chain)) { if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() && !ST1->isVolatile() && ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT()) { // If this is a store followed by a store with the same value to the same // location, then the store is dead/noop. if (ST1->getValue() == Value) { // The store is dead, remove it. return Chain; } // If this is a store who's preceeding store to the same location // and no one other node is chained to that store we can effectively // drop the store. Do not remove stores to undef as they may be used as // data sinks. if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() && !ST1->getBasePtr().isUndef()) { // ST1 is fully overwritten and can be elided. Combine with it's chain // value. CombineTo(ST1, ST1->getChain()); return SDValue(); } } } // If this is an FP_ROUND or TRUNC followed by a store, fold this into a // truncating store. We can do this even if this is already a truncstore. if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE) && Value.getNode()->hasOneUse() && ST->isUnindexed() && TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), ST->getMemoryVT())) { return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr, ST->getMemoryVT(), ST->getMemOperand()); } // Always perform this optimization before types are legal. If the target // prefers, also try this after legalization to catch stores that were created // by intrinsics or other nodes. if (!LegalTypes || (TLI.mergeStoresAfterLegalization())) { while (true) { // There can be multiple store sequences on the same chain. // Keep trying to merge store sequences until we are unable to do so // or until we merge the last store on the chain. bool Changed = MergeConsecutiveStores(ST); if (!Changed) break; // Return N as merge only uses CombineTo and no worklist clean // up is necessary. if (N->getOpcode() == ISD::DELETED_NODE || !isa(N)) return SDValue(N, 0); } } // Try transforming N to an indexed store. if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N)) return SDValue(N, 0); // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr' // // Make sure to do this only after attempting to merge stores in order to // avoid changing the types of some subset of stores due to visit order, // preventing their merging. if (isa(ST->getValue())) { if (SDValue NewSt = replaceStoreOfFPConstant(ST)) return NewSt; } if (SDValue NewSt = splitMergedValStore(ST)) return NewSt; return ReduceLoadOpStoreWidth(N); } /// For the instruction sequence of store below, F and I values /// are bundled together as an i64 value before being stored into memory. /// Sometimes it is more efficent to generate separate stores for F and I, /// which can remove the bitwise instructions or sink them to colder places. /// /// (store (or (zext (bitcast F to i32) to i64), /// (shl (zext I to i64), 32)), addr) --> /// (store F, addr) and (store I, addr+4) /// /// Similarly, splitting for other merged store can also be beneficial, like: /// For pair of {i32, i32}, i64 store --> two i32 stores. /// For pair of {i32, i16}, i64 store --> two i32 stores. /// For pair of {i16, i16}, i32 store --> two i16 stores. /// For pair of {i16, i8}, i32 store --> two i16 stores. /// For pair of {i8, i8}, i16 store --> two i8 stores. /// /// We allow each target to determine specifically which kind of splitting is /// supported. /// /// The store patterns are commonly seen from the simple code snippet below /// if only std::make_pair(...) is sroa transformed before inlined into hoo. /// void goo(const std::pair &); /// hoo() { /// ... /// goo(std::make_pair(tmp, ftmp)); /// ... /// } /// SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) { if (OptLevel == CodeGenOpt::None) return SDValue(); SDValue Val = ST->getValue(); SDLoc DL(ST); // Match OR operand. if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR) return SDValue(); // Match SHL operand and get Lower and Higher parts of Val. SDValue Op1 = Val.getOperand(0); SDValue Op2 = Val.getOperand(1); SDValue Lo, Hi; if (Op1.getOpcode() != ISD::SHL) { std::swap(Op1, Op2); if (Op1.getOpcode() != ISD::SHL) return SDValue(); } Lo = Op2; Hi = Op1.getOperand(0); if (!Op1.hasOneUse()) return SDValue(); // Match shift amount to HalfValBitSize. unsigned HalfValBitSize = Val.getValueSizeInBits() / 2; ConstantSDNode *ShAmt = dyn_cast(Op1.getOperand(1)); if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize) return SDValue(); // Lo and Hi are zero-extended from int with size less equal than 32 // to i64. if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() || !Lo.getOperand(0).getValueType().isScalarInteger() || Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize || Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() || !Hi.getOperand(0).getValueType().isScalarInteger() || Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize) return SDValue(); // Use the EVT of low and high parts before bitcast as the input // of target query. EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST) ? Lo.getOperand(0).getValueType() : Lo.getValueType(); EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST) ? Hi.getOperand(0).getValueType() : Hi.getValueType(); if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy)) return SDValue(); // Start to split store. unsigned Alignment = ST->getAlignment(); MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags(); AAMDNodes AAInfo = ST->getAAInfo(); // Change the sizes of Lo and Hi's value types to HalfValBitSize. EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize); Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0)); Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0)); SDValue Chain = ST->getChain(); SDValue Ptr = ST->getBasePtr(); // Lower value store. SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(), ST->getAlignment(), MMOFlags, AAInfo); Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, DAG.getConstant(HalfValBitSize / 8, DL, Ptr.getValueType())); // Higher value store. SDValue St1 = DAG.getStore(St0, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(HalfValBitSize / 8), Alignment / 2, MMOFlags, AAInfo); return St1; } /// Convert a disguised subvector insertion into a shuffle: /// insert_vector_elt V, (bitcast X from vector type), IdxC --> /// bitcast(shuffle (bitcast V), (extended X), Mask) /// Note: We do not use an insert_subvector node because that requires a legal /// subvector type. SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) { SDValue InsertVal = N->getOperand(1); if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() || !InsertVal.getOperand(0).getValueType().isVector()) return SDValue(); SDValue SubVec = InsertVal.getOperand(0); SDValue DestVec = N->getOperand(0); EVT SubVecVT = SubVec.getValueType(); EVT VT = DestVec.getValueType(); unsigned NumSrcElts = SubVecVT.getVectorNumElements(); unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits(); unsigned NumMaskVals = ExtendRatio * NumSrcElts; // Step 1: Create a shuffle mask that implements this insert operation. The // vector that we are inserting into will be operand 0 of the shuffle, so // those elements are just 'i'. The inserted subvector is in the first // positions of operand 1 of the shuffle. Example: // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7} SmallVector Mask(NumMaskVals); for (unsigned i = 0; i != NumMaskVals; ++i) { if (i / NumSrcElts == InsIndex) Mask[i] = (i % NumSrcElts) + NumMaskVals; else Mask[i] = i; } // Bail out if the target can not handle the shuffle we want to create. EVT SubVecEltVT = SubVecVT.getVectorElementType(); EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals); if (!TLI.isShuffleMaskLegal(Mask, ShufVT)) return SDValue(); // Step 2: Create a wide vector from the inserted source vector by appending // undefined elements. This is the same size as our destination vector. SDLoc DL(N); SmallVector ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT)); ConcatOps[0] = SubVec; SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps); // Step 3: Shuffle in the padded subvector. SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec); SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask); AddToWorklist(PaddedSubV.getNode()); AddToWorklist(DestVecBC.getNode()); AddToWorklist(Shuf.getNode()); return DAG.getBitcast(VT, Shuf); } SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { SDValue InVec = N->getOperand(0); SDValue InVal = N->getOperand(1); SDValue EltNo = N->getOperand(2); SDLoc DL(N); // If the inserted element is an UNDEF, just use the input vector. if (InVal.isUndef()) return InVec; EVT VT = InVec.getValueType(); // Remove redundant insertions: // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT && InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1)) return InVec; // We must know which element is being inserted for folds below here. auto *IndexC = dyn_cast(EltNo); if (!IndexC) return SDValue(); unsigned Elt = IndexC->getZExtValue(); if (SDValue Shuf = combineInsertEltToShuffle(N, Elt)) return Shuf; // Canonicalize insert_vector_elt dag nodes. // Example: // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1) // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0) // // Do this only if the child insert_vector node has one use; also // do this only if indices are both constants and Idx1 < Idx0. if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse() && isa(InVec.getOperand(2))) { unsigned OtherElt = InVec.getConstantOperandVal(2); if (Elt < OtherElt) { // Swap nodes. SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, InVec.getOperand(0), InVal, EltNo); AddToWorklist(NewOp.getNode()); return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()), VT, NewOp, InVec.getOperand(1), InVec.getOperand(2)); } } // If we can't generate a legal BUILD_VECTOR, exit if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) return SDValue(); // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially // be converted to a BUILD_VECTOR). Fill in the Ops vector with the // vector elements. SmallVector Ops; // Do not combine these two vectors if the output vector will not replace // the input vector. if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) { Ops.append(InVec.getNode()->op_begin(), InVec.getNode()->op_end()); } else if (InVec.isUndef()) { unsigned NElts = VT.getVectorNumElements(); Ops.append(NElts, DAG.getUNDEF(InVal.getValueType())); } else { return SDValue(); } // Insert the element if (Elt < Ops.size()) { // All the operands of BUILD_VECTOR must have the same type; // we enforce that here. EVT OpVT = Ops[0].getValueType(); Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal; } // Return the new vector return DAG.getBuildVector(VT, DL, Ops); } SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad( SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) { assert(!OriginalLoad->isVolatile()); EVT ResultVT = EVE->getValueType(0); EVT VecEltVT = InVecVT.getVectorElementType(); unsigned Align = OriginalLoad->getAlignment(); unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment( VecEltVT.getTypeForEVT(*DAG.getContext())); if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT)) return SDValue(); ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD; if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) return SDValue(); Align = NewAlign; SDValue NewPtr = OriginalLoad->getBasePtr(); SDValue Offset; EVT PtrType = NewPtr.getValueType(); MachinePointerInfo MPI; SDLoc DL(EVE); if (auto *ConstEltNo = dyn_cast(EltNo)) { int Elt = ConstEltNo->getZExtValue(); unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8; Offset = DAG.getConstant(PtrOff, DL, PtrType); MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff); } else { Offset = DAG.getZExtOrTrunc(EltNo, DL, PtrType); Offset = DAG.getNode( ISD::MUL, DL, PtrType, Offset, DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType)); MPI = OriginalLoad->getPointerInfo(); } NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, NewPtr, Offset); // The replacement we need to do here is a little tricky: we need to // replace an extractelement of a load with a load. // Use ReplaceAllUsesOfValuesWith to do the replacement. // Note that this replacement assumes that the extractvalue is the only // use of the load; that's okay because we don't want to perform this // transformation in other cases anyway. SDValue Load; SDValue Chain; if (ResultVT.bitsGT(VecEltVT)) { // If the result type of vextract is wider than the load, then issue an // extending load instead. ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) ? ISD::ZEXTLOAD : ISD::EXTLOAD; Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT, OriginalLoad->getChain(), NewPtr, MPI, VecEltVT, Align, OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo()); Chain = Load.getValue(1); } else { Load = DAG.getLoad(VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, Align, OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo()); Chain = Load.getValue(1); if (ResultVT.bitsLT(VecEltVT)) Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load); else Load = DAG.getBitcast(ResultVT, Load); } WorklistRemover DeadNodes(*this); SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) }; SDValue To[] = { Load, Chain }; DAG.ReplaceAllUsesOfValuesWith(From, To, 2); // Since we're explicitly calling ReplaceAllUses, add the new node to the // worklist explicitly as well. AddToWorklist(Load.getNode()); AddUsersToWorklist(Load.getNode()); // Add users too // Make sure to revisit this node to clean it up; it will usually be dead. AddToWorklist(EVE); ++OpsNarrowed; return SDValue(EVE, 0); } SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { // (vextract (scalar_to_vector val, 0) -> val SDValue InVec = N->getOperand(0); EVT VT = InVec.getValueType(); EVT NVT = N->getValueType(0); if (InVec.isUndef()) return DAG.getUNDEF(NVT); if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) { // Check if the result type doesn't match the inserted element type. A // SCALAR_TO_VECTOR may truncate the inserted element and the // EXTRACT_VECTOR_ELT may widen the extracted vector. SDValue InOp = InVec.getOperand(0); if (InOp.getValueType() != NVT) { assert(InOp.getValueType().isInteger() && NVT.isInteger()); return DAG.getSExtOrTrunc(InOp, SDLoc(InVec), NVT); } return InOp; } SDValue EltNo = N->getOperand(1); ConstantSDNode *ConstEltNo = dyn_cast(EltNo); // extract_vector_elt of out-of-bounds element -> UNDEF if (ConstEltNo && ConstEltNo->getAPIntValue().uge(VT.getVectorNumElements())) return DAG.getUNDEF(NVT); // extract_vector_elt (build_vector x, y), 1 -> y if (ConstEltNo && InVec.getOpcode() == ISD::BUILD_VECTOR && TLI.isTypeLegal(VT) && (InVec.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VT))) { SDValue Elt = InVec.getOperand(ConstEltNo->getZExtValue()); EVT InEltVT = Elt.getValueType(); // Sometimes build_vector's scalar input types do not match result type. if (NVT == InEltVT) return Elt; // TODO: It may be useful to truncate if free if the build_vector implicitly // converts. } // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x) bool isLE = DAG.getDataLayout().isLittleEndian(); unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1; if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() && ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) { SDValue BCSrc = InVec.getOperand(0); if (BCSrc.getValueType().isScalarInteger()) return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc); } // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val // // This only really matters if the index is non-constant since other combines // on the constant elements already work. if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && EltNo == InVec.getOperand(2)) { SDValue Elt = InVec.getOperand(1); return VT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, SDLoc(N), NVT) : Elt; } // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT. // We only perform this optimization before the op legalization phase because // we may introduce new vector instructions which are not backed by TD // patterns. For example on AVX, extracting elements from a wide vector // without using extract_subvector. However, if we can find an underlying // scalar value, then we can always use that. if (ConstEltNo && InVec.getOpcode() == ISD::VECTOR_SHUFFLE) { int NumElem = VT.getVectorNumElements(); ShuffleVectorSDNode *SVOp = cast(InVec); // Find the new index to extract from. int OrigElt = SVOp->getMaskElt(ConstEltNo->getZExtValue()); // Extracting an undef index is undef. if (OrigElt == -1) return DAG.getUNDEF(NVT); // Select the right vector half to extract from. SDValue SVInVec; if (OrigElt < NumElem) { SVInVec = InVec->getOperand(0); } else { SVInVec = InVec->getOperand(1); OrigElt -= NumElem; } if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) { SDValue InOp = SVInVec.getOperand(OrigElt); if (InOp.getValueType() != NVT) { assert(InOp.getValueType().isInteger() && NVT.isInteger()); InOp = DAG.getSExtOrTrunc(InOp, SDLoc(SVInVec), NVT); } return InOp; } // FIXME: We should handle recursing on other vector shuffles and // scalar_to_vector here as well. if (!LegalOperations || // FIXME: Should really be just isOperationLegalOrCustom. TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VT) || TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VT)) { EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout()); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec, DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy)); } } // If only EXTRACT_VECTOR_ELT nodes use the source vector we can // simplify it based on the (valid) extraction indices. if (llvm::all_of(InVec->uses(), [&](SDNode *Use) { return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && Use->getOperand(0) == InVec && isa(Use->getOperand(1)); })) { APInt DemandedElts = APInt::getNullValue(VT.getVectorNumElements()); for (SDNode *Use : InVec->uses()) { auto *CstElt = cast(Use->getOperand(1)); if (CstElt->getAPIntValue().ult(VT.getVectorNumElements())) DemandedElts.setBit(CstElt->getZExtValue()); } if (SimplifyDemandedVectorElts(InVec, DemandedElts, true)) return SDValue(N, 0); } bool BCNumEltsChanged = false; EVT ExtVT = VT.getVectorElementType(); EVT LVT = ExtVT; // If the result of load has to be truncated, then it's not necessarily // profitable. if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT)) return SDValue(); if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); EVT BCVT = InVec.getOperand(0).getValueType(); if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType())) return SDValue(); if (VT.getVectorNumElements() != BCVT.getVectorNumElements()) BCNumEltsChanged = true; InVec = InVec.getOperand(0); ExtVT = BCVT.getVectorElementType(); } // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size) if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() && ISD::isNormalLoad(InVec.getNode()) && !N->getOperand(1)->hasPredecessor(InVec.getNode())) { SDValue Index = N->getOperand(1); if (LoadSDNode *OrigLoad = dyn_cast(InVec)) { if (!OrigLoad->isVolatile()) { return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index, OrigLoad); } } } // Perform only after legalization to ensure build_vector / vector_shuffle // optimizations have already been done. if (!LegalOperations) return SDValue(); // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size) // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size) // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr) if (ConstEltNo) { int Elt = cast(EltNo)->getZExtValue(); LoadSDNode *LN0 = nullptr; const ShuffleVectorSDNode *SVN = nullptr; if (ISD::isNormalLoad(InVec.getNode())) { LN0 = cast(InVec); } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR && InVec.getOperand(0).getValueType() == ExtVT && ISD::isNormalLoad(InVec.getOperand(0).getNode())) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); LN0 = cast(InVec.getOperand(0)); } else if ((SVN = dyn_cast(InVec))) { // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1) // => // (load $addr+1*size) // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); // If the bit convert changed the number of elements, it is unsafe // to examine the mask. if (BCNumEltsChanged) return SDValue(); // Select the input vector, guarding against out of range extract vector. unsigned NumElems = VT.getVectorNumElements(); int Idx = (Elt > (int)NumElems) ? -1 : SVN->getMaskElt(Elt); InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1); if (InVec.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. if (!InVec.hasOneUse()) return SDValue(); InVec = InVec.getOperand(0); } if (ISD::isNormalLoad(InVec.getNode())) { LN0 = cast(InVec); Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems; EltNo = DAG.getConstant(Elt, SDLoc(EltNo), EltNo.getValueType()); } } // Make sure we found a non-volatile load and the extractelement is // the only use. if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile()) return SDValue(); // If Idx was -1 above, Elt is going to be -1, so just return undef. if (Elt == -1) return DAG.getUNDEF(LVT); return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0); } return SDValue(); } // Simplify (build_vec (ext )) to (bitcast (build_vec )) SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) { // We perform this optimization post type-legalization because // the type-legalizer often scalarizes integer-promoted vectors. // Performing this optimization before may create bit-casts which // will be type-legalized to complex code sequences. // We perform this optimization only before the operation legalizer because we // may introduce illegal operations. if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes) return SDValue(); unsigned NumInScalars = N->getNumOperands(); SDLoc DL(N); EVT VT = N->getValueType(0); // Check to see if this is a BUILD_VECTOR of a bunch of values // which come from any_extend or zero_extend nodes. If so, we can create // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR // optimizations. We do not handle sign-extend because we can't fill the sign // using shuffles. EVT SourceType = MVT::Other; bool AllAnyExt = true; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); // Ignore undef inputs. if (In.isUndef()) continue; bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND; bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND; // Abort if the element is not an extension. if (!ZeroExt && !AnyExt) { SourceType = MVT::Other; break; } // The input is a ZeroExt or AnyExt. Check the original type. EVT InTy = In.getOperand(0).getValueType(); // Check that all of the widened source types are the same. if (SourceType == MVT::Other) // First time. SourceType = InTy; else if (InTy != SourceType) { // Multiple income types. Abort. SourceType = MVT::Other; break; } // Check if all of the extends are ANY_EXTENDs. AllAnyExt &= AnyExt; } // In order to have valid types, all of the inputs must be extended from the // same source type and all of the inputs must be any or zero extend. // Scalar sizes must be a power of two. EVT OutScalarTy = VT.getScalarType(); bool ValidTypes = SourceType != MVT::Other && isPowerOf2_32(OutScalarTy.getSizeInBits()) && isPowerOf2_32(SourceType.getSizeInBits()); // Create a new simpler BUILD_VECTOR sequence which other optimizations can // turn into a single shuffle instruction. if (!ValidTypes) return SDValue(); bool isLE = DAG.getDataLayout().isLittleEndian(); unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits(); assert(ElemRatio > 1 && "Invalid element size ratio"); SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType): DAG.getConstant(0, DL, SourceType); unsigned NewBVElems = ElemRatio * VT.getVectorNumElements(); SmallVector Ops(NewBVElems, Filler); // Populate the new build_vector for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDValue Cast = N->getOperand(i); assert((Cast.getOpcode() == ISD::ANY_EXTEND || Cast.getOpcode() == ISD::ZERO_EXTEND || Cast.isUndef()) && "Invalid cast opcode"); SDValue In; if (Cast.isUndef()) In = DAG.getUNDEF(SourceType); else In = Cast->getOperand(0); unsigned Index = isLE ? (i * ElemRatio) : (i * ElemRatio + (ElemRatio - 1)); assert(Index < Ops.size() && "Invalid index"); Ops[Index] = In; } // The type of the new BUILD_VECTOR node. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems); assert(VecVT.getSizeInBits() == VT.getSizeInBits() && "Invalid vector size"); // Check if the new vector type is legal. if (!isTypeLegal(VecVT) || (!TLI.isOperationLegal(ISD::BUILD_VECTOR, VecVT) && TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))) return SDValue(); // Make the new BUILD_VECTOR. SDValue BV = DAG.getBuildVector(VecVT, DL, Ops); // The new BUILD_VECTOR node has the potential to be further optimized. AddToWorklist(BV.getNode()); // Bitcast to the desired type. return DAG.getBitcast(VT, BV); } SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumInScalars = N->getNumOperands(); SDLoc DL(N); EVT SrcVT = MVT::Other; unsigned Opcode = ISD::DELETED_NODE; unsigned NumDefs = 0; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); unsigned Opc = In.getOpcode(); if (Opc == ISD::UNDEF) continue; // If all scalar values are floats and converted from integers. if (Opcode == ISD::DELETED_NODE && (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) { Opcode = Opc; } if (Opc != Opcode) return SDValue(); EVT InVT = In.getOperand(0).getValueType(); // If all scalar values are typed differently, bail out. It's chosen to // simplify BUILD_VECTOR of integer types. if (SrcVT == MVT::Other) SrcVT = InVT; if (SrcVT != InVT) return SDValue(); NumDefs++; } // If the vector has just one element defined, it's not worth to fold it into // a vectorized one. if (NumDefs < 2) return SDValue(); assert((Opcode == ISD::UINT_TO_FP || Opcode == ISD::SINT_TO_FP) && "Should only handle conversion from integer to float."); assert(SrcVT != MVT::Other && "Cannot determine source type!"); EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars); if (!TLI.isOperationLegalOrCustom(Opcode, NVT)) return SDValue(); // Just because the floating-point vector type is legal does not necessarily // mean that the corresponding integer vector type is. if (!isTypeLegal(NVT)) return SDValue(); SmallVector Opnds; for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); if (In.isUndef()) Opnds.push_back(DAG.getUNDEF(SrcVT)); else Opnds.push_back(In.getOperand(0)); } SDValue BV = DAG.getBuildVector(NVT, DL, Opnds); AddToWorklist(BV.getNode()); return DAG.getNode(Opcode, DL, VT, BV); } SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N, ArrayRef VectorMask, SDValue VecIn1, SDValue VecIn2, unsigned LeftIdx) { MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy); EVT VT = N->getValueType(0); EVT InVT1 = VecIn1.getValueType(); EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1; unsigned Vec2Offset = 0; unsigned NumElems = VT.getVectorNumElements(); unsigned ShuffleNumElems = NumElems; // In case both the input vectors are extracted from same base // vector we do not need extra addend (Vec2Offset) while // computing shuffle mask. if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) || !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) || !(VecIn1.getOperand(0) == VecIn2.getOperand(0))) Vec2Offset = InVT1.getVectorNumElements(); // We can't generate a shuffle node with mismatched input and output types. // Try to make the types match the type of the output. if (InVT1 != VT || InVT2 != VT) { if ((VT.getSizeInBits() % InVT1.getSizeInBits() == 0) && InVT1 == InVT2) { // If the output vector length is a multiple of both input lengths, // we can concatenate them and pad the rest with undefs. unsigned NumConcats = VT.getSizeInBits() / InVT1.getSizeInBits(); assert(NumConcats >= 2 && "Concat needs at least two inputs!"); SmallVector ConcatOps(NumConcats, DAG.getUNDEF(InVT1)); ConcatOps[0] = VecIn1; ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1); VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); VecIn2 = SDValue(); } else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) { if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems)) return SDValue(); if (!VecIn2.getNode()) { // If we only have one input vector, and it's twice the size of the // output, split it in two. VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, DAG.getConstant(NumElems, DL, IdxTy)); VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx); // Since we now have shorter input vectors, adjust the offset of the // second vector's start. Vec2Offset = NumElems; } else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) { // VecIn1 is wider than the output, and we have another, possibly // smaller input. Pad the smaller input with undefs, shuffle at the // input vector width, and extract the output. // The shuffle type is different than VT, so check legality again. if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1)) return SDValue(); // Legalizing INSERT_SUBVECTOR is tricky - you basically have to // lower it back into a BUILD_VECTOR. So if the inserted type is // illegal, don't even try. if (InVT1 != InVT2) { if (!TLI.isTypeLegal(InVT2)) return SDValue(); VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1, DAG.getUNDEF(InVT1), VecIn2, ZeroIdx); } ShuffleNumElems = NumElems * 2; } else { // Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider // than VecIn1. We can't handle this for now - this case will disappear // when we start sorting the vectors by type. return SDValue(); } } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() && InVT1.getSizeInBits() == VT.getSizeInBits()) { SmallVector ConcatOps(2, DAG.getUNDEF(InVT2)); ConcatOps[0] = VecIn2; VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps); } else { // TODO: Support cases where the length mismatch isn't exactly by a // factor of 2. // TODO: Move this check upwards, so that if we have bad type // mismatches, we don't create any DAG nodes. return SDValue(); } } // Initialize mask to undef. SmallVector Mask(ShuffleNumElems, -1); // Only need to run up to the number of elements actually used, not the // total number of elements in the shuffle - if we are shuffling a wider // vector, the high lanes should be set to undef. for (unsigned i = 0; i != NumElems; ++i) { if (VectorMask[i] <= 0) continue; unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1); if (VectorMask[i] == (int)LeftIdx) { Mask[i] = ExtIndex; } else if (VectorMask[i] == (int)LeftIdx + 1) { Mask[i] = Vec2Offset + ExtIndex; } } // The type the input vectors may have changed above. InVT1 = VecIn1.getValueType(); // If we already have a VecIn2, it should have the same type as VecIn1. // If we don't, get an undef/zero vector of the appropriate type. VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1); assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type."); SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask); if (ShuffleNumElems > NumElems) Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx); return Shuffle; } // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT // operations. If the types of the vectors we're extracting from allow it, // turn this into a vector_shuffle node. SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { SDLoc DL(N); EVT VT = N->getValueType(0); // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes. if (!isTypeLegal(VT)) return SDValue(); // May only combine to shuffle after legalize if shuffle is legal. if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT)) return SDValue(); bool UsesZeroVector = false; unsigned NumElems = N->getNumOperands(); // Record, for each element of the newly built vector, which input vector // that element comes from. -1 stands for undef, 0 for the zero vector, // and positive values for the input vectors. // VectorMask maps each element to its vector number, and VecIn maps vector // numbers to their initial SDValues. SmallVector VectorMask(NumElems, -1); SmallVector VecIn; VecIn.push_back(SDValue()); for (unsigned i = 0; i != NumElems; ++i) { SDValue Op = N->getOperand(i); if (Op.isUndef()) continue; // See if we can use a blend with a zero vector. // TODO: Should we generalize this to a blend with an arbitrary constant // vector? if (isNullConstant(Op) || isNullFPConstant(Op)) { UsesZeroVector = true; VectorMask[i] = 0; continue; } // Not an undef or zero. If the input is something other than an // EXTRACT_VECTOR_ELT with an in-range constant index, bail out. if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(Op.getOperand(1))) return SDValue(); SDValue ExtractedFromVec = Op.getOperand(0); APInt ExtractIdx = cast(Op.getOperand(1))->getAPIntValue(); if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements())) return SDValue(); // All inputs must have the same element type as the output. if (VT.getVectorElementType() != ExtractedFromVec.getValueType().getVectorElementType()) return SDValue(); // Have we seen this input vector before? // The vectors are expected to be tiny (usually 1 or 2 elements), so using // a map back from SDValues to numbers isn't worth it. unsigned Idx = std::distance( VecIn.begin(), std::find(VecIn.begin(), VecIn.end(), ExtractedFromVec)); if (Idx == VecIn.size()) VecIn.push_back(ExtractedFromVec); VectorMask[i] = Idx; } // If we didn't find at least one input vector, bail out. if (VecIn.size() < 2) return SDValue(); // If all the Operands of BUILD_VECTOR extract from same // vector, then split the vector efficiently based on the maximum // vector access index and adjust the VectorMask and // VecIn accordingly. if (VecIn.size() == 2) { unsigned MaxIndex = 0; unsigned NearestPow2 = 0; SDValue Vec = VecIn.back(); EVT InVT = Vec.getValueType(); MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); SmallVector IndexVec(NumElems, 0); for (unsigned i = 0; i < NumElems; i++) { if (VectorMask[i] <= 0) continue; unsigned Index = N->getOperand(i).getConstantOperandVal(1); IndexVec[i] = Index; MaxIndex = std::max(MaxIndex, Index); } NearestPow2 = PowerOf2Ceil(MaxIndex); if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 && NumElems * 2 < NearestPow2) { unsigned SplitSize = NearestPow2 / 2; EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(), SplitSize); if (TLI.isTypeLegal(SplitVT)) { SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, DAG.getConstant(SplitSize, DL, IdxTy)); SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec, DAG.getConstant(0, DL, IdxTy)); VecIn.pop_back(); VecIn.push_back(VecIn1); VecIn.push_back(VecIn2); for (unsigned i = 0; i < NumElems; i++) { if (VectorMask[i] <= 0) continue; VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2; } } } } // TODO: We want to sort the vectors by descending length, so that adjacent // pairs have similar length, and the longer vector is always first in the // pair. // TODO: Should this fire if some of the input vectors has illegal type (like // it does now), or should we let legalization run its course first? // Shuffle phase: // Take pairs of vectors, and shuffle them so that the result has elements // from these vectors in the correct places. // For example, given: // t10: i32 = extract_vector_elt t1, Constant:i64<0> // t11: i32 = extract_vector_elt t2, Constant:i64<0> // t12: i32 = extract_vector_elt t3, Constant:i64<0> // t13: i32 = extract_vector_elt t1, Constant:i64<1> // t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13 // We will generate: // t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2 // t21: v4i32 = vector_shuffle t3, undef SmallVector Shuffles; for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) { unsigned LeftIdx = 2 * In + 1; SDValue VecLeft = VecIn[LeftIdx]; SDValue VecRight = (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue(); if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft, VecRight, LeftIdx)) Shuffles.push_back(Shuffle); else return SDValue(); } // If we need the zero vector as an "ingredient" in the blend tree, add it // to the list of shuffles. if (UsesZeroVector) Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT) : DAG.getConstantFP(0.0, DL, VT)); // If we only have one shuffle, we're done. if (Shuffles.size() == 1) return Shuffles[0]; // Update the vector mask to point to the post-shuffle vectors. for (int &Vec : VectorMask) if (Vec == 0) Vec = Shuffles.size() - 1; else Vec = (Vec - 1) / 2; // More than one shuffle. Generate a binary tree of blends, e.g. if from // the previous step we got the set of shuffles t10, t11, t12, t13, we will // generate: // t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2 // t11: v8i32 = vector_shuffle t3, t4 // t12: v8i32 = vector_shuffle t5, t6 // t13: v8i32 = vector_shuffle t7, t8 // t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11 // t21: v8i32 = vector_shuffle t12, t13 // t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21 // Make sure the initial size of the shuffle list is even. if (Shuffles.size() % 2) Shuffles.push_back(DAG.getUNDEF(VT)); for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) { if (CurSize % 2) { Shuffles[CurSize] = DAG.getUNDEF(VT); CurSize++; } for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) { int Left = 2 * In; int Right = 2 * In + 1; SmallVector Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { if (VectorMask[i] == Left) { Mask[i] = i; VectorMask[i] = In; } else if (VectorMask[i] == Right) { Mask[i] = i + NumElems; VectorMask[i] = In; } } Shuffles[In] = DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask); } } return Shuffles[0]; } // Try to turn a build vector of zero extends of extract vector elts into a // a vector zero extend and possibly an extract subvector. // TODO: Support sign extend or any extend? // TODO: Allow undef elements? // TODO: Don't require the extracts to start at element 0. SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) { if (LegalOperations) return SDValue(); EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); auto checkElem = [&](SDValue Op) -> int64_t { if (Op.getOpcode() == ISD::ZERO_EXTEND && Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0)) if (auto *C = dyn_cast(Op.getOperand(0).getOperand(1))) return C->getZExtValue(); return -1; }; // Make sure the first element matches // (zext (extract_vector_elt X, C)) int64_t Offset = checkElem(Op0); if (Offset < 0) return SDValue(); unsigned NumElems = N->getNumOperands(); SDValue In = Op0.getOperand(0).getOperand(0); EVT InSVT = In.getValueType().getScalarType(); EVT InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumElems); // Don't create an illegal input type after type legalization. if (LegalTypes && !TLI.isTypeLegal(InVT)) return SDValue(); // Ensure all the elements come from the same vector and are adjacent. for (unsigned i = 1; i != NumElems; ++i) { if ((Offset + i) != checkElem(N->getOperand(i))) return SDValue(); } SDLoc DL(N); In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In, Op0.getOperand(0).getOperand(1)); return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, In); } SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { EVT VT = N->getValueType(0); // A vector built entirely of undefs is undef. if (ISD::allOperandsUndef(N)) return DAG.getUNDEF(VT); // If this is a splat of a bitcast from another vector, change to a // concat_vector. // For example: // (build_vector (i64 (bitcast (v2i32 X))), (i64 (bitcast (v2i32 X)))) -> // (v2i64 (bitcast (concat_vectors (v2i32 X), (v2i32 X)))) // // If X is a build_vector itself, the concat can become a larger build_vector. // TODO: Maybe this is useful for non-splat too? if (!LegalOperations) { if (SDValue Splat = cast(N)->getSplatValue()) { Splat = peekThroughBitcast(Splat); EVT SrcVT = Splat.getValueType(); if (SrcVT.isVector()) { unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements(); EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), NumElts); if (!LegalTypes || TLI.isTypeLegal(NewVT)) { SmallVector Ops(N->getNumOperands(), Splat); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), NewVT, Ops); return DAG.getBitcast(VT, Concat); } } } } // Check if we can express BUILD VECTOR via subvector extract. if (!LegalTypes && (N->getNumOperands() > 1)) { SDValue Op0 = N->getOperand(0); auto checkElem = [&](SDValue Op) -> uint64_t { if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) && (Op0.getOperand(0) == Op.getOperand(0))) if (auto CNode = dyn_cast(Op.getOperand(1))) return CNode->getZExtValue(); return -1; }; int Offset = checkElem(Op0); for (unsigned i = 0; i < N->getNumOperands(); ++i) { if (Offset + i != checkElem(N->getOperand(i))) { Offset = -1; break; } } if ((Offset == 0) && (Op0.getOperand(0).getValueType() == N->getValueType(0))) return Op0.getOperand(0); if ((Offset != -1) && ((Offset % N->getValueType(0).getVectorNumElements()) == 0)) // IDX must be multiple of output size. return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0), Op0.getOperand(0), Op0.getOperand(1)); } if (SDValue V = convertBuildVecZextToZext(N)) return V; if (SDValue V = reduceBuildVecExtToExtBuildVec(N)) return V; if (SDValue V = reduceBuildVecConvertToConvertBuildVec(N)) return V; if (SDValue V = reduceBuildVecToShuffle(N)) return V; return SDValue(); } static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT OpVT = N->getOperand(0).getValueType(); // If the operands are legal vectors, leave them alone. if (TLI.isTypeLegal(OpVT)) return SDValue(); SDLoc DL(N); EVT VT = N->getValueType(0); SmallVector Ops; EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits()); SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT); // Keep track of what we encounter. bool AnyInteger = false; bool AnyFP = false; for (const SDValue &Op : N->ops()) { if (ISD::BITCAST == Op.getOpcode() && !Op.getOperand(0).getValueType().isVector()) Ops.push_back(Op.getOperand(0)); else if (ISD::UNDEF == Op.getOpcode()) Ops.push_back(ScalarUndef); else return SDValue(); // Note whether we encounter an integer or floating point scalar. // If it's neither, bail out, it could be something weird like x86mmx. EVT LastOpVT = Ops.back().getValueType(); if (LastOpVT.isFloatingPoint()) AnyFP = true; else if (LastOpVT.isInteger()) AnyInteger = true; else return SDValue(); } // If any of the operands is a floating point scalar bitcast to a vector, // use floating point types throughout, and bitcast everything. // Replace UNDEFs by another scalar UNDEF node, of the final desired type. if (AnyFP) { SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits()); ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT); if (AnyInteger) { for (SDValue &Op : Ops) { if (Op.getValueType() == SVT) continue; if (Op.isUndef()) Op = ScalarUndef; else Op = DAG.getBitcast(SVT, Op); } } } EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT, VT.getSizeInBits() / SVT.getSizeInBits()); return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops)); } // Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR // operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at // most two distinct vectors the same size as the result, attempt to turn this // into a legal shuffle. static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); EVT OpVT = N->getOperand(0).getValueType(); int NumElts = VT.getVectorNumElements(); int NumOpElts = OpVT.getVectorNumElements(); SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT); SmallVector Mask; for (SDValue Op : N->ops()) { // Peek through any bitcast. Op = peekThroughBitcast(Op); // UNDEF nodes convert to UNDEF shuffle mask values. if (Op.isUndef()) { Mask.append((unsigned)NumOpElts, -1); continue; } if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR) return SDValue(); // What vector are we extracting the subvector from and at what index? SDValue ExtVec = Op.getOperand(0); // We want the EVT of the original extraction to correctly scale the // extraction index. EVT ExtVT = ExtVec.getValueType(); // Peek through any bitcast. ExtVec = peekThroughBitcast(ExtVec); // UNDEF nodes convert to UNDEF shuffle mask values. if (ExtVec.isUndef()) { Mask.append((unsigned)NumOpElts, -1); continue; } if (!isa(Op.getOperand(1))) return SDValue(); int ExtIdx = Op.getConstantOperandVal(1); // Ensure that we are extracting a subvector from a vector the same // size as the result. if (ExtVT.getSizeInBits() != VT.getSizeInBits()) return SDValue(); // Scale the subvector index to account for any bitcast. int NumExtElts = ExtVT.getVectorNumElements(); if (0 == (NumExtElts % NumElts)) ExtIdx /= (NumExtElts / NumElts); else if (0 == (NumElts % NumExtElts)) ExtIdx *= (NumElts / NumExtElts); else return SDValue(); // At most we can reference 2 inputs in the final shuffle. if (SV0.isUndef() || SV0 == ExtVec) { SV0 = ExtVec; for (int i = 0; i != NumOpElts; ++i) Mask.push_back(i + ExtIdx); } else if (SV1.isUndef() || SV1 == ExtVec) { SV1 = ExtVec; for (int i = 0; i != NumOpElts; ++i) Mask.push_back(i + ExtIdx + NumElts); } else { return SDValue(); } } if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, VT)) return SDValue(); return DAG.getVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0), DAG.getBitcast(VT, SV1), Mask); } SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // If we only have one input vector, we don't need to do any concatenation. if (N->getNumOperands() == 1) return N->getOperand(0); // Check if all of the operands are undefs. EVT VT = N->getValueType(0); if (ISD::allOperandsUndef(N)) return DAG.getUNDEF(VT); // Optimize concat_vectors where all but the first of the vectors are undef. if (std::all_of(std::next(N->op_begin()), N->op_end(), [](const SDValue &Op) { return Op.isUndef(); })) { SDValue In = N->getOperand(0); assert(In.getValueType().isVector() && "Must concat vectors"); // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr). if (In->getOpcode() == ISD::BITCAST && !In->getOperand(0).getValueType().isVector()) { SDValue Scalar = In->getOperand(0); // If the bitcast type isn't legal, it might be a trunc of a legal type; // look through the trunc so we can still do the transform: // concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar) if (Scalar->getOpcode() == ISD::TRUNCATE && !TLI.isTypeLegal(Scalar.getValueType()) && TLI.isTypeLegal(Scalar->getOperand(0).getValueType())) Scalar = Scalar->getOperand(0); EVT SclTy = Scalar->getValueType(0); if (!SclTy.isFloatingPoint() && !SclTy.isInteger()) return SDValue(); // Bail out if the vector size is not a multiple of the scalar size. if (VT.getSizeInBits() % SclTy.getSizeInBits()) return SDValue(); unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits(); if (VNTNumElms < 2) return SDValue(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms); if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType())) return SDValue(); SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar); return DAG.getBitcast(VT, Res); } } // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR. // We have already tested above for an UNDEF only concatenation. // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...)) // -> (BUILD_VECTOR A, B, ..., C, D, ...) auto IsBuildVectorOrUndef = [](const SDValue &Op) { return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode(); }; if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) { SmallVector Opnds; EVT SVT = VT.getScalarType(); EVT MinVT = SVT; if (!SVT.isFloatingPoint()) { // If BUILD_VECTOR are from built from integer, they may have different // operand types. Get the smallest type and truncate all operands to it. bool FoundMinVT = false; for (const SDValue &Op : N->ops()) if (ISD::BUILD_VECTOR == Op.getOpcode()) { EVT OpSVT = Op.getOperand(0).getValueType(); MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT; FoundMinVT = true; } assert(FoundMinVT && "Concat vector type mismatch"); } for (const SDValue &Op : N->ops()) { EVT OpVT = Op.getValueType(); unsigned NumElts = OpVT.getVectorNumElements(); if (ISD::UNDEF == Op.getOpcode()) Opnds.append(NumElts, DAG.getUNDEF(MinVT)); if (ISD::BUILD_VECTOR == Op.getOpcode()) { if (SVT.isFloatingPoint()) { assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch"); Opnds.append(Op->op_begin(), Op->op_begin() + NumElts); } else { for (unsigned i = 0; i != NumElts; ++i) Opnds.push_back( DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i))); } } } assert(VT.getVectorNumElements() == Opnds.size() && "Concat vector type mismatch"); return DAG.getBuildVector(VT, SDLoc(N), Opnds); } // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR. if (SDValue V = combineConcatVectorOfScalars(N, DAG)) return V; // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE. if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) if (SDValue V = combineConcatVectorOfExtracts(N, DAG)) return V; // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR // nodes often generate nop CONCAT_VECTOR nodes. // Scan the CONCAT_VECTOR operands and look for a CONCAT operations that // place the incoming vectors at the exact same location. SDValue SingleSource = SDValue(); unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements(); for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDValue Op = N->getOperand(i); if (Op.isUndef()) continue; // Check if this is the identity extract: if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR) return SDValue(); // Find the single incoming vector for the extract_subvector. if (SingleSource.getNode()) { if (Op.getOperand(0) != SingleSource) return SDValue(); } else { SingleSource = Op.getOperand(0); // Check the source type is the same as the type of the result. // If not, this concat may extend the vector, so we can not // optimize it away. if (SingleSource.getValueType() != N->getValueType(0)) return SDValue(); } unsigned IdentityIndex = i * PartNumElem; ConstantSDNode *CS = dyn_cast(Op.getOperand(1)); // The extract index must be constant. if (!CS) return SDValue(); // Check that we are reading from the identity index. if (CS->getZExtValue() != IdentityIndex) return SDValue(); } if (SingleSource.getNode()) return SingleSource; return SDValue(); } /// If we are extracting a subvector produced by a wide binary operator with at /// at least one operand that was the result of a vector concatenation, then try /// to use the narrow vector operands directly to avoid the concatenation and /// extraction. static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share // some of these bailouts with other transforms. // The extract index must be a constant, so we can map it to a concat operand. auto *ExtractIndex = dyn_cast(Extract->getOperand(1)); if (!ExtractIndex) return SDValue(); // Only handle the case where we are doubling and then halving. A larger ratio // may require more than two narrow binops to replace the wide binop. EVT VT = Extract->getValueType(0); unsigned NumElems = VT.getVectorNumElements(); assert((ExtractIndex->getZExtValue() % NumElems) == 0 && "Extract index is not a multiple of the vector length."); if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2) return SDValue(); // We are looking for an optionally bitcasted wide vector binary operator // feeding an extract subvector. SDValue BinOp = peekThroughBitcast(Extract->getOperand(0)); // TODO: The motivating case for this transform is an x86 AVX1 target. That // target has temptingly almost legal versions of bitwise logic ops in 256-bit // flavors, but no other 256-bit integer support. This could be extended to // handle any binop, but that may require fixing/adding other folds to avoid // codegen regressions. unsigned BOpcode = BinOp.getOpcode(); if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR) return SDValue(); // The binop must be a vector type, so we can chop it in half. EVT WideBVT = BinOp.getValueType(); if (!WideBVT.isVector()) return SDValue(); // Bail out if the target does not support a narrower version of the binop. EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(), WideBVT.getVectorNumElements() / 2); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT)) return SDValue(); // Peek through bitcasts of the binary operator operands if needed. SDValue LHS = peekThroughBitcast(BinOp.getOperand(0)); SDValue RHS = peekThroughBitcast(BinOp.getOperand(1)); // We need at least one concatenation operation of a binop operand to make // this transform worthwhile. The concat must double the input vector sizes. // TODO: Should we also handle INSERT_SUBVECTOR patterns? bool ConcatL = LHS.getOpcode() == ISD::CONCAT_VECTORS && LHS.getNumOperands() == 2; bool ConcatR = RHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getNumOperands() == 2; if (!ConcatL && !ConcatR) return SDValue(); // If one of the binop operands was not the result of a concat, we must // extract a half-sized operand for our new narrow binop. We can't just reuse // the original extract index operand because we may have bitcasted. unsigned ConcatOpNum = ExtractIndex->getZExtValue() / NumElems; unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); EVT ExtBOIdxVT = Extract->getOperand(1).getValueType(); SDLoc DL(Extract); // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N) // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN SDValue X = ConcatL ? DAG.getBitcast(NarrowBVT, LHS.getOperand(ConcatOpNum)) : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, BinOp.getOperand(0), DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); SDValue Y = ConcatR ? DAG.getBitcast(NarrowBVT, RHS.getOperand(ConcatOpNum)) : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, BinOp.getOperand(1), DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y); return DAG.getBitcast(VT, NarrowBinOp); } /// If we are extracting a subvector from a wide vector load, convert to a /// narrow load to eliminate the extraction: /// (extract_subvector (load wide vector)) --> (load narrow vector) static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { // TODO: Add support for big-endian. The offset calculation must be adjusted. if (DAG.getDataLayout().isBigEndian()) return SDValue(); // TODO: The one-use check is overly conservative. Check the cost of the // extract instead or remove that condition entirely. auto *Ld = dyn_cast(Extract->getOperand(0)); auto *ExtIdx = dyn_cast(Extract->getOperand(1)); if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() || !ExtIdx) return SDValue(); // The narrow load will be offset from the base address of the old load if // we are extracting from something besides index 0 (little-endian). EVT VT = Extract->getValueType(0); SDLoc DL(Extract); SDValue BaseAddr = Ld->getOperand(1); unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize(); // TODO: Use "BaseIndexOffset" to make this more effective. SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset, VT.getStoreSize()); SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO); DAG.makeEquivalentMemoryOrdering(Ld, NewLd); return NewLd; } SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { EVT NVT = N->getValueType(0); SDValue V = N->getOperand(0); // Extract from UNDEF is UNDEF. if (V.isUndef()) return DAG.getUNDEF(NVT); if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT)) if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG)) return NarrowLoad; // Combine: // (extract_subvec (concat V1, V2, ...), i) // Into: // Vi if possible // Only operand 0 is checked as 'concat' assumes all inputs of the same // type. if (V->getOpcode() == ISD::CONCAT_VECTORS && isa(N->getOperand(1)) && V->getOperand(0).getValueType() == NVT) { unsigned Idx = N->getConstantOperandVal(1); unsigned NumElems = NVT.getVectorNumElements(); assert((Idx % NumElems) == 0 && "IDX in concat is not a multiple of the result vector length."); return V->getOperand(Idx / NumElems); } // Skip bitcasting V = peekThroughBitcast(V); // If the input is a build vector. Try to make a smaller build vector. if (V->getOpcode() == ISD::BUILD_VECTOR) { if (auto *Idx = dyn_cast(N->getOperand(1))) { EVT InVT = V->getValueType(0); unsigned ExtractSize = NVT.getSizeInBits(); unsigned EltSize = InVT.getScalarSizeInBits(); // Only do this if we won't split any elements. if (ExtractSize % EltSize == 0) { unsigned NumElems = ExtractSize / EltSize; EVT EltVT = InVT.getVectorElementType(); EVT ExtractVT = NumElems == 1 ? EltVT : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems); if ((Level < AfterLegalizeDAG || (NumElems == 1 || TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) && (!LegalTypes || TLI.isTypeLegal(ExtractVT))) { unsigned IdxVal = (Idx->getZExtValue() * NVT.getScalarSizeInBits()) / EltSize; if (NumElems == 1) { SDValue Src = V->getOperand(IdxVal); if (EltVT != Src.getValueType()) Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src); return DAG.getBitcast(NVT, Src); } // Extract the pieces from the original build_vector. SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N), makeArrayRef(V->op_begin() + IdxVal, NumElems)); return DAG.getBitcast(NVT, BuildVec); } } } } if (V->getOpcode() == ISD::INSERT_SUBVECTOR) { // Handle only simple case where vector being inserted and vector // being extracted are of same size. EVT SmallVT = V->getOperand(1).getValueType(); if (!NVT.bitsEq(SmallVT)) return SDValue(); // Only handle cases where both indexes are constants. ConstantSDNode *ExtIdx = dyn_cast(N->getOperand(1)); ConstantSDNode *InsIdx = dyn_cast(V->getOperand(2)); if (InsIdx && ExtIdx) { // Combine: // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx) // Into: // indices are equal or bit offsets are equal => V1 // otherwise => (extract_subvec V1, ExtIdx) if (InsIdx->getZExtValue() * SmallVT.getScalarSizeInBits() == ExtIdx->getZExtValue() * NVT.getScalarSizeInBits()) return DAG.getBitcast(NVT, V->getOperand(1)); return DAG.getNode( ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, DAG.getBitcast(N->getOperand(0).getValueType(), V->getOperand(0)), N->getOperand(1)); } } if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG)) return NarrowBOp; if (SimplifyDemandedVectorElts(SDValue(N, 0))) return SDValue(N, 0); return SDValue(); } // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat, // or turn a shuffle of a single concat into simpler shuffle then concat. static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); ShuffleVectorSDNode *SVN = cast(N); SmallVector Ops; EVT ConcatVT = N0.getOperand(0).getValueType(); unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements(); unsigned NumConcats = NumElts / NumElemsPerConcat; // Special case: shuffle(concat(A,B)) can be more efficiently represented // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high // half vector elements. if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() && std::all_of(SVN->getMask().begin() + NumElemsPerConcat, SVN->getMask().end(), [](int i) { return i == -1; })) { N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1), makeArrayRef(SVN->getMask().begin(), NumElemsPerConcat)); N1 = DAG.getUNDEF(ConcatVT); return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1); } // Look at every vector that's inserted. We're looking for exact // subvector-sized copies from a concatenated vector for (unsigned I = 0; I != NumConcats; ++I) { // Make sure we're dealing with a copy. unsigned Begin = I * NumElemsPerConcat; bool AllUndef = true, NoUndef = true; for (unsigned J = Begin; J != Begin + NumElemsPerConcat; ++J) { if (SVN->getMaskElt(J) >= 0) AllUndef = false; else NoUndef = false; } if (NoUndef) { if (SVN->getMaskElt(Begin) % NumElemsPerConcat != 0) return SDValue(); for (unsigned J = 1; J != NumElemsPerConcat; ++J) if (SVN->getMaskElt(Begin + J - 1) + 1 != SVN->getMaskElt(Begin + J)) return SDValue(); unsigned FirstElt = SVN->getMaskElt(Begin) / NumElemsPerConcat; if (FirstElt < N0.getNumOperands()) Ops.push_back(N0.getOperand(FirstElt)); else Ops.push_back(N1.getOperand(FirstElt - N0.getNumOperands())); } else if (AllUndef) { Ops.push_back(DAG.getUNDEF(N0.getOperand(0).getValueType())); } else { // Mixed with general masks and undefs, can't do optimization. return SDValue(); } } return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); } // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. // // SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always // a simplification in some sense, but it isn't appropriate in general: some // BUILD_VECTORs are substantially cheaper than others. The general case // of a BUILD_VECTOR requires inserting each element individually (or // performing the equivalent in a temporary stack variable). A BUILD_VECTOR of // all constants is a single constant pool load. A BUILD_VECTOR where each // element is identical is a splat. A BUILD_VECTOR where most of the operands // are undef lowers to a small number of element insertions. // // To deal with this, we currently use a bunch of mostly arbitrary heuristics. // We don't fold shuffles where one side is a non-zero constant, and we don't // fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate // non-constant operands. This seems to work out reasonably well in practice. static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, SelectionDAG &DAG, const TargetLowering &TLI) { EVT VT = SVN->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); SDValue N0 = SVN->getOperand(0); SDValue N1 = SVN->getOperand(1); if (!N0->hasOneUse() || !N1->hasOneUse()) return SDValue(); // If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as // discussed above. if (!N1.isUndef()) { bool N0AnyConst = isAnyConstantBuildVector(N0.getNode()); bool N1AnyConst = isAnyConstantBuildVector(N1.getNode()); if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode())) return SDValue(); if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode())) return SDValue(); } // If both inputs are splats of the same value then we can safely merge this // to a single BUILD_VECTOR with undef elements based on the shuffle mask. bool IsSplat = false; auto *BV0 = dyn_cast(N0); auto *BV1 = dyn_cast(N1); if (BV0 && BV1) if (SDValue Splat0 = BV0->getSplatValue()) IsSplat = (Splat0 == BV1->getSplatValue()); SmallVector Ops; SmallSet DuplicateOps; for (int M : SVN->getMask()) { SDValue Op = DAG.getUNDEF(VT.getScalarType()); if (M >= 0) { int Idx = M < (int)NumElts ? M : M - NumElts; SDValue &S = (M < (int)NumElts ? N0 : N1); if (S.getOpcode() == ISD::BUILD_VECTOR) { Op = S.getOperand(Idx); } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) { assert(Idx == 0 && "Unexpected SCALAR_TO_VECTOR operand index."); Op = S.getOperand(0); } else { // Operand can't be combined - bail out. return SDValue(); } } // Don't duplicate a non-constant BUILD_VECTOR operand unless we're // generating a splat; semantically, this is fine, but it's likely to // generate low-quality code if the target can't reconstruct an appropriate // shuffle. if (!Op.isUndef() && !isa(Op) && !isa(Op)) if (!IsSplat && !DuplicateOps.insert(Op).second) return SDValue(); Ops.push_back(Op); } // BUILD_VECTOR requires all inputs to be of the same type, find the // maximum type and extend them all. EVT SVT = VT.getScalarType(); if (SVT.isInteger()) for (SDValue &Op : Ops) SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT); if (SVT != VT.getScalarType()) for (SDValue &Op : Ops) Op = TLI.isZExtFree(Op.getValueType(), SVT) ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT) : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT); return DAG.getBuildVector(VT, SDLoc(SVN), Ops); } // Match shuffles that can be converted to any_vector_extend_in_reg. // This is often generated during legalization. // e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)) // TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case. static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, SelectionDAG &DAG, const TargetLowering &TLI, bool LegalOperations, bool LegalTypes) { EVT VT = SVN->getValueType(0); bool IsBigEndian = DAG.getDataLayout().isBigEndian(); // TODO Add support for big-endian when we have a test case. if (!VT.isInteger() || IsBigEndian) return SDValue(); unsigned NumElts = VT.getVectorNumElements(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); ArrayRef Mask = SVN->getMask(); SDValue N0 = SVN->getOperand(0); // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32)) auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) { for (unsigned i = 0; i != NumElts; ++i) { if (Mask[i] < 0) continue; if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale)) continue; return false; } return true; }; // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for // power-of-2 extensions as they are the most likely. for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) { // Check for non power of 2 vector sizes if (NumElts % Scale != 0) continue; if (!isAnyExtend(Scale)) continue; EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale); EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale); if (!LegalTypes || TLI.isTypeLegal(OutVT)) if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT)) return DAG.getBitcast(VT, DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT)); } return SDValue(); } // Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of // each source element of a large type into the lowest elements of a smaller // destination type. This is often generated during legalization. // If the source node itself was a '*_extend_vector_inreg' node then we should // then be able to remove it. static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) { EVT VT = SVN->getValueType(0); bool IsBigEndian = DAG.getDataLayout().isBigEndian(); // TODO Add support for big-endian when we have a test case. if (!VT.isInteger() || IsBigEndian) return SDValue(); SDValue N0 = peekThroughBitcast(SVN->getOperand(0)); unsigned Opcode = N0.getOpcode(); if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG && Opcode != ISD::SIGN_EXTEND_VECTOR_INREG && Opcode != ISD::ZERO_EXTEND_VECTOR_INREG) return SDValue(); SDValue N00 = N0.getOperand(0); ArrayRef Mask = SVN->getMask(); unsigned NumElts = VT.getVectorNumElements(); unsigned EltSizeInBits = VT.getScalarSizeInBits(); unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits(); unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits(); if (ExtDstSizeInBits % ExtSrcSizeInBits != 0) return SDValue(); unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits; // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2-1,-1> // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1> // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1> auto isTruncate = [&Mask, &NumElts](unsigned Scale) { for (unsigned i = 0; i != NumElts; ++i) { if (Mask[i] < 0) continue; if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale)) continue; return false; } return true; }; // At the moment we just handle the case where we've truncated back to the // same size as before the extension. // TODO: handle more extension/truncation cases as cases arise. if (EltSizeInBits != ExtSrcSizeInBits) return SDValue(); // We can remove *extend_vector_inreg only if the truncation happens at // the same scale as the extension. if (isTruncate(ExtScale)) return DAG.getBitcast(VT, N00); return SDValue(); } // Combine shuffles of splat-shuffles of the form: // shuffle (shuffle V, undef, splat-mask), undef, M // If splat-mask contains undef elements, we need to be careful about // introducing undef's in the folded mask which are not the result of composing // the masks of the shuffles. static SDValue combineShuffleOfSplat(ArrayRef UserMask, ShuffleVectorSDNode *Splat, SelectionDAG &DAG) { ArrayRef SplatMask = Splat->getMask(); assert(UserMask.size() == SplatMask.size() && "Mask length mismatch"); // Prefer simplifying to the splat-shuffle, if possible. This is legal if // every undef mask element in the splat-shuffle has a corresponding undef // element in the user-shuffle's mask or if the composition of mask elements // would result in undef. // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask): // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u] // In this case it is not legal to simplify to the splat-shuffle because we // may be exposing the users of the shuffle an undef element at index 1 // which was not there before the combine. // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u] // In this case the composition of masks yields SplatMask, so it's ok to // simplify to the splat-shuffle. // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u] // In this case the composed mask includes all undef elements of SplatMask // and in addition sets element zero to undef. It is safe to simplify to // the splat-shuffle. auto CanSimplifyToExistingSplat = [](ArrayRef UserMask, ArrayRef SplatMask) { for (unsigned i = 0, e = UserMask.size(); i != e; ++i) if (UserMask[i] != -1 && SplatMask[i] == -1 && SplatMask[UserMask[i]] != -1) return false; return true; }; if (CanSimplifyToExistingSplat(UserMask, SplatMask)) return SDValue(Splat, 0); // Create a new shuffle with a mask that is composed of the two shuffles' // masks. SmallVector NewMask; for (int Idx : UserMask) NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]); return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat), Splat->getOperand(0), Splat->getOperand(1), NewMask); } /// If the shuffle mask is taking exactly one element from the first vector /// operand and passing through all other elements from the second vector /// operand, return the index of the mask element that is choosing an element /// from the first operand. Otherwise, return -1. static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef Mask) { int MaskSize = Mask.size(); int EltFromOp0 = -1; // TODO: This does not match if there are undef elements in the shuffle mask. // Should we ignore undefs in the shuffle mask instead? The trade-off is // removing an instruction (a shuffle), but losing the knowledge that some // vector lanes are not needed. for (int i = 0; i != MaskSize; ++i) { if (Mask[i] >= 0 && Mask[i] < MaskSize) { // We're looking for a shuffle of exactly one element from operand 0. if (EltFromOp0 != -1) return -1; EltFromOp0 = i; } else if (Mask[i] != i + MaskSize) { // Nothing from operand 1 can change lanes. return -1; } } return EltFromOp0; } /// If a shuffle inserts exactly one element from a source vector operand into /// another vector operand and we can access the specified element as a scalar, /// then we can eliminate the shuffle. static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { // First, check if we are taking one element of a vector and shuffling that // element into another vector. ArrayRef Mask = Shuf->getMask(); SmallVector CommutedMask(Mask.begin(), Mask.end()); SDValue Op0 = Shuf->getOperand(0); SDValue Op1 = Shuf->getOperand(1); int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask); if (ShufOp0Index == -1) { // Commute mask and check again. ShuffleVectorSDNode::commuteMask(CommutedMask); ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask); if (ShufOp0Index == -1) return SDValue(); // Commute operands to match the commuted shuffle mask. std::swap(Op0, Op1); Mask = CommutedMask; } // The shuffle inserts exactly one element from operand 0 into operand 1. // Now see if we can access that element as a scalar via a real insert element // instruction. // TODO: We can try harder to locate the element as a scalar. Examples: it // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant. assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() && "Shuffle mask value must be from operand 0"); if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT) return SDValue(); auto *InsIndexC = dyn_cast(Op0.getOperand(2)); if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index]) return SDValue(); // There's an existing insertelement with constant insertion index, so we // don't need to check the legality/profitability of a replacement operation // that differs at most in the constant value. The target should be able to // lower any of those in a similar way. If not, legalization will expand this // to a scalar-to-vector plus shuffle. // // Note that the shuffle may move the scalar from the position that the insert // element used. Therefore, our new insert element occurs at the shuffle's // mask index value, not the insert's index value. // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C' SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf), Op0.getOperand(2).getValueType()); return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(), Op1, Op0.getOperand(1), NewInsIndex); } SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG"); // Canonicalize shuffle undef, undef -> undef if (N0.isUndef() && N1.isUndef()) return DAG.getUNDEF(VT); ShuffleVectorSDNode *SVN = cast(N); // Canonicalize shuffle v, v -> v, undef if (N0 == N1) { SmallVector NewMask; for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if (Idx >= (int)NumElts) Idx -= NumElts; NewMask.push_back(Idx); } return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask); } // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask. if (N0.isUndef()) return DAG.getCommutedVectorShuffle(*SVN); // Remove references to rhs if it is undef if (N1.isUndef()) { bool Changed = false; SmallVector NewMask; for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if (Idx >= (int)NumElts) { Idx = -1; Changed = true; } NewMask.push_back(Idx); } if (Changed) return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask); } if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG)) return InsElt; // A shuffle of a single vector that is a splat can always be folded. if (auto *N0Shuf = dyn_cast(N0)) if (N1->isUndef() && N0Shuf->isSplat()) return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG); // If it is a splat, check if the argument vector is another splat or a // build_vector. if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { SDNode *V = N0.getNode(); // If this is a bit convert that changes the element type of the vector but // not the number of vector elements, look through it. Be careful not to // look though conversions that change things like v4f32 to v2f64. if (V->getOpcode() == ISD::BITCAST) { SDValue ConvInput = V->getOperand(0); if (ConvInput.getValueType().isVector() && ConvInput.getValueType().getVectorNumElements() == NumElts) V = ConvInput.getNode(); } if (V->getOpcode() == ISD::BUILD_VECTOR) { assert(V->getNumOperands() == NumElts && "BUILD_VECTOR has wrong number of operands"); SDValue Base; bool AllSame = true; for (unsigned i = 0; i != NumElts; ++i) { if (!V->getOperand(i).isUndef()) { Base = V->getOperand(i); break; } } // Splat of , return if (!Base.getNode()) return N0; for (unsigned i = 0; i != NumElts; ++i) { if (V->getOperand(i) != Base) { AllSame = false; break; } } // Splat of , return if (AllSame) return N0; // Canonicalize any other splat as a build_vector. const SDValue &Splatted = V->getOperand(SVN->getSplatIndex()); SmallVector Ops(NumElts, Splatted); SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops); // We may have jumped through bitcasts, so the type of the // BUILD_VECTOR may not match the type of the shuffle. if (V->getValueType(0) != VT) NewBV = DAG.getBitcast(VT, NewBV); return NewBV; } } // Simplify source operands based on shuffle mask. if (SimplifyDemandedVectorElts(SDValue(N, 0))) return SDValue(N, 0); // Match shuffles that can be converted to any_vector_extend_in_reg. if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations, LegalTypes)) return V; // Combine "truncate_vector_in_reg" style shuffles. if (SDValue V = combineTruncationShuffle(SVN, DAG)) return V; if (N0.getOpcode() == ISD::CONCAT_VECTORS && Level < AfterLegalizeVectorOps && (N1.isUndef() || (N1.getOpcode() == ISD::CONCAT_VECTORS && N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) { if (SDValue V = partitionShuffleOfConcats(N, DAG)) return V; } // Attempt to combine a shuffle of 2 inputs of 'scalar sources' - // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR. if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI)) return Res; // If this shuffle only has a single input that is a bitcasted shuffle, // attempt to merge the 2 shuffles and suitably bitcast the inputs/output // back to their original types. if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && N1.isUndef() && Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) { // Peek through the bitcast only if there is one user. SDValue BC0 = N0; while (BC0.getOpcode() == ISD::BITCAST) { if (!BC0.hasOneUse()) break; BC0 = BC0.getOperand(0); } auto ScaleShuffleMask = [](ArrayRef Mask, int Scale) { if (Scale == 1) return SmallVector(Mask.begin(), Mask.end()); SmallVector NewMask; for (int M : Mask) for (int s = 0; s != Scale; ++s) NewMask.push_back(M < 0 ? -1 : Scale * M + s); return NewMask; }; if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) { EVT SVT = VT.getScalarType(); EVT InnerVT = BC0->getValueType(0); EVT InnerSVT = InnerVT.getScalarType(); // Determine which shuffle works with the smaller scalar type. EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT; EVT ScaleSVT = ScaleVT.getScalarType(); if (TLI.isTypeLegal(ScaleVT) && 0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) && 0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) { int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits(); int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits(); // Scale the shuffle masks to the smaller scalar type. ShuffleVectorSDNode *InnerSVN = cast(BC0); SmallVector InnerMask = ScaleShuffleMask(InnerSVN->getMask(), InnerScale); SmallVector OuterMask = ScaleShuffleMask(SVN->getMask(), OuterScale); // Merge the shuffle masks. SmallVector NewMask; for (int M : OuterMask) NewMask.push_back(M < 0 ? -1 : InnerMask[M]); // Test for shuffle mask legality over both commutations. SDValue SV0 = BC0->getOperand(0); SDValue SV1 = BC0->getOperand(1); bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT); if (!LegalMask) { std::swap(SV0, SV1); ShuffleVectorSDNode::commuteMask(NewMask); LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT); } if (LegalMask) { SV0 = DAG.getBitcast(ScaleVT, SV0); SV1 = DAG.getBitcast(ScaleVT, SV1); return DAG.getBitcast( VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask)); } } } } // Canonicalize shuffles according to rules: // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A) // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B) // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B) if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { // The incoming shuffle must be of the same type as the result of the // current shuffle. assert(N1->getOperand(0).getValueType() == VT && "Shuffle types don't match"); SDValue SV0 = N1->getOperand(0); SDValue SV1 = N1->getOperand(1); bool HasSameOp0 = N0 == SV0; bool IsSV1Undef = SV1.isUndef(); if (HasSameOp0 || IsSV1Undef || N0 == SV1) // Commute the operands of this shuffle so that next rule // will trigger. return DAG.getCommutedVectorShuffle(*SVN); } // Try to fold according to rules: // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) // Don't try to fold shuffles with illegal type. // Only fold if this shuffle is the only user of the other shuffle. if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) && Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) { ShuffleVectorSDNode *OtherSV = cast(N0); // Don't try to fold splats; they're likely to simplify somehow, or they // might be free. if (OtherSV->isSplat()) return SDValue(); // The incoming shuffle must be of the same type as the result of the // current shuffle. assert(OtherSV->getOperand(0).getValueType() == VT && "Shuffle types don't match"); SDValue SV0, SV1; SmallVector Mask; // Compute the combined shuffle mask for a shuffle with SV0 as the first // operand, and SV1 as the second operand. for (unsigned i = 0; i != NumElts; ++i) { int Idx = SVN->getMaskElt(i); if (Idx < 0) { // Propagate Undef. Mask.push_back(Idx); continue; } SDValue CurrentVec; if (Idx < (int)NumElts) { // This shuffle index refers to the inner shuffle N0. Lookup the inner // shuffle mask to identify which vector is actually referenced. Idx = OtherSV->getMaskElt(Idx); if (Idx < 0) { // Propagate Undef. Mask.push_back(Idx); continue; } CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0) : OtherSV->getOperand(1); } else { // This shuffle index references an element within N1. CurrentVec = N1; } // Simple case where 'CurrentVec' is UNDEF. if (CurrentVec.isUndef()) { Mask.push_back(-1); continue; } // Canonicalize the shuffle index. We don't know yet if CurrentVec // will be the first or second operand of the combined shuffle. Idx = Idx % NumElts; if (!SV0.getNode() || SV0 == CurrentVec) { // Ok. CurrentVec is the left hand side. // Update the mask accordingly. SV0 = CurrentVec; Mask.push_back(Idx); continue; } // Bail out if we cannot convert the shuffle pair into a single shuffle. if (SV1.getNode() && SV1 != CurrentVec) return SDValue(); // Ok. CurrentVec is the right hand side. // Update the mask accordingly. SV1 = CurrentVec; Mask.push_back(Idx + NumElts); } // Check if all indices in Mask are Undef. In case, propagate Undef. bool isUndefMask = true; for (unsigned i = 0; i != NumElts && isUndefMask; ++i) isUndefMask &= Mask[i] < 0; if (isUndefMask) return DAG.getUNDEF(VT); if (!SV0.getNode()) SV0 = DAG.getUNDEF(VT); if (!SV1.getNode()) SV1 = DAG.getUNDEF(VT); // Avoid introducing shuffles with illegal mask. if (!TLI.isShuffleMaskLegal(Mask, VT)) { ShuffleVectorSDNode::commuteMask(Mask); if (!TLI.isShuffleMaskLegal(Mask, VT)) return SDValue(); // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2) std::swap(SV0, SV1); } // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2) // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2) return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask); } return SDValue(); } SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) { SDValue InVal = N->getOperand(0); EVT VT = N->getValueType(0); // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern // with a VECTOR_SHUFFLE and possible truncate. if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue InVec = InVal->getOperand(0); SDValue EltNo = InVal->getOperand(1); auto InVecT = InVec.getValueType(); if (ConstantSDNode *C0 = dyn_cast(EltNo)) { SmallVector NewMask(InVecT.getVectorNumElements(), -1); int Elt = C0->getZExtValue(); NewMask[0] = Elt; SDValue Val; // If we have an implict truncate do truncate here as long as it's legal. // if it's not legal, this should if (VT.getScalarType() != InVal.getValueType() && InVal.getValueType().isScalarInteger() && isTypeLegal(VT.getScalarType())) { Val = DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal); return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val); } if (VT.getScalarType() == InVecT.getScalarType() && VT.getVectorNumElements() <= InVecT.getVectorNumElements() && TLI.isShuffleMaskLegal(NewMask, VT)) { Val = DAG.getVectorShuffle(InVecT, SDLoc(N), InVec, DAG.getUNDEF(InVecT), NewMask); // If the initial vector is the correct size this shuffle is a // valid result. if (VT == InVecT) return Val; // If not we must truncate the vector. if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) { MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout()); SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy); EVT SubVT = EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(), VT.getVectorNumElements()); Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, Val, ZeroIdx); return Val; } } } } return SDValue(); } SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) { EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); // If inserting an UNDEF, just return the original vector. if (N1.isUndef()) return N0; // For nested INSERT_SUBVECTORs, attempt to combine inner node first to allow // us to pull BITCASTs from input to output. if (N0.hasOneUse() && N0->getOpcode() == ISD::INSERT_SUBVECTOR) if (SDValue NN0 = visitINSERT_SUBVECTOR(N0.getNode())) return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, NN0, N1, N2); // If this is an insert of an extracted vector into an undef vector, we can // just use the input to the extract. if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR && N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT) return N1.getOperand(0); // If we are inserting a bitcast value into an undef, with the same // number of elements, just use the bitcast input of the extract. // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 -> // BITCAST (INSERT_SUBVECTOR UNDEF N1 N2) if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST && N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR && N1.getOperand(0).getOperand(1) == N2 && N1.getOperand(0).getOperand(0).getValueType().getVectorNumElements() == VT.getVectorNumElements() && N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() == VT.getSizeInBits()) { return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0)); } // If both N1 and N2 are bitcast values on which insert_subvector // would makes sense, pull the bitcast through. // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 -> // BITCAST (INSERT_SUBVECTOR N0 N1 N2) if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) { SDValue CN0 = N0.getOperand(0); SDValue CN1 = N1.getOperand(0); EVT CN0VT = CN0.getValueType(); EVT CN1VT = CN1.getValueType(); if (CN0VT.isVector() && CN1VT.isVector() && CN0VT.getVectorElementType() == CN1VT.getVectorElementType() && CN0VT.getVectorNumElements() == VT.getVectorNumElements()) { SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), CN0.getValueType(), CN0, CN1, N2); return DAG.getBitcast(VT, NewINSERT); } } // Combine INSERT_SUBVECTORs where we are inserting to the same index. // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx ) // --> INSERT_SUBVECTOR( Vec, SubNew, Idx ) if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(1).getValueType() == N1.getValueType() && N0.getOperand(2) == N2) return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), N1, N2); if (!isa(N2)) return SDValue(); unsigned InsIdx = cast(N2)->getZExtValue(); // Canonicalize insert_subvector dag nodes. // Example: // (insert_subvector (insert_subvector A, Idx0), Idx1) // -> (insert_subvector (insert_subvector A, Idx1), Idx0) if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() && N1.getValueType() == N0.getOperand(1).getValueType() && isa(N0.getOperand(2))) { unsigned OtherIdx = N0.getConstantOperandVal(2); if (InsIdx < OtherIdx) { // Swap nodes. SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), N1, N2); AddToWorklist(NewOp.getNode()); return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()), VT, NewOp, N0.getOperand(1), N0.getOperand(2)); } } // If the input vector is a concatenation, and the insert replaces // one of the pieces, we can optimize into a single concat_vectors. if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() && N0.getOperand(0).getValueType() == N1.getValueType()) { unsigned Factor = N1.getValueType().getVectorNumElements(); SmallVector Ops(N0->op_begin(), N0->op_end()); Ops[cast(N2)->getZExtValue() / Factor] = N1; return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops); } return SDValue(); } SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) { SDValue N0 = N->getOperand(0); // fold (fp_to_fp16 (fp16_to_fp op)) -> op if (N0->getOpcode() == ISD::FP16_TO_FP) return N0->getOperand(0); return SDValue(); } SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op) if (N0->getOpcode() == ISD::AND) { ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1)); if (AndConst && AndConst->getAPIntValue() == 0xffff) { return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), N0.getOperand(0)); } } return SDValue(); } /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle /// with the destination vector and a zero vector. /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==> /// vector_shuffle V, Zero, <0, 4, 2, 4> SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) { assert(N->getOpcode() == ISD::AND && "Unexpected opcode!"); EVT VT = N->getValueType(0); SDValue LHS = N->getOperand(0); SDValue RHS = peekThroughBitcast(N->getOperand(1)); SDLoc DL(N); // Make sure we're not running after operation legalization where it // may have custom lowered the vector shuffles. if (LegalOperations) return SDValue(); if (RHS.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); EVT RVT = RHS.getValueType(); unsigned NumElts = RHS.getNumOperands(); // Attempt to create a valid clear mask, splitting the mask into // sub elements and checking to see if each is // all zeros or all ones - suitable for shuffle masking. auto BuildClearMask = [&](int Split) { int NumSubElts = NumElts * Split; int NumSubBits = RVT.getScalarSizeInBits() / Split; SmallVector Indices; for (int i = 0; i != NumSubElts; ++i) { int EltIdx = i / Split; int SubIdx = i % Split; SDValue Elt = RHS.getOperand(EltIdx); if (Elt.isUndef()) { Indices.push_back(-1); continue; } APInt Bits; if (isa(Elt)) Bits = cast(Elt)->getAPIntValue(); else if (isa(Elt)) Bits = cast(Elt)->getValueAPF().bitcastToAPInt(); else return SDValue(); // Extract the sub element from the constant bit mask. if (DAG.getDataLayout().isBigEndian()) { Bits.lshrInPlace((Split - SubIdx - 1) * NumSubBits); } else { Bits.lshrInPlace(SubIdx * NumSubBits); } if (Split > 1) Bits = Bits.trunc(NumSubBits); if (Bits.isAllOnesValue()) Indices.push_back(i); else if (Bits == 0) Indices.push_back(i + NumSubElts); else return SDValue(); } // Let's see if the target supports this vector_shuffle. EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits); EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts); if (!TLI.isVectorClearMaskLegal(Indices, ClearVT)) return SDValue(); SDValue Zero = DAG.getConstant(0, DL, ClearVT); return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, DL, DAG.getBitcast(ClearVT, LHS), Zero, Indices)); }; // Determine maximum split level (byte level masking). int MaxSplit = 1; if (RVT.getScalarSizeInBits() % 8 == 0) MaxSplit = RVT.getScalarSizeInBits() / 8; for (int Split = 1; Split <= MaxSplit; ++Split) if (RVT.getScalarSizeInBits() % Split == 0) if (SDValue S = BuildClearMask(Split)) return S; return SDValue(); } /// Visit a binary vector operation, like ADD. SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) { assert(N->getValueType(0).isVector() && "SimplifyVBinOp only works on vectors!"); SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); SDValue Ops[] = {LHS, RHS}; // See if we can constant fold the vector operation. if (SDValue Fold = DAG.FoldConstantVectorArithmetic( N->getOpcode(), SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags())) return Fold; // Type legalization might introduce new shuffles in the DAG. // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask))) // -> (shuffle (VBinOp (A, B)), Undef, Mask). if (LegalTypes && isa(LHS) && isa(RHS) && LHS.hasOneUse() && RHS.hasOneUse() && LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef()) { ShuffleVectorSDNode *SVN0 = cast(LHS); ShuffleVectorSDNode *SVN1 = cast(RHS); if (SVN0->getMask().equals(SVN1->getMask())) { EVT VT = N->getValueType(0); SDValue UndefVector = LHS.getOperand(1); SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS.getOperand(0), RHS.getOperand(0), N->getFlags()); AddUsersToWorklist(N); return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector, SVN0->getMask()); } } return SDValue(); } SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2) { assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!"); SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2, cast(N0.getOperand(2))->get()); // If we got a simplified select_cc node back from SimplifySelectCC, then // break it down into a new SETCC node, and a new SELECT node, and then return // the SELECT node, since we were called with a SELECT node. if (SCC.getNode()) { // Check to see if we got a select_cc back (to turn into setcc/select). // Otherwise, just return whatever node we got back, like fabs. if (SCC.getOpcode() == ISD::SELECT_CC) { SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0), N0.getValueType(), SCC.getOperand(0), SCC.getOperand(1), SCC.getOperand(4)); AddToWorklist(SETCC.getNode()); return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC, SCC.getOperand(2), SCC.getOperand(3)); } return SCC; } return SDValue(); } /// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values /// being selected between, see if we can simplify the select. Callers of this /// should assume that TheSelect is deleted if this returns true. As such, they /// should return the appropriate thing (e.g. the node) back to the top-level of /// the DAG combiner loop to avoid it being looked at. bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS, SDValue RHS) { // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x)) // The select + setcc is redundant, because fsqrt returns NaN for X < 0. if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) { if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) { // We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?)) SDValue Sqrt = RHS; ISD::CondCode CC; SDValue CmpLHS; const ConstantFPSDNode *Zero = nullptr; if (TheSelect->getOpcode() == ISD::SELECT_CC) { CC = cast(TheSelect->getOperand(4))->get(); CmpLHS = TheSelect->getOperand(0); Zero = isConstOrConstSplatFP(TheSelect->getOperand(1)); } else { // SELECT or VSELECT SDValue Cmp = TheSelect->getOperand(0); if (Cmp.getOpcode() == ISD::SETCC) { CC = cast(Cmp.getOperand(2))->get(); CmpLHS = Cmp.getOperand(0); Zero = isConstOrConstSplatFP(Cmp.getOperand(1)); } } if (Zero && Zero->isZero() && Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT || CC == ISD::SETULT || CC == ISD::SETLT)) { // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x)) CombineTo(TheSelect, Sqrt); return true; } } } // Cannot simplify select with vector condition if (TheSelect->getOperand(0).getValueType().isVector()) return false; // If this is a select from two identical things, try to pull the operation // through the select. if (LHS.getOpcode() != RHS.getOpcode() || !LHS.hasOneUse() || !RHS.hasOneUse()) return false; // If this is a load and the token chain is identical, replace the select // of two loads with a load through a select of the address to load from. // This triggers in things like "select bool X, 10.0, 123.0" after the FP // constants have been dropped into the constant pool. if (LHS.getOpcode() == ISD::LOAD) { LoadSDNode *LLD = cast(LHS); LoadSDNode *RLD = cast(RHS); // Token chains must be identical. if (LHS.getOperand(0) != RHS.getOperand(0) || // Do not let this transformation reduce the number of volatile loads. LLD->isVolatile() || RLD->isVolatile() || // FIXME: If either is a pre/post inc/dec load, // we'd need to split out the address adjustment. LLD->isIndexed() || RLD->isIndexed() || // If this is an EXTLOAD, the VT's must match. LLD->getMemoryVT() != RLD->getMemoryVT() || // If this is an EXTLOAD, the kind of extension must match. (LLD->getExtensionType() != RLD->getExtensionType() && // The only exception is if one of the extensions is anyext. LLD->getExtensionType() != ISD::EXTLOAD && RLD->getExtensionType() != ISD::EXTLOAD) || // FIXME: this discards src value information. This is // over-conservative. It would be beneficial to be able to remember // both potential memory locations. Since we are discarding // src value info, don't do the transformation if the memory // locations are not in the default address space. LLD->getPointerInfo().getAddrSpace() != 0 || RLD->getPointerInfo().getAddrSpace() != 0 || !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(), LLD->getBasePtr().getValueType())) return false; // Check that the select condition doesn't reach either load. If so, // folding this will induce a cycle into the DAG. If not, this is safe to // xform, so create a select of the addresses. SDValue Addr; if (TheSelect->getOpcode() == ISD::SELECT) { SDNode *CondNode = TheSelect->getOperand(0).getNode(); if ((LLD->hasAnyUseOfValue(1) && LLD->isPredecessorOf(CondNode)) || (RLD->hasAnyUseOfValue(1) && RLD->isPredecessorOf(CondNode))) return false; // The loads must not depend on one another. if (LLD->isPredecessorOf(RLD) || RLD->isPredecessorOf(LLD)) return false; Addr = DAG.getSelect(SDLoc(TheSelect), LLD->getBasePtr().getValueType(), TheSelect->getOperand(0), LLD->getBasePtr(), RLD->getBasePtr()); } else { // Otherwise SELECT_CC SDNode *CondLHS = TheSelect->getOperand(0).getNode(); SDNode *CondRHS = TheSelect->getOperand(1).getNode(); if ((LLD->hasAnyUseOfValue(1) && (LLD->isPredecessorOf(CondLHS) || LLD->isPredecessorOf(CondRHS))) || (RLD->hasAnyUseOfValue(1) && (RLD->isPredecessorOf(CondLHS) || RLD->isPredecessorOf(CondRHS)))) return false; Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect), LLD->getBasePtr().getValueType(), TheSelect->getOperand(0), TheSelect->getOperand(1), LLD->getBasePtr(), RLD->getBasePtr(), TheSelect->getOperand(4)); } SDValue Load; // It is safe to replace the two loads if they have different alignments, // but the new load must be the minimum (most restrictive) alignment of the // inputs. unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment()); MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags(); if (!RLD->isInvariant()) MMOFlags &= ~MachineMemOperand::MOInvariant; if (!RLD->isDereferenceable()) MMOFlags &= ~MachineMemOperand::MODereferenceable; if (LLD->getExtensionType() == ISD::NON_EXTLOAD) { // FIXME: Discards pointer and AA info. Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect), LLD->getChain(), Addr, MachinePointerInfo(), Alignment, MMOFlags); } else { // FIXME: Discards pointer and AA info. Load = DAG.getExtLoad( LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType() : LLD->getExtensionType(), SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr, MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags); } // Users of the select now use the result of the load. CombineTo(TheSelect, Load); // Users of the old loads now use the new load's chain. We know the // old-load value is dead now. CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1)); CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1)); return true; } return false; } /// Try to fold an expression of the form (N0 cond N1) ? N2 : N3 to a shift and /// bitwise 'and'. SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC) { // If this is a select where the false operand is zero and the compare is a // check of the sign bit, see if we can perform the "gzip trick": // select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A // select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A EVT XType = N0.getValueType(); EVT AType = N2.getValueType(); if (!isNullConstant(N3) || !XType.bitsGE(AType)) return SDValue(); // If the comparison is testing for a positive value, we have to invert // the sign bit mask, so only do that transform if the target has a bitwise // 'and not' instruction (the invert is free). if (CC == ISD::SETGT && TLI.hasAndNot(N2)) { // (X > -1) ? A : 0 // (X > 0) ? X : 0 <-- This is canonical signed max. if (!(isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2))) return SDValue(); } else if (CC == ISD::SETLT) { // (X < 0) ? A : 0 // (X < 1) ? X : 0 <-- This is un-canonicalized signed min. if (!(isNullConstant(N1) || (isOneConstant(N1) && N0 == N2))) return SDValue(); } else { return SDValue(); } // and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit // constant. EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType()); auto *N2C = dyn_cast(N2.getNode()); if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) { unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1; SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy); SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt); AddToWorklist(Shift.getNode()); if (XType.bitsGT(AType)) { Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); AddToWorklist(Shift.getNode()); } if (CC == ISD::SETGT) Shift = DAG.getNOT(DL, Shift, AType); return DAG.getNode(ISD::AND, DL, AType, Shift, N2); } SDValue ShiftAmt = DAG.getConstant(XType.getSizeInBits() - 1, DL, ShiftAmtTy); SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt); AddToWorklist(Shift.getNode()); if (XType.bitsGT(AType)) { Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift); AddToWorklist(Shift.getNode()); } if (CC == ISD::SETGT) Shift = DAG.getNOT(DL, Shift, AType); return DAG.getNode(ISD::AND, DL, AType, Shift, N2); } /// Simplify an expression of the form (N0 cond N1) ? N2 : N3 /// where 'cond' is the comparison specified by CC. SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3, ISD::CondCode CC, bool NotExtCompare) { // (x ? y : y) -> y. if (N2 == N3) return N2; EVT VT = N2.getValueType(); ConstantSDNode *N1C = dyn_cast(N1.getNode()); ConstantSDNode *N2C = dyn_cast(N2.getNode()); // Determine if the condition we're dealing with is constant SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1, CC, DL, false); if (SCC.getNode()) AddToWorklist(SCC.getNode()); if (ConstantSDNode *SCCC = dyn_cast_or_null(SCC.getNode())) { // fold select_cc true, x, y -> x // fold select_cc false, x, y -> y return !SCCC->isNullValue() ? N2 : N3; } // Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)" // where "tmp" is a constant pool entry containing an array with 1.0 and 2.0 // in it. This is a win when the constant is not otherwise available because // it replaces two constant pool loads with one. We only do this if the FP // type is known to be legal, because if it isn't, then we are before legalize // types an we want the other legalization to happen first (e.g. to avoid // messing with soft float) and if the ConstantFP is not legal, because if // it is legal, we may not need to store the FP constant in a constant pool. if (ConstantFPSDNode *TV = dyn_cast(N2)) if (ConstantFPSDNode *FV = dyn_cast(N3)) { if (TLI.isTypeLegal(N2.getValueType()) && (TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) != TargetLowering::Legal && !TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) && !TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) && // If both constants have multiple uses, then we won't need to do an // extra load, they are likely around in registers for other users. (TV->hasOneUse() || FV->hasOneUse())) { Constant *Elts[] = { const_cast(FV->getConstantFPValue()), const_cast(TV->getConstantFPValue()) }; Type *FPTy = Elts[0]->getType(); const DataLayout &TD = DAG.getDataLayout(); // Create a ConstantArray of the two constants. Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts); SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()), TD.getPrefTypeAlignment(FPTy)); unsigned Alignment = cast(CPIdx)->getAlignment(); // Get the offsets to the 0 and 1 element of the array so that we can // select between them. SDValue Zero = DAG.getIntPtrConstant(0, DL); unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType()); SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV)); SDValue Cond = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC); AddToWorklist(Cond.getNode()); SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero); AddToWorklist(CstOffset.getNode()); CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset); AddToWorklist(CPIdx.getNode()); return DAG.getLoad( TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx, MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment); } } if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC)) return V; // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A) // where y is has a single bit set. // A plaintext description would be, we can turn the SELECT_CC into an AND // when the condition can be materialized as an all-ones register. Any // single bit-test can be materialized as an all-ones register with // shift-left and shift-right-arith. if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND && N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) { SDValue AndLHS = N0->getOperand(0); ConstantSDNode *ConstAndRHS = dyn_cast(N0->getOperand(1)); if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) { // Shift the tested bit over the sign bit. const APInt &AndMask = ConstAndRHS->getAPIntValue(); SDValue ShlAmt = DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS), getShiftAmountTy(AndLHS.getValueType())); SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt); // Now arithmetic right shift it all the way over, so the result is either // all-ones, or zero. SDValue ShrAmt = DAG.getConstant(AndMask.getBitWidth() - 1, SDLoc(Shl), getShiftAmountTy(Shl.getValueType())); SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt); return DAG.getNode(ISD::AND, DL, VT, Shr, N3); } } // fold select C, 16, 0 -> shl C, 4 if (N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2() && TLI.getBooleanContents(N0.getValueType()) == TargetLowering::ZeroOrOneBooleanContent) { // If the caller doesn't want us to simplify this into a zext of a compare, // don't do it. if (NotExtCompare && N2C->isOne()) return SDValue(); // Get a SetCC of the condition // NOTE: Don't create a SETCC if it's not legal on this target. if (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N0.getValueType())) { SDValue Temp, SCC; // cast from setcc result type to select result type if (LegalTypes) { SCC = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC); if (N2.getValueType().bitsLT(SCC.getValueType())) Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), N2.getValueType()); else Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), N2.getValueType(), SCC); } else { SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC); Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), N2.getValueType(), SCC); } AddToWorklist(SCC.getNode()); AddToWorklist(Temp.getNode()); if (N2C->isOne()) return Temp; // shl setcc result by log2 n2c return DAG.getNode( ISD::SHL, DL, N2.getValueType(), Temp, DAG.getConstant(N2C->getAPIntValue().logBase2(), SDLoc(Temp), getShiftAmountTy(Temp.getValueType()))); } } // Check to see if this is an integer abs. // select_cc setg[te] X, 0, X, -X -> // select_cc setgt X, -1, X, -X -> // select_cc setl[te] X, 0, -X, X -> // select_cc setlt X, 1, -X, X -> // Y = sra (X, size(X)-1); xor (add (X, Y), Y) if (N1C) { ConstantSDNode *SubC = nullptr; if (((N1C->isNullValue() && (CC == ISD::SETGT || CC == ISD::SETGE)) || (N1C->isAllOnesValue() && CC == ISD::SETGT)) && N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1)) SubC = dyn_cast(N3.getOperand(0)); else if (((N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE)) || (N1C->isOne() && CC == ISD::SETLT)) && N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1)) SubC = dyn_cast(N2.getOperand(0)); EVT XType = N0.getValueType(); if (SubC && SubC->isNullValue() && XType.isInteger()) { SDLoc DL(N0); SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, DAG.getConstant(XType.getSizeInBits() - 1, DL, getShiftAmountTy(N0.getValueType()))); SDValue Add = DAG.getNode(ISD::ADD, DL, XType, N0, Shift); AddToWorklist(Shift.getNode()); AddToWorklist(Add.getNode()); return DAG.getNode(ISD::XOR, DL, XType, Add, Shift); } } // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X) // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X) // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X) // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X) // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X) // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X) // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X) // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X) if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) { SDValue ValueOnZero = N2; SDValue Count = N3; // If the condition is NE instead of E, swap the operands. if (CC == ISD::SETNE) std::swap(ValueOnZero, Count); // Check if the value on zero is a constant equal to the bits in the type. if (auto *ValueOnZeroC = dyn_cast(ValueOnZero)) { if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) { // If the other operand is cttz/cttz_zero_undef of N0, and cttz is // legal, combine to just cttz. if ((Count.getOpcode() == ISD::CTTZ || Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) && N0 == Count.getOperand(0) && (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT))) return DAG.getNode(ISD::CTTZ, DL, VT, N0); // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is // legal, combine to just ctlz. if ((Count.getOpcode() == ISD::CTLZ || Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) && N0 == Count.getOperand(0) && (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT))) return DAG.getNode(ISD::CTLZ, DL, VT, N0); } } } return SDValue(); } /// This is a stub for TargetLowering::SimplifySetCC. SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, bool foldBooleans) { TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this); return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL); } /// Given an ISD::SDIV node expressing a divide by constant, return /// a DAG expression to select that will generate the same value by multiplying /// by a magic number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". SDValue DAGCombiner::BuildSDIV(SDNode *N) { // when optimising for minimum size, we don't want to expand a div to a mul // and a shift. if (DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); if (!C) return SDValue(); // Avoid division by zero. if (C->isNullValue()) return SDValue(); SmallVector Built; SDValue S = TLI.BuildSDIV(N, C->getAPIntValue(), DAG, LegalOperations, Built); for (SDNode *N : Built) AddToWorklist(N); return S; } /// Given an ISD::SDIV node expressing a divide by constant power of 2, return a /// DAG expression that will generate the same value by right shifting. SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) { ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); if (!C) return SDValue(); // Avoid division by zero. if (C->isNullValue()) return SDValue(); SmallVector Built; SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, Built); for (SDNode *N : Built) AddToWorklist(N); return S; } /// Given an ISD::UDIV node expressing a divide by constant, return a DAG /// expression that will generate the same value by multiplying by a magic /// number. /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide". SDValue DAGCombiner::BuildUDIV(SDNode *N) { // when optimising for minimum size, we don't want to expand a div to a mul // and a shift. if (DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1)); if (!C) return SDValue(); // Avoid division by zero. if (C->isNullValue()) return SDValue(); SmallVector Built; SDValue S = TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, Built); for (SDNode *N : Built) AddToWorklist(N); return S; } /// Determines the LogBase2 value for a non-null input value using the /// transform: LogBase2(V) = (EltBits - 1) - ctlz(V). SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) { EVT VT = V.getValueType(); unsigned EltBits = VT.getScalarSizeInBits(); SDValue Ctlz = DAG.getNode(ISD::CTLZ, DL, VT, V); SDValue Base = DAG.getConstant(EltBits - 1, DL, VT); SDValue LogBase2 = DAG.getNode(ISD::SUB, DL, VT, Base, Ctlz); return LogBase2; } /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) /// For the reciprocal, we need to find the zero of the function: /// F(X) = A X - 1 [which has a zero at X = 1/A] /// => /// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form /// does not require additional intermediate precision] SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) { if (Level >= AfterLegalizeDAG) return SDValue(); // TODO: Handle half and/or extended types? EVT VT = Op.getValueType(); if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64) return SDValue(); // If estimates are explicitly disabled for this function, we're done. MachineFunction &MF = DAG.getMachineFunction(); int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF); if (Enabled == TLI.ReciprocalEstimate::Disabled) return SDValue(); // Estimates may be explicitly enabled for this type with a custom number of // refinement steps. int Iterations = TLI.getDivRefinementSteps(VT, MF); if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) { AddToWorklist(Est.getNode()); if (Iterations) { EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue FPOne = DAG.getConstantFP(1.0, DL, VT); // Newton iterations: Est = Est + Est (1 - Arg * Est) for (int i = 0; i < Iterations; ++i) { SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst, Flags); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); AddToWorklist(NewEst.getNode()); Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags); AddToWorklist(Est.getNode()); } } return Est; } return SDValue(); } /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) /// For the reciprocal sqrt, we need to find the zero of the function: /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] /// => /// X_{i+1} = X_i (1.5 - A X_i^2 / 2) /// As a result, we precompute A/2 prior to the iteration loop. SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations, SDNodeFlags Flags, bool Reciprocal) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT); // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that // this entire sequence requires only one FP constant. SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags); AddToWorklist(HalfArg.getNode()); HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags); AddToWorklist(HalfArg.getNode()); // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est) for (unsigned i = 0; i < Iterations; ++i) { SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags); AddToWorklist(NewEst.getNode()); NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags); AddToWorklist(NewEst.getNode()); Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags); AddToWorklist(Est.getNode()); } // If non-reciprocal square root is requested, multiply the result by Arg. if (!Reciprocal) { Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags); AddToWorklist(Est.getNode()); } return Est; } /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i) /// For the reciprocal sqrt, we need to find the zero of the function: /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)] /// => /// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0)) SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations, SDNodeFlags Flags, bool Reciprocal) { EVT VT = Arg.getValueType(); SDLoc DL(Arg); SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT); SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT); // This routine must enter the loop below to work correctly // when (Reciprocal == false). assert(Iterations > 0); // Newton iterations for reciprocal square root: // E = (E * -0.5) * ((A * E) * E + -3.0) for (unsigned i = 0; i < Iterations; ++i) { SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags); AddToWorklist(AE.getNode()); SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags); AddToWorklist(AEE.getNode()); SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags); AddToWorklist(RHS.getNode()); // When calculating a square root at the last iteration build: // S = ((A * E) * -0.5) * ((A * E) * E + -3.0) // (notice a common subexpression) SDValue LHS; if (Reciprocal || (i + 1) < Iterations) { // RSQRT: LHS = (E * -0.5) LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags); } else { // SQRT: LHS = (A * E) * -0.5 LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags); } AddToWorklist(LHS.getNode()); Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags); AddToWorklist(Est.getNode()); } return Est; } /// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case /// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if /// Op can be zero. SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Reciprocal) { if (Level >= AfterLegalizeDAG) return SDValue(); // TODO: Handle half and/or extended types? EVT VT = Op.getValueType(); if (VT.getScalarType() != MVT::f32 && VT.getScalarType() != MVT::f64) return SDValue(); // If estimates are explicitly disabled for this function, we're done. MachineFunction &MF = DAG.getMachineFunction(); int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF); if (Enabled == TLI.ReciprocalEstimate::Disabled) return SDValue(); // Estimates may be explicitly enabled for this type with a custom number of // refinement steps. int Iterations = TLI.getSqrtRefinementSteps(VT, MF); bool UseOneConstNR = false; if (SDValue Est = TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR, Reciprocal)) { AddToWorklist(Est.getNode()); if (Iterations) { Est = UseOneConstNR ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal) : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal); if (!Reciprocal) { // The estimate is now completely wrong if the input was exactly 0.0 or // possibly a denormal. Force the answer to 0.0 for those cases. EVT VT = Op.getValueType(); SDLoc DL(Op); EVT CCVT = getSetCCResultType(VT); ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; const Function &F = DAG.getMachineFunction().getFunction(); Attribute Denorms = F.getFnAttribute("denormal-fp-math"); if (Denorms.getValueAsString().equals("ieee")) { // fabs(X) < SmallestNormal ? 0.0 : Est const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT); APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem); SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT); SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op); SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT); Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est); AddToWorklist(Fabs.getNode()); AddToWorklist(IsDenorm.getNode()); AddToWorklist(Est.getNode()); } else { // X == 0.0 ? 0.0 : Est SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ); Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est); AddToWorklist(IsZero.getNode()); AddToWorklist(Est.getNode()); } } } return Est; } return SDValue(); } SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) { return buildSqrtEstimateImpl(Op, Flags, true); } SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) { return buildSqrtEstimateImpl(Op, Flags, false); } /// Return true if there is any possibility that the two addresses overlap. bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { // If they are the same then they must be aliases. if (Op0->getBasePtr() == Op1->getBasePtr()) return true; // If they are both volatile then they cannot be reordered. if (Op0->isVolatile() && Op1->isVolatile()) return true; // If one operation reads from invariant memory, and the other may store, they // cannot alias. These should really be checking the equivalent of mayWrite, // but it only matters for memory nodes other than load /store. if (Op0->isInvariant() && Op1->writeMem()) return false; if (Op1->isInvariant() && Op0->writeMem()) return false; unsigned NumBytes0 = Op0->getMemoryVT().getStoreSize(); unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize(); // Check for BaseIndexOffset matching. BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0, DAG); BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1, DAG); int64_t PtrDiff; if (BasePtr0.getBase().getNode() && BasePtr1.getBase().getNode()) { if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff)) return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0)); // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be // able to calculate their relative offset if at least one arises // from an alloca. However, these allocas cannot overlap and we // can infer there is no alias. if (auto *A = dyn_cast(BasePtr0.getBase())) if (auto *B = dyn_cast(BasePtr1.getBase())) { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); // If the base are the same frame index but the we couldn't find a // constant offset, (indices are different) be conservative. if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) || !MFI.isFixedObjectIndex(B->getIndex()))) return false; } bool IsFI0 = isa(BasePtr0.getBase()); bool IsFI1 = isa(BasePtr1.getBase()); bool IsGV0 = isa(BasePtr0.getBase()); bool IsGV1 = isa(BasePtr1.getBase()); bool IsCV0 = isa(BasePtr0.getBase()); bool IsCV1 = isa(BasePtr1.getBase()); // If of mismatched base types or checkable indices we can check // they do not alias. if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) || (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) && (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1)) return false; } // If we know required SrcValue1 and SrcValue2 have relatively large // alignment compared to the size and offset of the access, we may be able // to prove they do not alias. This check is conservative for now to catch // cases created by splitting vector types. int64_t SrcValOffset0 = Op0->getSrcValueOffset(); int64_t SrcValOffset1 = Op1->getSrcValueOffset(); unsigned OrigAlignment0 = Op0->getOriginalAlignment(); unsigned OrigAlignment1 = Op1->getOriginalAlignment(); if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 && NumBytes0 == NumBytes1 && OrigAlignment0 > NumBytes0) { int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0; int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1; // There is no overlap between these relatively aligned accesses of // similar size. Return no alias. if ((OffAlign0 + NumBytes0) <= OffAlign1 || (OffAlign1 + NumBytes1) <= OffAlign0) return false; } bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0 ? CombinerGlobalAA : DAG.getSubtarget().useAA(); #ifndef NDEBUG if (CombinerAAOnlyFunc.getNumOccurrences() && CombinerAAOnlyFunc != DAG.getMachineFunction().getName()) UseAA = false; #endif if (UseAA && AA && Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) { // Use alias analysis information. int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1); int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset; int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset; AliasResult AAResult = AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0, UseTBAA ? Op0->getAAInfo() : AAMDNodes()), MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1, UseTBAA ? Op1->getAAInfo() : AAMDNodes()) ); if (AAResult == NoAlias) return false; } // Otherwise we have to assume they alias. return true; } /// Walk up chain skipping non-aliasing memory nodes, /// looking for aliasing nodes and adding them to the Aliases vector. void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain, SmallVectorImpl &Aliases) { SmallVector Chains; // List of chains to visit. SmallPtrSet Visited; // Visited node set. // Get alias information for node. bool IsLoad = isa(N) && !cast(N)->isVolatile(); // Starting off. Chains.push_back(OriginalChain); unsigned Depth = 0; // Look at each chain and determine if it is an alias. If so, add it to the // aliases list. If not, then continue up the chain looking for the next // candidate. while (!Chains.empty()) { SDValue Chain = Chains.pop_back_val(); // For TokenFactor nodes, look at each operand and only continue up the // chain until we reach the depth limit. // // FIXME: The depth check could be made to return the last non-aliasing // chain we found before we hit a tokenfactor rather than the original // chain. if (Depth > TLI.getGatherAllAliasesMaxDepth()) { Aliases.clear(); Aliases.push_back(OriginalChain); return; } // Don't bother if we've been before. if (!Visited.insert(Chain.getNode()).second) continue; switch (Chain.getOpcode()) { case ISD::EntryToken: // Entry token is ideal chain operand, but handled in FindBetterChain. break; case ISD::LOAD: case ISD::STORE: { // Get alias information for Chain. bool IsOpLoad = isa(Chain.getNode()) && !cast(Chain.getNode())->isVolatile(); // If chain is alias then stop here. if (!(IsLoad && IsOpLoad) && isAlias(cast(N), cast(Chain.getNode()))) { Aliases.push_back(Chain); } else { // Look further up the chain. Chains.push_back(Chain.getOperand(0)); ++Depth; } break; } case ISD::TokenFactor: // We have to check each of the operands of the token factor for "small" // token factors, so we queue them up. Adding the operands to the queue // (stack) in reverse order maintains the original order and increases the // likelihood that getNode will find a matching token factor (CSE.) if (Chain.getNumOperands() > 16) { Aliases.push_back(Chain); break; } for (unsigned n = Chain.getNumOperands(); n;) Chains.push_back(Chain.getOperand(--n)); ++Depth; break; case ISD::CopyFromReg: // Forward past CopyFromReg. Chains.push_back(Chain.getOperand(0)); ++Depth; break; default: // For all other instructions we will just have to take what we can get. Aliases.push_back(Chain); break; } } } /// Walk up chain skipping non-aliasing memory nodes, looking for a better chain /// (aliasing node.) SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) { if (OptLevel == CodeGenOpt::None) return OldChain; // Ops for replacing token factor. SmallVector Aliases; // Accumulate all the aliases to this node. GatherAllAliases(N, OldChain, Aliases); // If no operands then chain to entry token. if (Aliases.size() == 0) return DAG.getEntryNode(); // If a single operand then chain to it. We don't need to revisit it. if (Aliases.size() == 1) return Aliases[0]; // Construct a custom tailored token factor. return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases); } // This function tries to collect a bunch of potentially interesting // nodes to improve the chains of, all at once. This might seem // redundant, as this function gets called when visiting every store // node, so why not let the work be done on each store as it's visited? // // I believe this is mainly important because MergeConsecutiveStores // is unable to deal with merging stores of different sizes, so unless // we improve the chains of all the potential candidates up-front // before running MergeConsecutiveStores, it might only see some of // the nodes that will eventually be candidates, and then not be able // to go from a partially-merged state to the desired final // fully-merged state. bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { if (OptLevel == CodeGenOpt::None) return false; // This holds the base pointer, index, and the offset in bytes from the base // pointer. BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG); // We must have a base and an offset. if (!BasePtr.getBase().getNode()) return false; // Do not handle stores to undef base pointers. if (BasePtr.getBase().isUndef()) return false; SmallVector ChainedStores; ChainedStores.push_back(St); // Walk up the chain and look for nodes with offsets from the same // base pointer. Stop when reaching an instruction with a different kind // or instruction which has a different base pointer. StoreSDNode *Index = St; while (Index) { // If the chain has more than one use, then we can't reorder the mem ops. if (Index != St && !SDValue(Index, 0)->hasOneUse()) break; if (Index->isVolatile() || Index->isIndexed()) break; // Find the base pointer and offset for this memory node. BaseIndexOffset Ptr = BaseIndexOffset::match(Index, DAG); // Check that the base pointer is the same as the original one. if (!BasePtr.equalBaseIndex(Ptr, DAG)) break; // Walk up the chain to find the next store node, ignoring any // intermediate loads. Any other kind of node will halt the loop. SDNode *NextInChain = Index->getChain().getNode(); while (true) { if (StoreSDNode *STn = dyn_cast(NextInChain)) { // We found a store node. Use it for the next iteration. if (STn->isVolatile() || STn->isIndexed()) { Index = nullptr; break; } ChainedStores.push_back(STn); Index = STn; break; } else if (LoadSDNode *Ldn = dyn_cast(NextInChain)) { NextInChain = Ldn->getChain().getNode(); continue; } else { Index = nullptr; break; } }// end while } // At this point, ChainedStores lists all of the Store nodes // reachable by iterating up through chain nodes matching the above // conditions. For each such store identified, try to find an // earlier chain to attach the store to which won't violate the // required ordering. bool MadeChangeToSt = false; SmallVector, 8> BetterChains; for (StoreSDNode *ChainedStore : ChainedStores) { SDValue Chain = ChainedStore->getChain(); SDValue BetterChain = FindBetterChain(ChainedStore, Chain); if (Chain != BetterChain) { if (ChainedStore == St) MadeChangeToSt = true; BetterChains.push_back(std::make_pair(ChainedStore, BetterChain)); } } // Do all replacements after finding the replacements to make to avoid making // the chains more complicated by introducing new TokenFactors. for (auto Replacement : BetterChains) replaceStoreChain(Replacement.first, Replacement.second); return MadeChangeToSt; } /// This is the entry point for the file. void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA, CodeGenOpt::Level OptLevel) { /// This is the main entry point to this class. DAGCombiner(*this, AA, OptLevel).Run(Level); } Index: head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 344055) +++ head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 344056) @@ -1,11574 +1,11576 @@ //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file implements the AArch64TargetLowering class. // //===----------------------------------------------------------------------===// #include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64PerfectShuffle.h" #include "AArch64RegisterInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetCallingConv.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include #include #include #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "aarch64-lower" STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumShiftInserts, "Number of vector shift inserts"); STATISTIC(NumOptimizedImms, "Number of times immediates were optimized"); static cl::opt EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden, cl::desc("Allow AArch64 SLI/SRI formation"), cl::init(false)); // FIXME: The necessary dtprel relocations don't seem to be supported // well in the GNU bfd and gold linkers at the moment. Therefore, by // default, for now, fall back to GeneralDynamic code generation. cl::opt EnableAArch64ELFLocalDynamicTLSGeneration( "aarch64-elf-ldtls-generation", cl::Hidden, cl::desc("Allow AArch64 Local Dynamic TLS code generation"), cl::init(false)); static cl::opt EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden, cl::desc("Enable AArch64 logical imm instruction " "optimization"), cl::init(true)); /// Value type used for condition codes. static const MVT MVT_CC = MVT::i32; AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI) : TargetLowering(TM), Subtarget(&STI) { // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so // we have to make something up. Arbitrarily, choose ZeroOrOne. setBooleanContents(ZeroOrOneBooleanContent); // When comparing vectors the result sets the different elements in the // vector to all-one or all-zero. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // Set up the register classes. addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass); addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass); if (Subtarget->hasFPARMv8()) { addRegisterClass(MVT::f16, &AArch64::FPR16RegClass); addRegisterClass(MVT::f32, &AArch64::FPR32RegClass); addRegisterClass(MVT::f64, &AArch64::FPR64RegClass); addRegisterClass(MVT::f128, &AArch64::FPR128RegClass); } if (Subtarget->hasNEON()) { addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass); addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass); // Someone set us up the NEON. addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); addDRTypeForNEON(MVT::v4i16); addDRTypeForNEON(MVT::v2i32); addDRTypeForNEON(MVT::v1i64); addDRTypeForNEON(MVT::v1f64); addDRTypeForNEON(MVT::v4f16); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); addQRTypeForNEON(MVT::v16i8); addQRTypeForNEON(MVT::v8i16); addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); addQRTypeForNEON(MVT::v8f16); } // Compute derived properties from the register classes computeRegisterProperties(Subtarget->getRegisterInfo()); // Provide all sorts of operation actions setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::i32, Custom); setOperationAction(ISD::SETCC, MVT::i64, Custom); setOperationAction(ISD::SETCC, MVT::f16, Custom); setOperationAction(ISD::SETCC, MVT::f32, Custom); setOperationAction(ISD::SETCC, MVT::f64, Custom); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); setOperationAction(ISD::BRCOND, MVT::Other, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::i64, Custom); setOperationAction(ISD::BR_CC, MVT::f16, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::i64, Custom); setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); setOperationAction(ISD::BR_JT, MVT::Other, Expand); setOperationAction(ISD::JumpTable, MVT::i64, Custom); setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom); setOperationAction(ISD::FREM, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f80, Expand); setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand); // Custom lowering hooks are needed for XOR // to fold it into CSINC/CSINV. setOperationAction(ISD::XOR, MVT::i32, Custom); setOperationAction(ISD::XOR, MVT::i64, Custom); // Virtually no operation on f128 is legal, but LLVM can't expand them when // there's a valid register class, so we need custom operations in most cases. setOperationAction(ISD::FABS, MVT::f128, Expand); setOperationAction(ISD::FADD, MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); setOperationAction(ISD::FCOS, MVT::f128, Expand); setOperationAction(ISD::FDIV, MVT::f128, Custom); setOperationAction(ISD::FMA, MVT::f128, Expand); setOperationAction(ISD::FMUL, MVT::f128, Custom); setOperationAction(ISD::FNEG, MVT::f128, Expand); setOperationAction(ISD::FPOW, MVT::f128, Expand); setOperationAction(ISD::FREM, MVT::f128, Expand); setOperationAction(ISD::FRINT, MVT::f128, Expand); setOperationAction(ISD::FSIN, MVT::f128, Expand); setOperationAction(ISD::FSINCOS, MVT::f128, Expand); setOperationAction(ISD::FSQRT, MVT::f128, Expand); setOperationAction(ISD::FSUB, MVT::f128, Custom); setOperationAction(ISD::FTRUNC, MVT::f128, Expand); setOperationAction(ISD::SETCC, MVT::f128, Custom); setOperationAction(ISD::BR_CC, MVT::f128, Custom); setOperationAction(ISD::SELECT, MVT::f128, Custom); setOperationAction(ISD::SELECT_CC, MVT::f128, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); // Lowering for many of the conversions is actually specified by the non-f128 // type. The LowerXXX function will be trivial when f128 isn't involved. setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); // Variable arguments. setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Custom); setOperationAction(ISD::VACOPY, MVT::Other, Custom); setOperationAction(ISD::VAEND, MVT::Other, Expand); // Variable-sized objects. setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); if (Subtarget->isTargetWindows()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom); else setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand); // Constant pool entries setOperationAction(ISD::ConstantPool, MVT::i64, Custom); // BlockAddress setOperationAction(ISD::BlockAddress, MVT::i64, Custom); // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences. setOperationAction(ISD::ADDC, MVT::i32, Custom); setOperationAction(ISD::ADDE, MVT::i32, Custom); setOperationAction(ISD::SUBC, MVT::i32, Custom); setOperationAction(ISD::SUBE, MVT::i32, Custom); setOperationAction(ISD::ADDC, MVT::i64, Custom); setOperationAction(ISD::ADDE, MVT::i64, Custom); setOperationAction(ISD::SUBC, MVT::i64, Custom); setOperationAction(ISD::SUBE, MVT::i64, Custom); // AArch64 lacks both left-rotate and popcount instructions. setOperationAction(ISD::ROTL, MVT::i32, Expand); setOperationAction(ISD::ROTL, MVT::i64, Expand); for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } // AArch64 doesn't have {U|S}MUL_LOHI. setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::CTPOP, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i64, Custom); setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::SDIVREM, MVT::i64, Expand); for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::SREM, MVT::i64, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); // Custom lower Add/Sub/Mul with overflow. setOperationAction(ISD::SADDO, MVT::i32, Custom); setOperationAction(ISD::SADDO, MVT::i64, Custom); setOperationAction(ISD::UADDO, MVT::i32, Custom); setOperationAction(ISD::UADDO, MVT::i64, Custom); setOperationAction(ISD::SSUBO, MVT::i32, Custom); setOperationAction(ISD::SSUBO, MVT::i64, Custom); setOperationAction(ISD::USUBO, MVT::i32, Custom); setOperationAction(ISD::USUBO, MVT::i64, Custom); setOperationAction(ISD::SMULO, MVT::i32, Custom); setOperationAction(ISD::SMULO, MVT::i64, Custom); setOperationAction(ISD::UMULO, MVT::i32, Custom); setOperationAction(ISD::UMULO, MVT::i64, Custom); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); if (Subtarget->hasFullFP16()) setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom); else setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote); setOperationAction(ISD::FREM, MVT::f16, Promote); setOperationAction(ISD::FREM, MVT::v4f16, Promote); setOperationAction(ISD::FREM, MVT::v8f16, Promote); setOperationAction(ISD::FPOW, MVT::f16, Promote); setOperationAction(ISD::FPOW, MVT::v4f16, Promote); setOperationAction(ISD::FPOW, MVT::v8f16, Promote); setOperationAction(ISD::FPOWI, MVT::f16, Promote); setOperationAction(ISD::FCOS, MVT::f16, Promote); setOperationAction(ISD::FCOS, MVT::v4f16, Promote); setOperationAction(ISD::FCOS, MVT::v8f16, Promote); setOperationAction(ISD::FSIN, MVT::f16, Promote); setOperationAction(ISD::FSIN, MVT::v4f16, Promote); setOperationAction(ISD::FSIN, MVT::v8f16, Promote); setOperationAction(ISD::FSINCOS, MVT::f16, Promote); setOperationAction(ISD::FSINCOS, MVT::v4f16, Promote); setOperationAction(ISD::FSINCOS, MVT::v8f16, Promote); setOperationAction(ISD::FEXP, MVT::f16, Promote); setOperationAction(ISD::FEXP, MVT::v4f16, Promote); setOperationAction(ISD::FEXP, MVT::v8f16, Promote); setOperationAction(ISD::FEXP2, MVT::f16, Promote); setOperationAction(ISD::FEXP2, MVT::v4f16, Promote); setOperationAction(ISD::FEXP2, MVT::v8f16, Promote); setOperationAction(ISD::FLOG, MVT::f16, Promote); setOperationAction(ISD::FLOG, MVT::v4f16, Promote); setOperationAction(ISD::FLOG, MVT::v8f16, Promote); setOperationAction(ISD::FLOG2, MVT::f16, Promote); setOperationAction(ISD::FLOG2, MVT::v4f16, Promote); setOperationAction(ISD::FLOG2, MVT::v8f16, Promote); setOperationAction(ISD::FLOG10, MVT::f16, Promote); setOperationAction(ISD::FLOG10, MVT::v4f16, Promote); setOperationAction(ISD::FLOG10, MVT::v8f16, Promote); if (!Subtarget->hasFullFP16()) { setOperationAction(ISD::SELECT, MVT::f16, Promote); setOperationAction(ISD::SELECT_CC, MVT::f16, Promote); setOperationAction(ISD::SETCC, MVT::f16, Promote); setOperationAction(ISD::BR_CC, MVT::f16, Promote); setOperationAction(ISD::FADD, MVT::f16, Promote); setOperationAction(ISD::FSUB, MVT::f16, Promote); setOperationAction(ISD::FMUL, MVT::f16, Promote); setOperationAction(ISD::FDIV, MVT::f16, Promote); setOperationAction(ISD::FMA, MVT::f16, Promote); setOperationAction(ISD::FNEG, MVT::f16, Promote); setOperationAction(ISD::FABS, MVT::f16, Promote); setOperationAction(ISD::FCEIL, MVT::f16, Promote); setOperationAction(ISD::FSQRT, MVT::f16, Promote); setOperationAction(ISD::FFLOOR, MVT::f16, Promote); setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote); setOperationAction(ISD::FRINT, MVT::f16, Promote); setOperationAction(ISD::FROUND, MVT::f16, Promote); setOperationAction(ISD::FTRUNC, MVT::f16, Promote); setOperationAction(ISD::FMINNUM, MVT::f16, Promote); setOperationAction(ISD::FMAXNUM, MVT::f16, Promote); setOperationAction(ISD::FMINNAN, MVT::f16, Promote); setOperationAction(ISD::FMAXNAN, MVT::f16, Promote); // promote v4f16 to v4f32 when that is known to be safe. setOperationAction(ISD::FADD, MVT::v4f16, Promote); setOperationAction(ISD::FSUB, MVT::v4f16, Promote); setOperationAction(ISD::FMUL, MVT::v4f16, Promote); setOperationAction(ISD::FDIV, MVT::v4f16, Promote); setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); setOperationAction(ISD::FABS, MVT::v4f16, Expand); setOperationAction(ISD::FNEG, MVT::v4f16, Expand); setOperationAction(ISD::FROUND, MVT::v4f16, Expand); setOperationAction(ISD::FMA, MVT::v4f16, Expand); setOperationAction(ISD::SETCC, MVT::v4f16, Expand); setOperationAction(ISD::BR_CC, MVT::v4f16, Expand); setOperationAction(ISD::SELECT, MVT::v4f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand); setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand); setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand); setOperationAction(ISD::FCEIL, MVT::v4f16, Expand); setOperationAction(ISD::FRINT, MVT::v4f16, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand); setOperationAction(ISD::FSQRT, MVT::v4f16, Expand); setOperationAction(ISD::FABS, MVT::v8f16, Expand); setOperationAction(ISD::FADD, MVT::v8f16, Expand); setOperationAction(ISD::FCEIL, MVT::v8f16, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand); setOperationAction(ISD::FDIV, MVT::v8f16, Expand); setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand); setOperationAction(ISD::FMA, MVT::v8f16, Expand); setOperationAction(ISD::FMUL, MVT::v8f16, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand); setOperationAction(ISD::FNEG, MVT::v8f16, Expand); setOperationAction(ISD::FROUND, MVT::v8f16, Expand); setOperationAction(ISD::FRINT, MVT::v8f16, Expand); setOperationAction(ISD::FSQRT, MVT::v8f16, Expand); setOperationAction(ISD::FSUB, MVT::v8f16, Expand); setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand); setOperationAction(ISD::SETCC, MVT::v8f16, Expand); setOperationAction(ISD::BR_CC, MVT::v8f16, Expand); setOperationAction(ISD::SELECT, MVT::v8f16, Expand); setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand); } // AArch64 has implementations of a lot of rounding-like FP operations. for (MVT Ty : {MVT::f32, MVT::f64}) { setOperationAction(ISD::FFLOOR, Ty, Legal); setOperationAction(ISD::FNEARBYINT, Ty, Legal); setOperationAction(ISD::FCEIL, Ty, Legal); setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); setOperationAction(ISD::FMINNUM, Ty, Legal); setOperationAction(ISD::FMAXNUM, Ty, Legal); setOperationAction(ISD::FMINNAN, Ty, Legal); setOperationAction(ISD::FMAXNAN, Ty, Legal); } if (Subtarget->hasFullFP16()) { setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal); setOperationAction(ISD::FFLOOR, MVT::f16, Legal); setOperationAction(ISD::FCEIL, MVT::f16, Legal); setOperationAction(ISD::FRINT, MVT::f16, Legal); setOperationAction(ISD::FTRUNC, MVT::f16, Legal); setOperationAction(ISD::FROUND, MVT::f16, Legal); setOperationAction(ISD::FMINNUM, MVT::f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); setOperationAction(ISD::FMINNAN, MVT::f16, Legal); setOperationAction(ISD::FMAXNAN, MVT::f16, Legal); } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { // Issue __sincos_stret if available. setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } else { setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { setOperationAction(ISD::ConstantFP, MVT::f32, Legal); setOperationAction(ISD::ConstantFP, MVT::f64, Legal); } // AArch64 does not have floating-point extending loads, i1 sign-extending // load, floating-point truncating stores, or v2i32->v2i16 truncating store. for (MVT VT : MVT::fp_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); } for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f128, MVT::f80, Expand); setTruncStoreAction(MVT::f128, MVT::f64, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f16, Expand); setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); // Indexed loads and stores are supported. for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, MVT::i8, Legal); setIndexedLoadAction(im, MVT::i16, Legal); setIndexedLoadAction(im, MVT::i32, Legal); setIndexedLoadAction(im, MVT::i64, Legal); setIndexedLoadAction(im, MVT::f64, Legal); setIndexedLoadAction(im, MVT::f32, Legal); setIndexedLoadAction(im, MVT::f16, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i64, Legal); setIndexedStoreAction(im, MVT::f64, Legal); setIndexedStoreAction(im, MVT::f32, Legal); setIndexedStoreAction(im, MVT::f16, Legal); } // Trap. setOperationAction(ISD::TRAP, MVT::Other, Legal); // We combine OR nodes for bitfield operations. setTargetDAGCombine(ISD::OR); // Vector add and sub nodes may conceal a high-half opportunity. // Also, try to fold ADD into CSINC/CSINV.. setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::XOR); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); if (Subtarget->supportsAddressTopByteIgnored()) setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); setTargetDAGCombine(ISD::GlobalAddress); // In case of strict alignment, avoid an excessive number of byte wide stores. MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemset = Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32; MaxGluedStoresPerMemcpy = 4; MaxStoresPerMemcpyOptSize = 4; MaxStoresPerMemcpy = Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16; MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4; setStackPointerRegisterToSaveRestore(AArch64::SP); setSchedulingPreference(Sched::Hybrid); EnableExtLdPromotion = true; // Set required alignment. setMinFunctionAlignment(2); // Set preferred alignments. setPrefFunctionAlignment(STI.getPrefFunctionAlignment()); setPrefLoopAlignment(STI.getPrefLoopAlignment()); // Only change the limit for entries in a jump table if specified by // the subtarget, but not at the command line. unsigned MaxJT = STI.getMaximumJumpTableSize(); if (MaxJT && getMaximumJumpTableSize() == 0) setMaximumJumpTableSize(MaxJT); setHasExtractBitsInsn(true); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); if (Subtarget->hasNEON()) { // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to // silliness like this: setOperationAction(ISD::FABS, MVT::v1f64, Expand); setOperationAction(ISD::FADD, MVT::v1f64, Expand); setOperationAction(ISD::FCEIL, MVT::v1f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand); setOperationAction(ISD::FCOS, MVT::v1f64, Expand); setOperationAction(ISD::FDIV, MVT::v1f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand); setOperationAction(ISD::FMA, MVT::v1f64, Expand); setOperationAction(ISD::FMUL, MVT::v1f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand); setOperationAction(ISD::FNEG, MVT::v1f64, Expand); setOperationAction(ISD::FPOW, MVT::v1f64, Expand); setOperationAction(ISD::FREM, MVT::v1f64, Expand); setOperationAction(ISD::FROUND, MVT::v1f64, Expand); setOperationAction(ISD::FRINT, MVT::v1f64, Expand); setOperationAction(ISD::FSIN, MVT::v1f64, Expand); setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand); setOperationAction(ISD::FSQRT, MVT::v1f64, Expand); setOperationAction(ISD::FSUB, MVT::v1f64, Expand); setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand); setOperationAction(ISD::SETCC, MVT::v1f64, Expand); setOperationAction(ISD::BR_CC, MVT::v1f64, Expand); setOperationAction(ISD::SELECT, MVT::v1f64, Expand); setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand); setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand); setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand); setOperationAction(ISD::MUL, MVT::v1i64, Expand); // AArch64 doesn't have a direct vector ->f32 conversion instructions for // elements smaller than i32, so promote the input to i32 first. setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32); // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16 // -> v8f16 conversions. setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32); setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32); setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32); // Similarly, there is no direct i32 -> f64 vector conversion instruction. setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom); // Or, direct i32 -> f16 vector conversion. Set it so custom, so the // conversion happens in two steps: v4i32 -> v4f32 -> v4f16 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); setOperationAction(ISD::CTTZ, MVT::v2i8, Expand); setOperationAction(ISD::CTTZ, MVT::v4i16, Expand); setOperationAction(ISD::CTTZ, MVT::v2i32, Expand); setOperationAction(ISD::CTTZ, MVT::v1i64, Expand); setOperationAction(ISD::CTTZ, MVT::v16i8, Expand); setOperationAction(ISD::CTTZ, MVT::v8i16, Expand); setOperationAction(ISD::CTTZ, MVT::v4i32, Expand); setOperationAction(ISD::CTTZ, MVT::v2i64, Expand); // AArch64 doesn't have MUL.2d: setOperationAction(ISD::MUL, MVT::v2i64, Expand); // Custom handling for some quad-vector types to detect MULL. setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); // Vector reductions for (MVT VT : MVT::integer_valuetypes()) { setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); } for (MVT VT : MVT::fp_valuetypes()) { setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom); } setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); // Likewise, narrowing and extending vector loads/stores aren't handled // directly. for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) { setOperationAction(ISD::MULHS, VT, Custom); setOperationAction(ISD::MULHU, VT, Custom); } else { setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); } setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } } // AArch64 has implementations of a lot of rounding-like FP operations. for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) { setOperationAction(ISD::FFLOOR, Ty, Legal); setOperationAction(ISD::FNEARBYINT, Ty, Legal); setOperationAction(ISD::FCEIL, Ty, Legal); setOperationAction(ISD::FRINT, Ty, Legal); setOperationAction(ISD::FTRUNC, Ty, Legal); setOperationAction(ISD::FROUND, Ty, Legal); } setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom); } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); } void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) { assert(VT.isVector() && "VT should be a vector type"); if (VT.isFloatingPoint()) { MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT(); setOperationPromotedToType(ISD::LOAD, VT, PromoteTo); setOperationPromotedToType(ISD::STORE, VT, PromoteTo); } // Mark vector float intrinsics as expand. if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) { setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); // But we do support custom-lowering for FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, VT, Custom); } setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::AND, VT, Custom); setOperationAction(ISD::OR, VT, Custom); setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); for (MVT InnerVT : MVT::all_valuetypes()) setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); // CNT supports only B element sizes. if (VT != MVT::v8i8 && VT != MVT::v16i8) setOperationAction(ISD::CTPOP, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); if (!VT.isFloatingPoint()) setOperationAction(ISD::ABS, VT, Legal); // [SU][MIN|MAX] are available for all NEON types apart from i64. if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); // F[MIN|MAX][NUM|NAN] are available for all FP NEON types. if (VT.isFloatingPoint() && (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16())) for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN, ISD::FMINNUM, ISD::FMAXNUM}) setOperationAction(Opcode, VT, Legal); if (Subtarget->isLittleEndian()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, VT, Legal); setIndexedStoreAction(im, VT, Legal); } } } void AArch64TargetLowering::addDRTypeForNEON(MVT VT) { addRegisterClass(VT, &AArch64::FPR64RegClass); addTypeForNEON(VT, MVT::v2i32); } void AArch64TargetLowering::addQRTypeForNEON(MVT VT) { addRegisterClass(VT, &AArch64::FPR128RegClass); addTypeForNEON(VT, MVT::v4i32); } EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &, EVT VT) const { if (!VT.isVector()) return MVT::i32; return VT.changeVectorElementTypeToInteger(); } static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc) { uint64_t OldImm = Imm, NewImm, Enc; uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask; // Return if the immediate is already all zeros, all ones, a bimm32 or a // bimm64. if (Imm == 0 || Imm == Mask || AArch64_AM::isLogicalImmediate(Imm & Mask, Size)) return false; unsigned EltSize = Size; uint64_t DemandedBits = Demanded.getZExtValue(); // Clear bits that are not demanded. Imm &= DemandedBits; while (true) { // The goal here is to set the non-demanded bits in a way that minimizes // the number of switching between 0 and 1. In order to achieve this goal, // we set the non-demanded bits to the value of the preceding demanded bits. // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a // non-demanded bit), we copy bit0 (1) to the least significant 'x', // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'. // The final result is 0b11000011. uint64_t NonDemandedBits = ~DemandedBits; uint64_t InvertedImm = ~Imm & DemandedBits; uint64_t RotatedImm = ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) & NonDemandedBits; uint64_t Sum = RotatedImm + NonDemandedBits; bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1)); uint64_t Ones = (Sum + Carry) & NonDemandedBits; NewImm = (Imm | Ones) & Mask; // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate // or all-ones or all-zeros, in which case we can stop searching. Otherwise, // we halve the element size and continue the search. if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask))) break; // We cannot shrink the element size any further if it is 2-bits. if (EltSize == 2) return false; EltSize /= 2; Mask >>= EltSize; uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize; // Return if there is mismatch in any of the demanded bits of Imm and Hi. if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0) return false; // Merge the upper and lower halves of Imm and DemandedBits. Imm |= Hi; DemandedBits |= DemandedBitsHi; } ++NumOptimizedImms; // Replicate the element across the register width. while (EltSize < Size) { NewImm |= NewImm << EltSize; EltSize *= 2; } (void)OldImm; assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 && "demanded bits should never be altered"); assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm"); // Create the new constant immediate node. EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue New; // If the new constant immediate is all-zeros or all-ones, let the target // independent DAG combine optimize this node. if (NewImm == 0 || NewImm == OrigMask) { New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), TLO.DAG.getConstant(NewImm, DL, VT)); // Otherwise, create a machine node so that target independent DAG combine // doesn't undo this optimization. } else { Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); New = SDValue( TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); } return TLO.CombineTo(Op, New); } bool AArch64TargetLowering::targetShrinkDemandedConstant( SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const { // Delay this optimization to as late as possible. if (!TLO.LegalOps) return false; if (!EnableOptimizeLogicalImm) return false; EVT VT = Op.getValueType(); if (VT.isVector()) return false; unsigned Size = VT.getSizeInBits(); assert((Size == 32 || Size == 64) && "i32 or i64 is expected after legalization."); // Exit early if we demand all bits. if (Demanded.countPopulation() == Size) return false; unsigned NewOpc; switch (Op.getOpcode()) { default: return false; case ISD::AND: NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri; break; case ISD::OR: NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri; break; case ISD::XOR: NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri; break; } ConstantSDNode *C = dyn_cast(Op.getOperand(1)); if (!C) return false; uint64_t Imm = C->getZExtValue(); return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc); } /// computeKnownBitsForTargetNode - Determine which of the bits specified in /// Mask are known to be either zero or one and return them Known. void AArch64TargetLowering::computeKnownBitsForTargetNode( const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { switch (Op.getOpcode()) { default: break; case AArch64ISD::CSEL: { KnownBits Known2; DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1); DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1); Known.Zero &= Known2.Zero; Known.One &= Known2.One; break; } case ISD::INTRINSIC_W_CHAIN: { ConstantSDNode *CN = cast(Op->getOperand(1)); Intrinsic::ID IntID = static_cast(CN->getZExtValue()); switch (IntID) { default: return; case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { unsigned BitWidth = Known.getBitWidth(); EVT VT = cast(Op)->getMemoryVT(); unsigned MemBits = VT.getScalarSizeInBits(); Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); return; } } break; } case ISD::INTRINSIC_WO_CHAIN: case ISD::INTRINSIC_VOID: { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); switch (IntNo) { default: break; case Intrinsic::aarch64_neon_umaxv: case Intrinsic::aarch64_neon_uminv: { // Figure out the datatype of the vector operand. The UMINV instruction // will zero extend the result, so we can mark as known zero all the // bits larger than the element datatype. 32-bit or larget doesn't need // this as those are legal types and will be handled by isel directly. MVT VT = Op.getOperand(1).getValueType().getSimpleVT(); unsigned BitWidth = Known.getBitWidth(); if (VT == MVT::v8i8 || VT == MVT::v16i8) { assert(BitWidth >= 8 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8); Known.Zero |= Mask; } else if (VT == MVT::v4i16 || VT == MVT::v8i16) { assert(BitWidth >= 16 && "Unexpected width!"); APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16); Known.Zero |= Mask; } break; } break; } } } } MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL, EVT) const { return MVT::i64; } bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, bool *Fast) const { if (Subtarget->requiresStrictAlign()) return false; if (Fast) { // Some CPUs are fine with unaligned stores except for 128-bit ones. *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 || // See comments in performSTORECombine() for more details about // these conditions. // Code that uses clang vector extensions can mark that it // wants unaligned accesses to be treated as fast by // underspecifying alignment to be 1 or 2. Align <= 2 || // Disregard v2i64. Memcpy lowering produces those and splitting // them regresses performance on micro-benchmarks and olden/bh. VT == MVT::v2i64; } return true; } FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return AArch64::createFastISel(funcInfo, libInfo); } const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; case AArch64ISD::CALL: return "AArch64ISD::CALL"; case AArch64ISD::ADRP: return "AArch64ISD::ADRP"; case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow"; case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot"; case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG"; case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND"; case AArch64ISD::CSEL: return "AArch64ISD::CSEL"; case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL"; case AArch64ISD::CSINV: return "AArch64ISD::CSINV"; case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG"; case AArch64ISD::CSINC: return "AArch64ISD::CSINC"; case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER"; case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ"; case AArch64ISD::ADC: return "AArch64ISD::ADC"; case AArch64ISD::SBC: return "AArch64ISD::SBC"; case AArch64ISD::ADDS: return "AArch64ISD::ADDS"; case AArch64ISD::SUBS: return "AArch64ISD::SUBS"; case AArch64ISD::ADCS: return "AArch64ISD::ADCS"; case AArch64ISD::SBCS: return "AArch64ISD::SBCS"; case AArch64ISD::ANDS: return "AArch64ISD::ANDS"; case AArch64ISD::CCMP: return "AArch64ISD::CCMP"; case AArch64ISD::CCMN: return "AArch64ISD::CCMN"; case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP"; case AArch64ISD::FCMP: return "AArch64ISD::FCMP"; case AArch64ISD::DUP: return "AArch64ISD::DUP"; case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8"; case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16"; case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32"; case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64"; case AArch64ISD::MOVI: return "AArch64ISD::MOVI"; case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift"; case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit"; case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl"; case AArch64ISD::FMOV: return "AArch64ISD::FMOV"; case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift"; case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl"; case AArch64ISD::BICi: return "AArch64ISD::BICi"; case AArch64ISD::ORRi: return "AArch64ISD::ORRi"; case AArch64ISD::BSL: return "AArch64ISD::BSL"; case AArch64ISD::NEG: return "AArch64ISD::NEG"; case AArch64ISD::EXTR: return "AArch64ISD::EXTR"; case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1"; case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2"; case AArch64ISD::UZP1: return "AArch64ISD::UZP1"; case AArch64ISD::UZP2: return "AArch64ISD::UZP2"; case AArch64ISD::TRN1: return "AArch64ISD::TRN1"; case AArch64ISD::TRN2: return "AArch64ISD::TRN2"; case AArch64ISD::REV16: return "AArch64ISD::REV16"; case AArch64ISD::REV32: return "AArch64ISD::REV32"; case AArch64ISD::REV64: return "AArch64ISD::REV64"; case AArch64ISD::EXT: return "AArch64ISD::EXT"; case AArch64ISD::VSHL: return "AArch64ISD::VSHL"; case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR"; case AArch64ISD::VASHR: return "AArch64ISD::VASHR"; case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ"; case AArch64ISD::CMGE: return "AArch64ISD::CMGE"; case AArch64ISD::CMGT: return "AArch64ISD::CMGT"; case AArch64ISD::CMHI: return "AArch64ISD::CMHI"; case AArch64ISD::CMHS: return "AArch64ISD::CMHS"; case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ"; case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE"; case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT"; case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz"; case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz"; case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz"; case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz"; case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz"; case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz"; case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz"; case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz"; case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz"; case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz"; case AArch64ISD::SADDV: return "AArch64ISD::SADDV"; case AArch64ISD::UADDV: return "AArch64ISD::UADDV"; case AArch64ISD::SMINV: return "AArch64ISD::SMINV"; case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; case AArch64ISD::NOT: return "AArch64ISD::NOT"; case AArch64ISD::BIT: return "AArch64ISD::BIT"; case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ"; case AArch64ISD::TBZ: return "AArch64ISD::TBZ"; case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ"; case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN"; case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH"; case AArch64ISD::SITOF: return "AArch64ISD::SITOF"; case AArch64ISD::UITOF: return "AArch64ISD::UITOF"; case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST"; case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I"; case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I"; case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I"; case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I"; case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I"; case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge"; case AArch64ISD::LD2post: return "AArch64ISD::LD2post"; case AArch64ISD::LD3post: return "AArch64ISD::LD3post"; case AArch64ISD::LD4post: return "AArch64ISD::LD4post"; case AArch64ISD::ST2post: return "AArch64ISD::ST2post"; case AArch64ISD::ST3post: return "AArch64ISD::ST3post"; case AArch64ISD::ST4post: return "AArch64ISD::ST4post"; case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post"; case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post"; case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post"; case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post"; case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post"; case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post"; case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost"; case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost"; case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost"; case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost"; case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost"; case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost"; case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost"; case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost"; case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost"; case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost"; case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost"; case AArch64ISD::SMULL: return "AArch64ISD::SMULL"; case AArch64ISD::UMULL: return "AArch64ISD::UMULL"; case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE"; case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS"; case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE"; case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS"; } return nullptr; } MachineBasicBlock * AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *MBB) const { // We materialise the F128CSEL pseudo-instruction as some control flow and a // phi node: // OrigBB: // [... previous instrs leading to comparison ...] // b.ne TrueBB // b EndBB // TrueBB: // ; Fallthrough // EndBB: // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI.getDebugLoc(); MachineFunction::iterator It = ++MBB->getIterator(); unsigned DestReg = MI.getOperand(0).getReg(); unsigned IfTrueReg = MI.getOperand(1).getReg(); unsigned IfFalseReg = MI.getOperand(2).getReg(); unsigned CondCode = MI.getOperand(3).getImm(); bool NZCVKilled = MI.getOperand(4).isKill(); MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, TrueBB); MF->insert(It, EndBB); // Transfer rest of current basic-block to EndBB EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndBB->transferSuccessorsAndUpdatePHIs(MBB); BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MBB->addSuccessor(TrueBB); MBB->addSuccessor(EndBB); // TrueBB falls through to the end. TrueBB->addSuccessor(EndBB); if (!NZCVKilled) { TrueBB->addLiveIn(AArch64::NZCV); EndBB->addLiveIn(AArch64::NZCV); } BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) .addReg(IfTrueReg) .addMBB(TrueBB) .addReg(IfFalseReg) .addMBB(MBB); MI.eraseFromParent(); return EndBB; } MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { switch (MI.getOpcode()) { default: #ifndef NDEBUG MI.dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); } } //===----------------------------------------------------------------------===// // AArch64 Lowering private implementation. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 /// CC static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { switch (CC) { default: llvm_unreachable("Unknown condition code!"); case ISD::SETNE: return AArch64CC::NE; case ISD::SETEQ: return AArch64CC::EQ; case ISD::SETGT: return AArch64CC::GT; case ISD::SETGE: return AArch64CC::GE; case ISD::SETLT: return AArch64CC::LT; case ISD::SETLE: return AArch64CC::LE; case ISD::SETUGT: return AArch64CC::HI; case ISD::SETUGE: return AArch64CC::HS; case ISD::SETULT: return AArch64CC::LO; case ISD::SETULE: return AArch64CC::LS; } } /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2) { CondCode2 = AArch64CC::AL; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: case ISD::SETOEQ: CondCode = AArch64CC::EQ; break; case ISD::SETGT: case ISD::SETOGT: CondCode = AArch64CC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = AArch64CC::GE; break; case ISD::SETOLT: CondCode = AArch64CC::MI; break; case ISD::SETOLE: CondCode = AArch64CC::LS; break; case ISD::SETONE: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GT; break; case ISD::SETO: CondCode = AArch64CC::VC; break; case ISD::SETUO: CondCode = AArch64CC::VS; break; case ISD::SETUEQ: CondCode = AArch64CC::EQ; CondCode2 = AArch64CC::VS; break; case ISD::SETUGT: CondCode = AArch64CC::HI; break; case ISD::SETUGE: CondCode = AArch64CC::PL; break; case ISD::SETLT: case ISD::SETULT: CondCode = AArch64CC::LT; break; case ISD::SETLE: case ISD::SETULE: CondCode = AArch64CC::LE; break; case ISD::SETNE: case ISD::SETUNE: CondCode = AArch64CC::NE; break; } } /// Convert a DAG fp condition code to an AArch64 CC. /// This differs from changeFPCCToAArch64CC in that it returns cond codes that /// should be AND'ed instead of OR'ed. static void changeFPCCToANDAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2) { CondCode2 = AArch64CC::AL; switch (CC) { default: changeFPCCToAArch64CC(CC, CondCode, CondCode2); assert(CondCode2 == AArch64CC::AL); break; case ISD::SETONE: // (a one b) // == ((a olt b) || (a ogt b)) // == ((a ord b) && (a une b)) CondCode = AArch64CC::VC; CondCode2 = AArch64CC::NE; break; case ISD::SETUEQ: // (a ueq b) // == ((a uno b) || (a oeq b)) // == ((a ule b) && (a uge b)) CondCode = AArch64CC::PL; CondCode2 = AArch64CC::LE; break; } } /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 /// CC usable with the vector instructions. Fewer operations are available /// without a real NZCV register, so we have to use less efficient combinations /// to get the same effect. static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert) { Invert = false; switch (CC) { default: // Mostly the scalar mappings work fine. changeFPCCToAArch64CC(CC, CondCode, CondCode2); break; case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; case ISD::SETO: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GE; break; case ISD::SETUEQ: case ISD::SETULT: case ISD::SETULE: case ISD::SETUGT: case ISD::SETUGE: // All of the compare-mask comparisons are ordered, but we can switch // between the two by a double inversion. E.g. ULE == !OGT. Invert = true; changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); break; } } static bool isLegalArithImmed(uint64_t C) { // Matches AArch64DAGToDAGISel::SelectArithImmed(). bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); LLVM_DEBUG(dbgs() << "Is imm " << C << " legal: " << (IsLegal ? "yes\n" : "no\n")); return IsLegal; } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); const bool FullFP16 = static_cast(DAG.getSubtarget()).hasFullFP16(); if (VT.isFloatingPoint()) { assert(VT != MVT::f128); if (VT == MVT::f16 && !FullFP16) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); VT = MVT::f32; } return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); } // The CMP instruction is just an alias for SUBS, and representing it as // SUBS means that it's possible to get CSE with subtract operations. // A later phase can perform the optimization of setting the destination // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags // can be set differently by this operation. It comes down to whether // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then // everything is fine. If not then the optimization is wrong. Thus general // comparisons are only valid if op2 != 0. // So, finally, the only LLVM-native comparisons that don't mention C and V // are SETEQ and SETNE. They're the only ones we can safely use CMN for in // the absence of information about op2. Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) { // Similarly, (CMP (and X, Y), 0) can be implemented with a TST // (a.k.a. ANDS) except that the flags are only guaranteed to work for one // of the signed comparisons. Opcode = AArch64ISD::ANDS; RHS = LHS.getOperand(1); LHS = LHS.getOperand(0); } return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS) .getValue(1); } /// \defgroup AArch64CCMP CMP;CCMP matching /// /// These functions deal with the formation of CMP;CCMP;... sequences. /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of /// a comparison. They set the NZCV flags to a predefined value if their /// predicate is false. This allows to express arbitrary conjunctions, for /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))" /// expressed as: /// cmp A /// ccmp B, inv(CB), CA /// check for CB flags /// /// This naturally lets us implement chains of AND operations with SETCC /// operands. And we can even implement some other situations by transforming /// them: /// - We can implement (NEG SETCC) i.e. negating a single comparison by /// negating the flags used in a CCMP/FCCMP operations. /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations /// by negating the flags we test for afterwards. i.e. /// NEG (CMP CCMP CCCMP ...) can be implemented. /// - Note that we can only ever negate all previously processed results. /// What we can not implement by flipping the flags to test is a negation /// of two sub-trees (because the negation affects all sub-trees emitted so /// far, so the 2nd sub-tree we emit would also affect the first). /// With those tools we can implement some OR operations: /// - (OR (SETCC A) (SETCC B)) can be implemented via: /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B))) /// - After transforming OR to NEG/AND combinations we may be able to use NEG /// elimination rules from earlier to implement the whole thing as a /// CCMP/FCCMP chain. /// /// As complete example: /// or (or (setCA (cmp A)) (setCB (cmp B))) /// (and (setCC (cmp C)) (setCD (cmp D)))" /// can be reassociated to: /// or (and (setCC (cmp C)) setCD (cmp D)) // (or (setCA (cmp A)) (setCB (cmp B))) /// can be transformed to: /// not (and (not (and (setCC (cmp C)) (setCD (cmp D)))) /// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))" /// which can be implemented as: /// cmp C /// ccmp D, inv(CD), CC /// ccmp A, CA, inv(CD) /// ccmp B, CB, inv(CA) /// check for CB flags /// /// A counterexample is "or (and A B) (and C D)" which translates to /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we /// can only implement 1 of the inner (not) operations, but not both! /// @{ /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate. static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue CCOp, AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC, const SDLoc &DL, SelectionDAG &DAG) { unsigned Opcode = 0; const bool FullFP16 = static_cast(DAG.getSubtarget()).hasFullFP16(); if (LHS.getValueType().isFloatingPoint()) { assert(LHS.getValueType() != MVT::f128); if (LHS.getValueType() == MVT::f16 && !FullFP16) { LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS); } Opcode = AArch64ISD::FCCMP; } else if (RHS.getOpcode() == ISD::SUB) { SDValue SubOp0 = RHS.getOperand(0); if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // See emitComparison() on why we can only do this for SETEQ and SETNE. Opcode = AArch64ISD::CCMN; RHS = RHS.getOperand(1); } } if (Opcode == 0) Opcode = AArch64ISD::CCMP; SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC); AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC); unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC); SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32); return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp); } /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be /// expressed as a conjunction. See \ref AArch64CCMP. /// \param CanNegate Set to true if we can negate the whole sub-tree just by /// changing the conditions on the SETCC tests. /// (this means we can call emitConjunctionRec() with /// Negate==true on this sub-tree) /// \param MustBeFirst Set to true if this subtree needs to be negated and we /// cannot do the negation naturally. We are required to /// emit the subtree first in this case. /// \param WillNegate Is true if are called when the result of this /// subexpression must be negated. This happens when the /// outer expression is an OR. We can use this fact to know /// that we have a double negation (or (or ...) ...) that /// can be implemented for free. static bool canEmitConjunction(const SDValue Val, bool &CanNegate, bool &MustBeFirst, bool WillNegate, unsigned Depth = 0) { if (!Val.hasOneUse()) return false; unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { if (Val->getOperand(0).getValueType() == MVT::f128) return false; CanNegate = true; MustBeFirst = false; return true; } // Protect against exponential runtime and stack overflow. if (Depth > 6) return false; if (Opcode == ISD::AND || Opcode == ISD::OR) { bool IsOR = Opcode == ISD::OR; SDValue O0 = Val->getOperand(0); SDValue O1 = Val->getOperand(1); bool CanNegateL; bool MustBeFirstL; if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1)) return false; bool CanNegateR; bool MustBeFirstR; if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1)) return false; if (MustBeFirstL && MustBeFirstR) return false; if (IsOR) { // For an OR expression we need to be able to naturally negate at least // one side or we cannot do the transformation at all. if (!CanNegateL && !CanNegateR) return false; // If we the result of the OR will be negated and we can naturally negate // the leafs, then this sub-tree as a whole negates naturally. CanNegate = WillNegate && CanNegateL && CanNegateR; // If we cannot naturally negate the whole sub-tree, then this must be // emitted first. MustBeFirst = !CanNegate; } else { assert(Opcode == ISD::AND && "Must be OR or AND"); // We cannot naturally negate an AND operation. CanNegate = false; MustBeFirst = MustBeFirstL || MustBeFirstR; } return true; } return false; } /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain /// of CCMP/CFCMP ops. See @ref AArch64CCMP. /// Tries to transform the given i1 producing node @p Val to a series compare /// and conditional compare operations. @returns an NZCV flags producing node /// and sets @p OutCC to the flags that should be tested or returns SDValue() if /// transformation was not possible. /// \p Negate is true if we want this sub-tree being negated just by changing /// SETCC conditions. static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp, AArch64CC::CondCode Predicate) { // We're at a tree leaf, produce a conditional comparison operation. unsigned Opcode = Val->getOpcode(); if (Opcode == ISD::SETCC) { SDValue LHS = Val->getOperand(0); SDValue RHS = Val->getOperand(1); ISD::CondCode CC = cast(Val->getOperand(2))->get(); bool isInteger = LHS.getValueType().isInteger(); if (Negate) CC = getSetCCInverse(CC, isInteger); SDLoc DL(Val); // Determine OutCC and handle FP special case. if (isInteger) { OutCC = changeIntCCToAArch64CC(CC); } else { assert(LHS.getValueType().isFloatingPoint()); AArch64CC::CondCode ExtraCC; changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC); // Some floating point conditions can't be tested with a single condition // code. Construct an additional comparison in this case. if (ExtraCC != AArch64CC::AL) { SDValue ExtraCmp; if (!CCOp.getNode()) ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG); else ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, ExtraCC, DL, DAG); CCOp = ExtraCmp; Predicate = ExtraCC; } } // Produce a normal comparison if we are first in the chain if (!CCOp) return emitComparison(LHS, RHS, CC, DL, DAG); // Otherwise produce a ccmp. return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL, DAG); } assert(Val->hasOneUse() && "Valid conjunction/disjunction tree"); bool IsOR = Opcode == ISD::OR; SDValue LHS = Val->getOperand(0); bool CanNegateL; bool MustBeFirstL; bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR); assert(ValidL && "Valid conjunction/disjunction tree"); (void)ValidL; SDValue RHS = Val->getOperand(1); bool CanNegateR; bool MustBeFirstR; bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR); assert(ValidR && "Valid conjunction/disjunction tree"); (void)ValidR; // Swap sub-tree that must come first to the right side. if (MustBeFirstL) { assert(!MustBeFirstR && "Valid conjunction/disjunction tree"); std::swap(LHS, RHS); std::swap(CanNegateL, CanNegateR); std::swap(MustBeFirstL, MustBeFirstR); } bool NegateR; bool NegateAfterR; bool NegateL; bool NegateAfterAll; if (Opcode == ISD::OR) { // Swap the sub-tree that we can negate naturally to the left. if (!CanNegateL) { assert(CanNegateR && "at least one side must be negatable"); assert(!MustBeFirstR && "invalid conjunction/disjunction tree"); assert(!Negate); std::swap(LHS, RHS); NegateR = false; NegateAfterR = true; } else { // Negate the left sub-tree if possible, otherwise negate the result. NegateR = CanNegateR; NegateAfterR = !CanNegateR; } NegateL = true; NegateAfterAll = !Negate; } else { assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree"); assert(!Negate && "Valid conjunction/disjunction tree"); NegateL = false; NegateR = false; NegateAfterR = false; NegateAfterAll = false; } // Emit sub-trees. AArch64CC::CondCode RHSCC; SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate); if (NegateAfterR) RHSCC = AArch64CC::getInvertedCondCode(RHSCC); SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC); if (NegateAfterAll) OutCC = AArch64CC::getInvertedCondCode(OutCC); return CmpL; } /// Emit expression as a conjunction (a series of CCMP/CFCMP ops). /// In some cases this is even possible with OR operations in the expression. /// See \ref AArch64CCMP. /// \see emitConjunctionRec(). static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val, AArch64CC::CondCode &OutCC) { bool DummyCanNegate; bool DummyMustBeFirst; if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false)) return SDValue(); return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL); } /// @} static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, const SDLoc &dl) { if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); if (!isLegalArithImmed(C)) { // Constant does not fit, try adjusting it by one? switch (CC) { default: break; case ISD::SETLT: case ISD::SETGE: if ((VT == MVT::i32 && C != 0x80000000 && isLegalArithImmed((uint32_t)(C - 1))) || (VT == MVT::i64 && C != 0x80000000ULL && isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETULT: case ISD::SETUGE: if ((VT == MVT::i32 && C != 0 && isLegalArithImmed((uint32_t)(C - 1))) || (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETLE: case ISD::SETGT: if ((VT == MVT::i32 && C != INT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || (VT == MVT::i64 && C != INT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; RHS = DAG.getConstant(C, dl, VT); } break; case ISD::SETULE: case ISD::SETUGT: if ((VT == MVT::i32 && C != UINT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || (VT == MVT::i64 && C != UINT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; RHS = DAG.getConstant(C, dl, VT); } break; } } } SDValue Cmp; AArch64CC::CondCode AArch64CC; if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa(RHS)) { const ConstantSDNode *RHSC = cast(RHS); // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. // For the i8 operand, the largest immediate is 255, so this can be easily // encoded in the compare instruction. For the i16 operand, however, the // largest immediate cannot be encoded in the compare. // Therefore, use a sign extending load and cmn to avoid materializing the // -1 constant. For example, // movz w1, #65535 // ldrh w0, [x0, #0] // cmp w0, w1 // > // ldrsh w0, [x0, #0] // cmn w0, #1 // Fundamental, we're relying on the property that (zext LHS) == (zext RHS) // if and only if (sext LHS) == (sext RHS). The checks are in place to // ensure both the LHS and RHS are truly zero extended and to make sure the // transformation is profitable. if ((RHSC->getZExtValue() >> 16 == 0) && isa(LHS) && cast(LHS)->getExtensionType() == ISD::ZEXTLOAD && cast(LHS)->getMemoryVT() == MVT::i16 && LHS.getNode()->hasNUsesOfValue(1, 0)) { int16_t ValueofRHS = cast(RHS)->getZExtValue(); if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, DAG.getValueType(MVT::i16)); Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl, RHS.getValueType()), CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); } } if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) { if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) { if ((CC == ISD::SETNE) ^ RHSC->isNullValue()) AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC); } } } if (!Cmp) { Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); } AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC); return Cmp; } static std::pair getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && "Unsupported value type"); SDValue Value, Overflow; SDLoc DL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); unsigned Opc = 0; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown overflow instruction!"); case ISD::SADDO: Opc = AArch64ISD::ADDS; CC = AArch64CC::VS; break; case ISD::UADDO: Opc = AArch64ISD::ADDS; CC = AArch64CC::HS; break; case ISD::SSUBO: Opc = AArch64ISD::SUBS; CC = AArch64CC::VS; break; case ISD::USUBO: Opc = AArch64ISD::SUBS; CC = AArch64CC::LO; break; // Multiply needs a little bit extra work. case ISD::SMULO: case ISD::UMULO: { CC = AArch64CC::NE; bool IsSigned = Op.getOpcode() == ISD::SMULO; if (Op.getValueType() == MVT::i32) { unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; // For a 32 bit multiply with overflow check we want the instruction // selector to generate a widening multiply (SMADDL/UMADDL). For that we // need to generate the following pattern: // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, DAG.getConstant(0, DL, MVT::i64)); // On AArch64 the upper 32 bits are always zero extended for a 32 bit // operation. We need to clear out the upper 32 bits, because we used a // widening multiply that wrote all 64 bits. In the end this should be a // noop. Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); if (IsSigned) { // The signed overflow check requires more than just a simple check for // any bit set in the upper 32 bits of the result. These bits could be // just the sign bits of a negative number. To perform the overflow // check we have to arithmetic shift right the 32nd bit of the result by // 31 bits. Then we compare the result to the upper 32 bits. SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, DAG.getConstant(32, DL, MVT::i64)); UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, DAG.getConstant(31, DL, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) .getValue(1); } else { // The overflow check for unsigned multiply is easy. We only need to // check if any of the upper 32 bits are set. This can be done with a // CMP (shifted register). For that we need to generate the following // pattern: // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, DAG.getConstant(32, DL, MVT::i64)); SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, DL, MVT::i64), UpperBits).getValue(1); } break; } assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); // For the 64 bit multiply Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); if (IsSigned) { SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, DAG.getConstant(63, DL, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) .getValue(1); } else { SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, DL, MVT::i64), UpperBits).getValue(1); } break; } } // switch (...) if (Opc) { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); // Emit the AArch64 operation with overflow check. Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); Overflow = Value.getValue(1); } return std::make_pair(Value, Overflow); } SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first; } // Returns true if the given Op is the overflow flag result of an overflow // intrinsic operation. static bool isOverflowIntrOpRes(SDValue Op) { unsigned Opc = Op.getOpcode(); return (Op.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)); } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { SDValue Sel = Op.getOperand(0); SDValue Other = Op.getOperand(1); SDLoc dl(Sel); // If the operand is an overflow checking operation, invert the condition // code and kill the Not operation. I.e., transform: // (xor (overflow_op_bool, 1)) // --> // (csel 1, 0, invert(cc), overflow_op_bool) // ... which later gets transformed to just a cset instruction with an // inverted condition code, rather than a cset + eor sequence. if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0))) return SDValue(); SDValue TVal = DAG.getConstant(1, dl, MVT::i32); SDValue FVal = DAG.getConstant(0, dl, MVT::i32); AArch64CC::CondCode CC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG); SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal, CCVal, Overflow); } // If neither operand is a SELECT_CC, give up. if (Sel.getOpcode() != ISD::SELECT_CC) std::swap(Sel, Other); if (Sel.getOpcode() != ISD::SELECT_CC) return Op; // The folding we want to perform is: // (xor x, (select_cc a, b, cc, 0, -1) ) // --> // (csel x, (xor x, -1), cc ...) // // The latter will get matched to a CSINV instruction. ISD::CondCode CC = cast(Sel.getOperand(4))->get(); SDValue LHS = Sel.getOperand(0); SDValue RHS = Sel.getOperand(1); SDValue TVal = Sel.getOperand(2); SDValue FVal = Sel.getOperand(3); // FIXME: This could be generalized to non-integer comparisons. if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return Op; ConstantSDNode *CFVal = dyn_cast(FVal); ConstantSDNode *CTVal = dyn_cast(TVal); // The values aren't constants, this isn't the pattern we're looking for. if (!CFVal || !CTVal) return Op; // We can commute the SELECT_CC by inverting the condition. This // might be needed to make this fit into a CSINV pattern. if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } // If the constants line up, perform the transform! if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); FVal = Other; TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, DAG.getConstant(-1ULL, dl, Other.getValueType())); return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, CCVal, Cmp); } return Op; } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned Opc; bool ExtraOp = false; switch (Op.getOpcode()) { default: llvm_unreachable("Invalid code"); case ISD::ADDC: Opc = AArch64ISD::ADDS; break; case ISD::SUBC: Opc = AArch64ISD::SUBS; break; case ISD::ADDE: Opc = AArch64ISD::ADCS; ExtraOp = true; break; case ISD::SUBE: Opc = AArch64ISD::SBCS; ExtraOp = true; break; } if (!ExtraOp) return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) return SDValue(); SDLoc dl(Op); AArch64CC::CondCode CC; // The actual operation that sets the overflow or carry flag. SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); // We use 0 and 1 as false and true values. SDValue TVal = DAG.getConstant(1, dl, MVT::i32); SDValue FVal = DAG.getConstant(0, dl, MVT::i32); // We use an inverted condition, because the conditional select is inverted // too. This will allow it to be selected to a single instruction: // CSINC Wd, WZR, WZR, invert(cond). SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32); Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal, CCVal, Overflow); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } // Prefetch operands are: // 1: Address to prefetch // 2: bool isWrite // 3: int locality (0 = no locality ... 3 = extreme locality) // 4: bool isDataCache static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); unsigned IsWrite = cast(Op.getOperand(2))->getZExtValue(); unsigned Locality = cast(Op.getOperand(3))->getZExtValue(); unsigned IsData = cast(Op.getOperand(4))->getZExtValue(); bool IsStream = !Locality; // When the locality number is set if (Locality) { // The front-end should have filtered out the out-of-range values assert(Locality <= 3 && "Prefetch locality out-of-range"); // The locality degree is the opposite of the cache speed. // Put the number the other way around. // The encoding starts at 0 for level 1 Locality = 3 - Locality; } // built the mask value encoding the expected behavior. unsigned PrfOp = (IsWrite << 4) | // Load/Store bit (!IsData << 3) | // IsDataCache bit (Locality << 1) | // Cache level bits (unsigned)IsStream; // Stream bit return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); RTLIB::Libcall LC; LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); return LowerF128Call(Op, DAG, LC); } SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { if (Op.getOperand(0).getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); // FP_ROUND node has a second operand indicating whether it is known to be // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. SDValue SrcVal = Op.getOperand(0); return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, SDLoc(Op)).first; } static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); unsigned NumElts = InVT.getVectorNumElements(); // f16 vectors are promoted to f32 before a conversion. if (InVT.getVectorElementType() == MVT::f16) { MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts); SDLoc dl(Op); return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0))); } if (VT.getSizeInBits() < InVT.getSizeInBits()) { SDLoc dl(Op); SDValue Cv = DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); } if (VT.getSizeInBits() > InVT.getSizeInBits()) { SDLoc dl(Op); MVT ExtVT = MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), VT.getVectorNumElements()); SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); return DAG.getNode(Op.getOpcode(), dl, VT, Ext); } // Type changing conversions are illegal. return Op; } SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { if (Op.getOperand(0).getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); // f16 conversions are promoted to f32 when full fp16 is not supported. if (Op.getOperand(0).getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { SDLoc dl(Op); return DAG.getNode( Op.getOpcode(), dl, Op.getValueType(), DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0))); } if (Op.getOperand(0).getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector Ops(Op->op_begin(), Op->op_end()); return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. EVT VT = Op.getValueType(); SDLoc dl(Op); SDValue In = Op.getOperand(0); EVT InVT = In.getValueType(); if (VT.getSizeInBits() < InVT.getSizeInBits()) { MVT CastVT = MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), InVT.getVectorNumElements()); In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl)); } if (VT.getSizeInBits() > InVT.getSizeInBits()) { unsigned CastOpc = Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; EVT CastVT = VT.changeVectorElementTypeToInteger(); In = DAG.getNode(CastOpc, dl, CastVT, In); return DAG.getNode(Op.getOpcode(), dl, VT, In); } return Op; } SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVectorINT_TO_FP(Op, DAG); // f16 conversions are promoted to f32 when full fp16 is not supported. if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { SDLoc dl(Op); return DAG.getNode( ISD::FP_ROUND, dl, MVT::f16, DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)), DAG.getIntPtrConstant(0, dl)); } // i128 conversions are libcalls. if (Op.getOperand(0).getValueType() == MVT::i128) return SDValue(); // Other conversions are legal, unless it's to the completely software-based // fp128. if (Op.getValueType() != MVT::f128) return Op; RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); return LowerF128Call(Op, DAG, LC); } SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { // For iOS, we want to call an alternative entry point: __sincos_stret, // which returns the values in two S / D registers. SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); ArgListTy Args; ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; const char *LibcallName = getLibcallName(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); StructType *RetTy = StructType::get(ArgTy, ArgTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args)); std::pair CallResult = LowerCallTo(CLI); return CallResult.first; } static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { if (Op.getValueType() != MVT::f16) return SDValue(); assert(Op.getOperand(0).getValueType() == MVT::i16); SDLoc DL(Op); Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); return SDValue( DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); } static EVT getExtensionTo64Bits(const EVT &OrigVT) { if (OrigVT.getSizeInBits() >= 64) return OrigVT; assert(OrigVT.isSimple() && "Expecting a simple value type"); MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; switch (OrigSimpleTy) { default: llvm_unreachable("Unexpected Vector Type"); case MVT::v2i8: case MVT::v2i16: return MVT::v2i32; case MVT::v4i8: return MVT::v4i16; } } static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode) { // The vector originally had a size of OrigTy. It was then extended to ExtTy. // We expect the ExtTy to be 128-bits total. If the OrigTy is less than // 64-bits we need to insert a new extension so that it will be 64-bits. assert(ExtTy.is128BitVector() && "Unexpected extension size"); if (OrigTy.getSizeInBits() >= 64) return N; // Must extend size to at least 64 bits to be used as an operand for VMULL. EVT NewVT = getExtensionTo64Bits(OrigTy); return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); } static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned) { EVT VT = N->getValueType(0); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; for (const SDValue &Elt : N->op_values()) { if (ConstantSDNode *C = dyn_cast(Elt)) { unsigned EltSize = VT.getScalarSizeInBits(); unsigned HalfSize = EltSize / 2; if (isSigned) { if (!isIntN(HalfSize, C->getSExtValue())) return false; } else { if (!isUIntN(HalfSize, C->getZExtValue())) return false; } continue; } return false; } return true; } static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), N->getOpcode()); assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); EVT VT = N->getValueType(0); SDLoc dl(N); unsigned EltSize = VT.getScalarSizeInBits() / 2; unsigned NumElts = VT.getVectorNumElements(); MVT TruncVT = MVT::getIntegerVT(EltSize); SmallVector Ops; for (unsigned i = 0; i != NumElts; ++i) { ConstantSDNode *C = cast(N->getOperand(i)); const APInt &CInt = C->getAPIntValue(); // Element types smaller than 32 bits are not legal, so use i32 elements. // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { return N->getOpcode() == ISD::SIGN_EXTEND || isExtendedBUILD_VECTOR(N, DAG, true); } static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { return N->getOpcode() == ISD::ZERO_EXTEND || isExtendedBUILD_VECTOR(N, DAG, false); } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isSignExtended(N0, DAG) && isSignExtended(N1, DAG); } return false; } static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); } return false; } SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { // The rounding mode is in bits 23:22 of the FPSCR. // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)); SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, DAG.getConstant(3, dl, MVT::i32)); } static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. EVT VT = Op.getValueType(); assert(VT.is128BitVector() && VT.isInteger() && "unexpected type for custom-lowering ISD::MUL"); SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); unsigned NewOpc = 0; bool isMLA = false; bool isN0SExt = isSignExtended(N0, DAG); bool isN1SExt = isSignExtended(N1, DAG); if (isN0SExt && isN1SExt) NewOpc = AArch64ISD::SMULL; else { bool isN0ZExt = isZeroExtended(N0, DAG); bool isN1ZExt = isZeroExtended(N1, DAG); if (isN0ZExt && isN1ZExt) NewOpc = AArch64ISD::UMULL; else if (isN1SExt || isN1ZExt) { // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these // into (s/zext A * s/zext C) + (s/zext B * s/zext C) if (isN1SExt && isAddSubSExt(N0, DAG)) { NewOpc = AArch64ISD::SMULL; isMLA = true; } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { NewOpc = AArch64ISD::UMULL; isMLA = true; } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { std::swap(N0, N1); NewOpc = AArch64ISD::UMULL; isMLA = true; } } if (!NewOpc) { if (VT == MVT::v2i64) // Fall through to expand this. It is not legal. return SDValue(); else // Other vector multiplications are legal. return Op; } } // Legalize to a S/UMULL instruction SDLoc DL(Op); SDValue Op0; SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); if (!isMLA) { Op0 = skipExtensionForVectorMULL(N0, DAG); assert(Op0.getValueType().is64BitVector() && Op1.getValueType().is64BitVector() && "unexpected types for extended operands to VMULL"); return DAG.getNode(NewOpc, DL, VT, Op0, Op1); } // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during // isel lowering to take advantage of no-stall back to back s/umul + s/umla. // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); EVT Op1VT = Op1.getValueType(); return DAG.getNode(N0->getOpcode(), DL, VT, DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } // Lower vector multiply high (ISD::MULHS and ISD::MULHU). static SDValue LowerMULH(SDValue Op, SelectionDAG &DAG) { // Multiplications are only custom-lowered for 128-bit vectors so that // {S,U}MULL{2} can be detected. Otherwise v2i64 multiplications are not // legal. EVT VT = Op.getValueType(); assert(VT.is128BitVector() && VT.isInteger() && "unexpected type for custom-lowering ISD::MULH{U,S}"); SDValue V0 = Op.getOperand(0); SDValue V1 = Op.getOperand(1); SDLoc DL(Op); EVT ExtractVT = VT.getHalfNumVectorElementsVT(*DAG.getContext()); // We turn (V0 mulhs/mulhu V1) to: // // (uzp2 (smull (extract_subvector (ExtractVT V128:V0, (i64 0)), // (extract_subvector (ExtractVT V128:V1, (i64 0))))), // (smull (extract_subvector (ExtractVT V128:V0, (i64 VMull2Idx)), // (extract_subvector (ExtractVT V128:V2, (i64 VMull2Idx)))))) // // Where ExtractVT is a subvector with half number of elements, and // VMullIdx2 is the index of the middle element (the high part). // // The vector hight part extract and multiply will be matched against // {S,U}MULL{v16i8_v8i16,v8i16_v4i32,v4i32_v2i64} which in turn will // issue a {s}mull2 instruction. // // This basically multiply the lower subvector with '{s,u}mull', the high // subvector with '{s,u}mull2', and shuffle both results high part in // resulting vector. unsigned Mull2VectorIdx = VT.getVectorNumElements () / 2; SDValue VMullIdx = DAG.getConstant(0, DL, MVT::i64); SDValue VMull2Idx = DAG.getConstant(Mull2VectorIdx, DL, MVT::i64); SDValue VMullV0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMullIdx); SDValue VMullV1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMullIdx); SDValue VMull2V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V0, VMull2Idx); SDValue VMull2V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, V1, VMull2Idx); unsigned MullOpc = Op.getOpcode() == ISD::MULHS ? AArch64ISD::SMULL : AArch64ISD::UMULL; EVT MullVT = ExtractVT.widenIntegerVectorElementType(*DAG.getContext()); SDValue Mull = DAG.getNode(MullOpc, DL, MullVT, VMullV0, VMullV1); SDValue Mull2 = DAG.getNode(MullOpc, DL, MullVT, VMull2V0, VMull2V1); Mull = DAG.getNode(ISD::BITCAST, DL, VT, Mull); Mull2 = DAG.getNode(ISD::BITCAST, DL, VT, Mull2); return DAG.getNode(AArch64ISD::UZP2, DL, VT, Mull, Mull2); } SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT); } case Intrinsic::aarch64_neon_abs: return DAG.getNode(ISD::ABS, dl, Op.getValueType(), Op.getOperand(1)); case Intrinsic::aarch64_neon_smax: return DAG.getNode(ISD::SMAX, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_umax: return DAG.getNode(ISD::UMAX, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_smin: return DAG.getNode(ISD::SMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::aarch64_neon_umin: return DAG.getNode(ISD::UMIN, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } } // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, SelectionDAG &DAG) { assert(VT.isVector() && "VT should be a vector type"); assert(MemVT == MVT::v4i8 && VT == MVT::v4i16); SDValue Value = ST->getValue(); // It first extend the promoted v4i16 to v8i16, truncate to v8i8, and extract // the word lane which represent the v4i8 subvector. It optimizes the store // to: // // xtn v0.8b, v0.8h // str s0, [x0] SDValue Undef = DAG.getUNDEF(MVT::i16); SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL, {Undef, Undef, Undef, Undef}); SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Value, UndefVec); SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt); Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc); SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Trunc, DAG.getConstant(0, DL, MVT::i64)); return DAG.getStore(ST->getChain(), DL, ExtractTrunc, ST->getBasePtr(), ST->getMemOperand()); } // Custom lowering for any store, vector or scalar and/or default or with // a truncate operations. Currently only custom lower truncate operation // from vector v4i16 to v4i8. SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc Dl(Op); StoreSDNode *StoreNode = cast(Op); assert (StoreNode && "Can only custom lower store nodes"); SDValue Value = StoreNode->getValue(); EVT VT = Value.getValueType(); EVT MemVT = StoreNode->getMemoryVT(); assert (VT.isVector() && "Can only custom lower vector store types"); unsigned AS = StoreNode->getAddressSpace(); unsigned Align = StoreNode->getAlignment(); if (Align < MemVT.getStoreSize() && !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) { return scalarizeVectorStore(StoreNode, DAG); } if (StoreNode->isTruncatingStore()) { return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); } return SDValue(); } SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Custom lowering: "); LLVM_DEBUG(Op.dump()); switch (Op.getOpcode()) { default: llvm_unreachable("unimplemented operand"); return SDValue(); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::ADDC: case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::FADD: return LowerF128Call(Op, DAG, RTLIB::ADD_F128); case ISD::FSUB: return LowerF128Call(Op, DAG, RTLIB::SUB_F128); case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerVectorSRA_SRL_SHL(Op, DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTPOP: return LowerCTPOP(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::AND: return LowerVectorAND(Op, DAG); case ISD::OR: return LowerVectorOR(Op, DAG); case ISD::XOR: return LowerXOR(Op, DAG); case ISD::PREFETCH: return LowerPREFETCH(Op, DAG); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::MULHS: case ISD::MULHU: return LowerMULH(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_SMAX: case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_UMIN: case ISD::VECREDUCE_FMAX: case ISD::VECREDUCE_FMIN: return LowerVECREDUCE(Op, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerATOMIC_LOAD_SUB(Op, DAG); case ISD::ATOMIC_LOAD_AND: return LowerATOMIC_LOAD_AND(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); } } //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// #include "AArch64GenCallingConv.inc" /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const { switch (CC) { default: report_fatal_error("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; case CallingConv::GHC: return CC_AArch64_GHC; case CallingConv::C: case CallingConv::Fast: case CallingConv::PreserveMost: case CallingConv::CXX_FAST_TLS: case CallingConv::Swift: if (Subtarget->isTargetWindows() && IsVarArg) return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; case CallingConv::Win64: return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; } } CCAssignFn * AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; } SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); // At this point, Ins[].VT may already be promoted to i32. To correctly // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here // we use a special version of AnalyzeFormalArguments to pass in ValVT and // LocVT. unsigned NumArgs = Ins.size(); Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Ins[i].VT; if (Ins[i].isOrigArg()) { std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[i].getOrigArgIndex(); // Get type of the original argument. EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(), /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) ValVT = MVT::i8; else if (ActualMVT == MVT::i16) ValVT = MVT::i16; } CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } assert(ArgLocs.size() == Ins.size()); SmallVector ArgValues; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (Ins[i].Flags.isByVal()) { // Byval is used for HFAs in the PCS, but the system should work in a // non-compliant manner for larger structs. EVT PtrVT = getPointerTy(DAG.getDataLayout()); int Size = Ins[i].Flags.getByValSize(); unsigned NumRegs = (Size + 7) / 8; // FIXME: This works on big-endian for composite byvals, which are the common // case. It should also work for fundamental types too. unsigned FrameIdx = MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT); InVals.push_back(FrameIdxN); continue; } if (VA.isRegLoc()) { // Arguments stored in registers. EVT RegVT = VA.getLocVT(); SDValue ArgValue; const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = &AArch64::GPR32RegClass; else if (RegVT == MVT::i64) RC = &AArch64::GPR64RegClass; else if (RegVT == MVT::f16) RC = &AArch64::FPR16RegClass; else if (RegVT == MVT::f32) RC = &AArch64::FPR32RegClass; else if (RegVT == MVT::f64 || RegVT.is64BitVector()) RC = &AArch64::FPR64RegClass; else if (RegVT == MVT::f128 || RegVT.is128BitVector()) RC = &AArch64::FPR128RegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); // If this is an 8, 16 or 32-bit value, it is really passed promoted // to 64 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; case CCValAssign::AExt: case CCValAssign::SExt: case CCValAssign::ZExt: // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt // nodes after our lowering. assert(RegVT == Ins[i].VT && "incorrect register location selected"); break; } InVals.push_back(ArgValue); } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; uint32_t BEAlign = 0; if (!Subtarget->isLittleEndian() && ArgSize < 8 && !Ins[i].Flags.isInConsecutiveRegs()) BEAlign = 8 - ArgSize; int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; MVT MemVT = VA.getValVT(); switch (VA.getLocInfo()) { default: break; case CCValAssign::BCvt: MemVT = VA.getLocVT(); break; case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; case CCValAssign::ZExt: ExtType = ISD::ZEXTLOAD; break; case CCValAssign::AExt: ExtType = ISD::EXTLOAD; break; } ArgValue = DAG.getExtLoad( ExtType, DL, VA.getLocVT(), Chain, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT); InVals.push_back(ArgValue); } } // varargs AArch64FunctionInfo *FuncInfo = MF.getInfo(); if (isVarArg) { if (!Subtarget->isTargetDarwin() || IsWin64) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. // Win64 variadic functions also pass arguments in registers, but all float // arguments are passed in integer registers. saveVarArgRegisters(CCInfo, DAG, DL, Chain); } // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); // We currently pass all varargs at 8-byte alignment. StackOffset = ((StackOffset + 7) & ~7); FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true)); } unsigned StackArgSize = CCInfo.getNextStackOffset(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { // This is a non-standard ABI so by fiat I say we're allowed to make full // use of the stack area to be popped, which must be aligned to 16 bytes in // any case: StackArgSize = alignTo(StackArgSize, 16); // If we're expected to restore the stack (e.g. fastcc) then we'll be adding // a multiple of 16. FuncInfo->setArgumentStackToRestore(StackArgSize); // This realignment carries over to the available bytes below. Our own // callers will guarantee the space is free by giving an aligned value to // CALLSEQ_START. } // Even if we're not expected to free up the space, it's useful to know how // much is there while considering tail calls (because we can reuse it). FuncInfo->setBytesInStackArgArea(StackArgSize); return Chain; } void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); auto PtrVT = getPointerTy(DAG.getDataLayout()); bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()); SmallVector MemOps; static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 }; static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs); unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { if (IsWin64) { GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); if (GPRSaveSize & 15) // The extra size here, if triggered, will always be 8. MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false); } else GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, IsWin64 ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), GPRIdx, (i - FirstVariadicGPR) * 8) : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); } } FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); if (Subtarget->hasFPARMv8() && !IsWin64) { static const MCPhysReg FPRArgRegs[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs); unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; if (FPRSaveSize != 0) { FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false); SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT); for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(16, DL, PtrVT)); } } FuncInfo->setVarArgsFPRIndex(FPRIdx); FuncInfo->setVarArgsFPRSize(FPRSaveSize); } if (!MemOps.empty()) { Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } } /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference if (i == 0 && isThisReturn) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); continue; } SDValue Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; } InVals.push_back(Val); } return Chain; } /// Return true if the calling convention is one that we can guarantee TCO for. static bool canGuaranteeTCO(CallingConv::ID CC) { return CC == CallingConv::Fast; } /// Return true if we might ever do TCO for calls with this calling convention. static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { case CallingConv::C: case CallingConv::PreserveMost: case CallingConv::Swift: return true; default: return canGuaranteeTCO(CC); } } bool AArch64TargetLowering::isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const { if (!mayTailCallThisCC(CalleeCC)) return false; MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; // Byval parameters hand the function a pointer directly into the stack area // we want to reuse during a tail call. Working around this *is* possible (see // X86) but less efficient and uglier in LowerCall. for (Function::const_arg_iterator i = CallerF.arg_begin(), e = CallerF.arg_end(); i != e; ++i) if (i->hasByValAttr()) return false; if (getTargetMachine().Options.GuaranteedTailCallOpt) return canGuaranteeTCO(CalleeCC) && CCMatch; // Externally-defined functions with weak linkage should not be // tail-called on AArch64 when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls // to undefined weak functions to be replaced with a NOP or jump to the // next instruction. The behaviour of branch instructions in this // situation (as used for tail calls) is implementation-defined, so we // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); const Triple &TT = getTargetMachine().getTargetTriple(); if (GV->hasExternalWeakLinkage() && (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } // Now we search for cases where we can use a tail call without changing the // ABI. Sibcall is used in some places (particularly gcc) to refer to this // concept. // I want anyone implementing a new calling convention to think long and hard // about this assert. assert((!isVarArg || CalleeCC == CallingConv::C) && "Unexpected variadic calling convention"); LLVMContext &C = *DAG.getContext(); if (isVarArg && !Outs.empty()) { // At least two cases here: if caller is fastcc then we can't have any // memory arguments (we'd be expected to clean up the stack afterwards). If // caller is C then we could potentially use its argument area. // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); for (const CCValAssign &ArgLoc : ArgLocs) if (!ArgLoc.isRegLoc()) return false; } // Check that the call results are passed in the same way. if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, CCAssignFnForCall(CalleeCC, isVarArg), CCAssignFnForCall(CallerCC, isVarArg))) return false; // The callee has to preserve all registers the caller needs to preserve. const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (!CCMatch) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; } // Nothing more to check if the callee is taking no arguments if (Outs.empty()) return true; SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); const AArch64FunctionInfo *FuncInfo = MF.getInfo(); // If the stack arguments for this call do not fit into our own save area then // the call cannot be made tail. if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) return false; const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) return false; return true; } SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const { SmallVector ArgChains; int64_t FirstByte = MFI.getObjectOffset(ClobberedFI); int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1; // Include the original chain at the beginning of the list. When this is // used by target LowerCall hooks, this helps legalize find the // CALLSEQ_BEGIN node. ArgChains.push_back(Chain); // Add a chain value for each stack argument corresponding for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U) if (LoadSDNode *L = dyn_cast(*U)) if (FrameIndexSDNode *FI = dyn_cast(L->getBasePtr())) if (FI->getIndex() < 0) { int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex()); int64_t InLastByte = InFirstByte; InLastByte += MFI.getObjectSize(FI->getIndex()) - 1; if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || (FirstByte <= InFirstByte && InFirstByte <= LastByte)) ArgChains.push_back(SDValue(L, 1)); } // Build a tokenfactor for all the chains. return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); } bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const { return CallCC == CallingConv::Fast && TailCallOpt; } /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, /// and add input and output parameter nodes. SDValue AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &DL = CLI.DL; SmallVector &Outs = CLI.Outs; SmallVector &OutVals = CLI.OutVals; SmallVector &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; bool IsSibCall = false; if (IsTailCall) { // Check if it's really possible to do a tail call. IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // A sibling call is one where we're under the usual C ABI and not planning // to change that but can still do a tail call: if (!TailCallOpt && IsTailCall) IsSibCall = true; if (IsTailCall) ++NumTailCalls; } // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); if (IsVarArg) { // Handle fixed and variable vector arguments differently. // Variable vector arguments always go into memory. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/ !Outs[i].IsFixed); bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } } else { // At this point, Outs[].VT may already be promoted to i32. To correctly // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here // we use a special version of AnalyzeCallOperands to pass in ValVT and // LocVT. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Outs[i].VT; // Get type of the original argument. EVT ActualVT = getValueType(DAG.getDataLayout(), CLI.getArgs()[Outs[i].OrigArgIndex].Ty, /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) ValVT = MVT::i8; else if (ActualMVT == MVT::i16) ValVT = MVT::i16; CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } } // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); if (IsSibCall) { // Since we're not changing the ABI to make this a tail call, the memory // operands are already available in the caller's incoming argument space. NumBytes = 0; } // FPDiff is the byte offset of the call's argument area from the callee's. // Stores to callee stack arguments will be placed in FixedStackSlots offset // by this amount for a tail call. In a sibling call it must be 0 because the // caller will deallocate the entire stack and the callee still expects its // arguments to begin at SP+0. Completely unused for non-tail calls. int FPDiff = 0; if (IsTailCall && !IsSibCall) { unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); // Since callee will pop argument stack as a tail call, we must keep the // popped size 16-byte aligned. NumBytes = alignTo(NumBytes, 16); // FPDiff will be negative if this tail call requires more space than we // would automatically have in our incoming argument space. Positive if we // can actually shrink the stack. FPDiff = NumReusableBytes - NumBytes; // The stack pointer must be 16-byte aligned at all times it's used for a // memory operation, which in practice means at *all* times and in // particular across call boundaries. Therefore our own arguments started at // a 16-byte aligned SP and the delta applied for the tail call should // satisfy the same constraint. assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); } // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); SmallVector, 8> RegsToPass; SmallVector MemOpChains; auto PtrVT = getPointerTy(DAG.getDataLayout()); // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::AExt: if (Outs[realArgIdx].ArgVT == MVT::i1) { // AAPCS requires i1 to be zero-extended to 8-bits by the caller. Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); } Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; case CCValAssign::FPExt: Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); break; } if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i64) { assert(VA.getLocVT() == MVT::i64 && "unexpected calling convention register assignment"); assert(!Ins.empty() && Ins[0].VT == MVT::i64 && "unexpected use of 'returned'"); IsThisReturn = true; } RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { assert(VA.isMemLoc()); SDValue DstAddr; MachinePointerInfo DstInfo; // FIXME: This works on big-endian for composite byvals, which are the // common case. It should also work for fundamental types too. uint32_t BEAlign = 0; unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 : VA.getValVT().getSizeInBits(); OpSize = (OpSize + 7) / 8; if (!Subtarget->isLittleEndian() && !Flags.isByVal() && !Flags.isInConsecutiveRegs()) { if (OpSize < 8) BEAlign = 8 - OpSize; } unsigned LocMemOffset = VA.getLocMemOffset(); int32_t Offset = LocMemOffset + BEAlign; SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); if (IsTailCall) { Offset = Offset + FPDiff; int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); DstAddr = DAG.getFrameIndex(FI, PtrVT); DstInfo = MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI); // Make sure any stack arguments overlapping with where we're storing // are loaded before this eventual operation. Otherwise they'll be // clobbered. Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); } else { SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL); DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff); DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset); } if (Outs[i].Flags.isByVal()) { SDValue SizeNode = DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64); SDValue Cpy = DAG.getMemcpy( Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), /*isVol = */ false, /*AlwaysInline = */ false, /*isTailCall = */ false, DstInfo, MachinePointerInfo()); MemOpChains.push_back(Cpy); } else { // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already // promoted to a legal register type i32, we should truncate Arg back to // i1/i8/i16. if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16) Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo); MemOpChains.push_back(Store); } } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (auto &RegToPass : RegsToPass) { Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, RegToPass.second, InFlag); InFlag = Chain.getValue(1); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. if (auto *G = dyn_cast(Callee)) { auto GV = G->getGlobal(); if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) == AArch64II::MO_GOT) { Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT); } else { const GlobalValue *GV = G->getGlobal(); Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); } } else if (auto *S = dyn_cast(Callee)) { if (getTargetMachine().getCodeModel() == CodeModel::Large && Subtarget->isTargetMachO()) { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } else { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0); } } // We don't usually want to end the call-sequence here because we would tidy // the frame up *after* the call, however in the ABI-changing tail-call case // we've carefully laid out the parameters so that when sp is reset they'll be // in the correct location. if (IsTailCall && !IsSibCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), DAG.getIntPtrConstant(0, DL, true), InFlag, DL); InFlag = Chain.getValue(1); } std::vector Ops; Ops.push_back(Chain); Ops.push_back(Callee); if (IsTailCall) { // Each tail call may have to adjust the stack by a different amount, so // this information must travel along with the operation for eventual // consumption by emitEpilogue. Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); } // Add argument registers to the end of the list so that they are known live // into the call. for (auto &RegToPass : RegsToPass) Ops.push_back(DAG.getRegister(RegToPass.first, RegToPass.second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); if (IsThisReturn) { // For 'this' returns, use the X0-preserving mask if applicable Mask = TRI->getThisReturnPreservedMask(MF, CallConv); if (!Mask) { IsThisReturn = false; Mask = TRI->getCallPreservedMask(MF, CallConv); } } else Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) Ops.push_back(InFlag); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); // If we're doing a tall call, use a TC_RETURN here rather than an // actual call instruction. if (IsTailCall) { MF.getFrameInfo().setHasTailCall(); return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0; Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), DAG.getIntPtrConstant(CalleePopBytes, DL, true), InFlag, DL); if (!Ins.empty()) InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, InVals, IsThisReturn, IsThisReturn ? OutVals[0] : SDValue()); } bool AArch64TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC); } SDValue AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC); // Copy the result values into the output registers. SDValue Flag; SmallVector RetOps(1, Chain); for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Arg = OutVals[realRVLocIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: if (Outs[i].ArgVT == MVT::i1) { // AAPCS requires i1 to be zero-extended to i8 by the producer of the // value. This is strictly redundant on Darwin (which uses "zeroext // i1"), but will be optimised out before ISel. Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); } break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; } Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (AArch64::GPR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i64)); else if (AArch64::FPR64RegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); } //===----------------------------------------------------------------------===// // Other Lowering Code //===----------------------------------------------------------------------===// SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, N->getOffset(), Flag); } SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag); } SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(), N->getOffset(), Flag); } SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const { return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag); } // (loadGOT sym) template SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags); // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes instead of using a wrapper node. return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr); } // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)) template SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( AArch64ISD::WrapperLarge, DL, Ty, getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags), getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags), getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags), getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags)); } // (addlow (adrp %hi(sym)) %lo(sym)) template SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags) const { LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n"); SDLoc DL(N); EVT Ty = getPointerTy(DAG.getDataLayout()); SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags); SDValue Lo = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo); } SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GN = cast(Op); const GlobalValue *GV = GN->getGlobal(); const AArch64II::TOF TargetFlags = (GV->hasDLLImportStorageClass() ? AArch64II::MO_DLLIMPORT : AArch64II::MO_NO_FLAG); unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); if (OpFlags != AArch64II::MO_NO_FLAG) assert(cast(Op)->getOffset() == 0 && "unexpected offset in global node"); // This also catches the large code model case for Darwin. if ((OpFlags & AArch64II::MO_GOT) != 0) { return getGOT(GN, DAG, TargetFlags); } SDValue Result; if (getTargetMachine().getCodeModel() == CodeModel::Large) { Result = getAddrLarge(GN, DAG, TargetFlags); } else { Result = getAddr(GN, DAG, TargetFlags); } EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(GN); if (GV->hasDLLImportStorageClass()) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } /// Convert a TLS address reference into the correct sequence of loads /// and calls to compute the variable's address (for Darwin, currently) and /// return an SDValue containing the final node. /// Darwin only has one TLS scheme which must be capable of dealing with the /// fully general situation, in the worst case. This means: /// + "extern __thread" declaration. /// + Defined in a possibly unknown dynamic library. /// /// The general system is that each __thread variable has a [3 x i64] descriptor /// which contains information used by the runtime to calculate the address. The /// only part of this the compiler needs to know about is the first xword, which /// contains a function pointer that must be called with the address of the /// entire descriptor in "x0". /// /// Since this descriptor may be in a different unit, in general even the /// descriptor must be accessed via an indirect load. The "ideal" code sequence /// is: /// adrp x0, _var@TLVPPAGE /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, /// ; the function pointer /// blr x1 ; Uses descriptor address in x0 /// ; Address of _var is now in x0. /// /// If the address of _var's descriptor *is* known to the linker, then it can /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for /// a slight efficiency gain. SDValue AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "This function expects a Darwin target"); SDLoc DL(Op); MVT PtrVT = getPointerTy(DAG.getDataLayout()); const GlobalValue *GV = cast(Op)->getGlobal(); SDValue TLVPAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); // The first entry in the descriptor is a function pointer that we must call // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), /* Alignment = */ 8, MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable); Chain = FuncTLVGet.getValue(1); MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setAdjustsStack(true); // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). const uint32_t *Mask = Subtarget->getRegisterInfo()->getTLSCallPreservedMask(); // Finally, we can make the call. This is just a degenerate version of a // normal AArch64 call node: x0 takes the address of the descriptor, and // returns the address of the variable in this thread. Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); Chain = DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), DAG.getRegisterMask(Mask), Chain.getValue(1)); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); } /// When accessing thread-local variables under either the general-dynamic or /// local-dynamic system, we make a "TLS-descriptor" call. The variable will /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry /// is a function pointer to carry out the resolution. /// /// The sequence is: /// adrp x0, :tlsdesc:var /// ldr x1, [x0, #:tlsdesc_lo12:var] /// add x0, x0, #:tlsdesc_lo12:var /// .tlsdesccall var /// blr x1 /// (TPIDR_EL0 offset now in x0) /// /// The above sequence must be produced unscheduled, to enable the linker to /// optimize/relax this sequence. /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the /// above sequence, and expanded really late in the compilation flow, to ensure /// the sequence is produced as per above. SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Chain = DAG.getEntryNode(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr}); SDValue Glue = Chain.getValue(1); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); } SDValue AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "This function expects an ELF target"); assert(Subtarget->useSmallAddressing() && "ELF TLS only supported in small memory model"); // Different choices can be made for the maximum size of the TLS area for a // module. For the small address model, the default TLS size is 16MiB and the // maximum TLS size is 4GiB. // FIXME: add -mtls-size command line option and make it control the 16MiB // vs. 4GiB code sequence generation. const GlobalAddressSDNode *GA = cast(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); if (!EnableAArch64ELFLocalDynamicTLSGeneration) { if (Model == TLSModel::LocalDynamic) Model = TLSModel::GeneralDynamic; } SDValue TPOff; EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); const GlobalValue *GV = GA->getGlobal(); SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); if (Model == TLSModel::LocalExec) { SDValue HiVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue TPWithOff_lo = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, HiVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); SDValue TPWithOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, LoVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); return TPWithOff; } else if (Model == TLSModel::InitialExec) { TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); } else if (Model == TLSModel::LocalDynamic) { // Local-dynamic accesses proceed in two phases. A general-dynamic TLS // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate // the beginning of the module's TLS region, followed by a DTPREL offset // calculation. // These accesses will need deduplicating if there's more than one. AArch64FunctionInfo *MFI = DAG.getMachineFunction().getInfo(); MFI->incNumLocalDynamicTLSAccesses(); // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS); // Now we can calculate the offset from TPIDR_EL0 to this module's // thread-local area. TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); // Now use :dtprel_whatever: operations to calculate this variable's offset // in its thread-storage area. SDValue HiVar = DAG.getTargetGlobalAddress( GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, DAG.getTargetConstant(0, DL, MVT::i32)), 0); } else if (Model == TLSModel::GeneralDynamic) { // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); // Finally we can make a call to calculate the offset from tpidr_el0. TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); } else llvm_unreachable("Unsupported ELF TLS access model"); return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); } SDValue AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); SDValue Chain = DAG.getEntryNode(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64); // Load the ThreadLocalStoragePointer from the TEB // A pointer to the TLS array is located at offset 0x58 from the TEB. SDValue TLSArray = DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL)); TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); Chain = TLSArray.getValue(1); // Load the TLS index from the C runtime; // This does the same as getAddr(), but without having a GlobalAddressSDNode. // This also does the same as LOADgot, but using a generic i32 load, // while LOADgot only loads i64. SDValue TLSIndexHi = DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE); SDValue TLSIndexLo = DAG.getTargetExternalSymbol( "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi); SDValue TLSIndex = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo); TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo()); Chain = TLSIndex.getValue(1); // The pointer to the thread's TLS data area is at the TLS Index scaled by 8 // offset into the TLSArray. TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex); SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, DAG.getConstant(3, DL, PtrVT)); SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), MachinePointerInfo()); Chain = TLS.getValue(1); const GlobalAddressSDNode *GA = cast(Op); const GlobalValue *GV = GA->getGlobal(); SDValue TGAHi = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue TGALo = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); // Add the offset from the start of the .tls section (section base). SDValue Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi, DAG.getTargetConstant(0, DL, MVT::i32)), 0); Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo); return Addr; } SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { const GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().useEmulatedTLS()) return LowerToTLSEmulatedModel(GA, DAG); if (Subtarget->isTargetDarwin()) return LowerDarwinGlobalTLSAddress(Op, DAG); if (Subtarget->isTargetELF()) return LowerELFGlobalTLSAddress(Op, DAG); if (Subtarget->isTargetWindows()) return LowerWindowsGlobalTLSAddress(Op, DAG); llvm_unreachable("Unexpected platform trying to use TLS"); } SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc dl(Op); // Handle f128 first, since lowering it will result in comparing the return // value of a libcall against zero, which is just what the rest of LowerBR_CC // is expecting to deal with. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) return SDValue(); // The actual operation with overflow check. AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); if (CC == ISD::SETNE) OFCC = getInvertedCondCode(OFCC); SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Overflow); } if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); // If the RHS of the comparison is zero, we can potentially fold this // to a specialized branch. const ConstantSDNode *RHSC = dyn_cast(RHS); if (RHSC && RHSC->getZExtValue() == 0) { if (CC == ISD::SETEQ) { // See if we can use a TBZ to fold in an AND as well. // TBZ has a smaller branch displacement than CBZ. If the offset is // out of bounds, a late MI-layer pass rewrites branches. // 403.gcc is an example that hits this case. if (LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), dl, MVT::i64), Dest); } return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); } else if (CC == ISD::SETNE) { // See if we can use a TBZ to fold in an AND as well. // TBZ has a smaller branch displacement than CBZ. If the offset is // out of bounds, a late MI-layer pass rewrites branches. // 403.gcc is an example that hits this case. if (LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), dl, MVT::i64), Dest); } return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. uint64_t Mask = LHS.getValueSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(Mask, dl, MVT::i64), Dest); } } if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && LHS.getOpcode() != ISD::AND) { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. uint64_t Mask = LHS.getValueSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(Mask, dl, MVT::i64), Dest); } SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); } assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue BR1 = DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); if (CC2 != AArch64CC::AL) { SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, Cmp); } return BR1; } SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); if (SrcVT.bitsLT(VT)) In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); else if (SrcVT.bitsGT(VT)) In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL)); EVT VecVT; uint64_t EltMask; SDValue VecVal1, VecVal2; auto setVecVal = [&] (int Idx) { if (!VT.isVector()) { VecVal1 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1); VecVal2 = DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); } }; if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32); EltMask = 0x80000000ULL; setVecVal(AArch64::ssub); } else if (VT == MVT::f64 || VT == MVT::v2f64) { VecVT = MVT::v2i64; // We want to materialize a mask with the high bit set, but the AdvSIMD // immediate moves cannot materialize that in a single instruction for // 64-bit elements. Instead, materialize zero and then negate it. EltMask = 0; setVecVal(AArch64::dsub); } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) { VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16); EltMask = 0x8000ULL; setVecVal(AArch64::hsub); } else { llvm_unreachable("Invalid type for copysign!"); } SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT); // If we couldn't materialize the mask above, then the mask vector will be // the zero vector, and we need to negate it here. if (VT == MVT::f64 || VT == MVT::v2f64) { BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); } SDValue Sel = DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); if (VT == MVT::f16) return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, Sel); if (VT == MVT::f32) return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); else if (VT == MVT::f64) return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); else return DAG.getNode(ISD::BITCAST, DL, VT, Sel); } SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { if (DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat)) return SDValue(); if (!Subtarget->hasNEON()) return SDValue(); // While there is no integer popcount instruction, it can // be more efficiently lowered to the following sequence that uses // AdvSIMD registers/instructions as long as the copies to/from // the AdvSIMD registers are cheap. // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd // CNT V0.8B, V0.8B // 8xbyte pop-counts // ADDV B0, V0.8B // sum 8xbyte pop-counts // UMOV X0, V0.B[0] // copy byte result back to integer reg SDValue Val = Op.getOperand(0); SDLoc DL(Op); EVT VT = Op.getValueType(); if (VT == MVT::i32) Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val); Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val); SDValue UaddLV = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop); if (VT == MVT::i64) UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); return UaddLV; } SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDLoc dl(Op); // We chose ZeroOrOneBooleanContents, so use zero and one. EVT VT = Op.getValueType(); SDValue TVal = DAG.getConstant(1, dl, VT); SDValue FVal = DAG.getConstant(0, dl, VT); // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { assert(LHS.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); return LHS; } } if (LHS.getValueType().isInteger()) { SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); } // Now we know we're dealing with FP values. assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead // and do the comparison. SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); if (CC2 == AArch64CC::AL) { changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); } else { // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't // totally clean. Some of them require two CSELs to implement. As is in // this case, we emit the first CSEL and then emit a second using the output // of the first as the RHS. We're effectively OR'ing the two CC's together. // FIXME: It would be nice if we could match the two CSELs to two CSINCs. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } } SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const { // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } // Also handle f16, for which we need to do a f32 comparison. if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS); } // Next, handle integers. if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); unsigned Opcode = AArch64ISD::CSEL; // If both the TVal and the FVal are constants, see if we can swap them in // order to for a CSINV or CSINC out of them. ConstantSDNode *CFVal = dyn_cast(FVal); ConstantSDNode *CTVal = dyn_cast(TVal); if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } else if (TVal.getOpcode() == ISD::XOR) { // If TVal is a NOT we want to swap TVal and FVal so that we can match // with a CSINV rather than a CSEL. if (isAllOnesConstant(TVal.getOperand(1))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } } else if (TVal.getOpcode() == ISD::SUB) { // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so // that we can match with a CSNEG rather than a CSEL. if (isNullConstant(TVal.getOperand(0))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } } else if (CTVal && CFVal) { const int64_t TrueVal = CTVal->getSExtValue(); const int64_t FalseVal = CFVal->getSExtValue(); bool Swap = false; // If both TVal and FVal are constants, see if FVal is the // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC // instead of a CSEL in that case. if (TrueVal == ~FalseVal) { Opcode = AArch64ISD::CSINV; } else if (TrueVal == -FalseVal) { Opcode = AArch64ISD::CSNEG; } else if (TVal.getValueType() == MVT::i32) { // If our operands are only 32-bit wide, make sure we use 32-bit // arithmetic for the check whether we can use CSINC. This ensures that // the addition in the check will wrap around properly in case there is // an overflow (which would not be the case if we do the check with // 64-bit arithmetic). const uint32_t TrueVal32 = CTVal->getZExtValue(); const uint32_t FalseVal32 = CFVal->getZExtValue(); if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { Opcode = AArch64ISD::CSINC; if (TrueVal32 > FalseVal32) { Swap = true; } } // 64-bit check whether we can use CSINC. } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { Opcode = AArch64ISD::CSINC; if (TrueVal > FalseVal) { Swap = true; } } // Swap TVal and FVal if necessary. if (Swap) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } if (Opcode != AArch64ISD::CSEL) { // Drop FVal since we can get its value by simply inverting/negating // TVal. FVal = TVal; } } // Avoid materializing a constant when possible by reusing a known value in // a register. However, don't perform this optimization if the known value // is one, zero or negative one in the case of a CSEL. We can always // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the // FVal, respectively. ConstantSDNode *RHSVal = dyn_cast(RHS); if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() && !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) { AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to // "a != C ? x : a" to avoid materializing C. if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ) TVal = LHS; else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE) FVal = LHS; } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) { assert (CTVal && CFVal && "Expected constant operands for CSNEG."); // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to // avoid materializing C. AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) { Opcode = AArch64ISD::CSINV; TVal = LHS; FVal = DAG.getConstant(0, dl, FVal.getValueType()); } } SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); EVT VT = TVal.getValueType(); return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); } // Now we know we're dealing with FP values. assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); assert(LHS.getValueType() == RHS.getValueType()); EVT VT = TVal.getValueType(); SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two CSELs to implement. AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); if (DAG.getTarget().Options.UnsafeFPMath) { // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0. ConstantFPSDNode *RHSVal = dyn_cast(RHS); if (RHSVal && RHSVal->isZero()) { ConstantFPSDNode *CFVal = dyn_cast(FVal); ConstantFPSDNode *CTVal = dyn_cast(TVal); if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) && CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType()) TVal = LHS; else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) && CFVal && CFVal->isZero() && FVal.getValueType() == LHS.getValueType()) FVal = LHS; } } // Emit first, and possibly only, CSEL. SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); // If we need a second CSEL, emit it, using the output of the first as the // RHS. We're effectively OR'ing the two CC's together. if (CC2 != AArch64CC::AL) { SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } // Otherwise, return the output of the first CSEL. return CS1; } SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); SDLoc DL(Op); return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); } SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue CCVal = Op->getOperand(0); SDValue TVal = Op->getOperand(1); SDValue FVal = Op->getOperand(2); SDLoc DL(Op); // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select // instruction. if (isOverflowIntrOpRes(CCVal)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0))) return SDValue(); AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG); SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, Overflow); } // Lower it the same way as we would lower a SELECT_CC node. ISD::CondCode CC; SDValue LHS, RHS; if (CCVal.getOpcode() == ISD::SETCC) { LHS = CCVal.getOperand(0); RHS = CCVal.getOperand(1); CC = cast(CCVal->getOperand(2))->get(); } else { LHS = CCVal; RHS = DAG.getConstant(0, DL, CCVal.getValueType()); CC = ISD::SETNE; } return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG); } SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. JumpTableSDNode *JT = cast(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { return getAddrLarge(JT, DAG); } return getAddr(JT, DAG); } SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { // Use the GOT for the large code model on iOS. if (Subtarget->isTargetMachO()) { return getGOT(CP, DAG); } return getAddrLarge(CP, DAG); } else { return getAddr(CP, DAG); } } SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { BlockAddressSDNode *BA = cast(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { return getAddrLarge(BA, DAG); } else { return getAddr(BA, DAG); } } SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const { AArch64FunctionInfo *FuncInfo = DAG.getMachineFunction().getInfo(); SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const { AArch64FunctionInfo *FuncInfo = DAG.getMachineFunction().getInfo(); SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 ? FuncInfo->getVarArgsGPRIndex() : FuncInfo->getVarArgsStackIndex(), getPointerTy(DAG.getDataLayout())); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV)); } SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call // Standard, section B.3. MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue VAList = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); SmallVector MemOps; // void *__stack at offset 0 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, MachinePointerInfo(SV), /* Alignment = */ 8)); // void *__gr_top at offset 8 int GPRSize = FuncInfo->getVarArgsGPRSize(); if (GPRSize > 0) { SDValue GRTop, GRTopAddr; GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT)); GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT); GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop, DAG.getConstant(GPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, MachinePointerInfo(SV, 8), /* Alignment = */ 8)); } // void *__vr_top at offset 16 int FPRSize = FuncInfo->getVarArgsFPRSize(); if (FPRSize > 0) { SDValue VRTop, VRTopAddr; VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(16, DL, PtrVT)); VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT); VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop, DAG.getConstant(FPRSize, DL, PtrVT)); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, MachinePointerInfo(SV, 16), /* Alignment = */ 8)); } // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT)); MemOps.push_back(DAG.getStore( Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr, MachinePointerInfo(SV, 24), /* Alignment = */ 4)); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT)); MemOps.push_back(DAG.getStore( Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr, MachinePointerInfo(SV, 28), /* Alignment = */ 4)); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv())) return LowerWin64_VASTART(Op, DAG); else if (Subtarget->isTargetDarwin()) return LowerDarwin_VASTART(Op, DAG); else return LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); unsigned VaListSize = Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; const Value *DestSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2), DAG.getConstant(VaListSize, DL, MVT::i32), 8, false, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "automatic va_arg instruction only works on Darwin"); const Value *V = cast(Op.getOperand(2))->getValue(); EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); auto PtrVT = getPointerTy(DAG.getDataLayout()); SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V)); Chain = VAList.getValue(1); if (Align > 8) { assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(Align - 1, DL, PtrVT)); VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList, DAG.getConstant(-(int64_t)Align, DL, PtrVT)); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the // vaargs list to match this, and for FP values we need to introduce // FP_ROUND nodes as well. if (VT.isInteger() && !VT.isVector()) ArgSize = 8; bool NeedFPTrunc = false; if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { ArgSize = 8; NeedFPTrunc = true; } // Increment the pointer, VAList, to the next vaarg SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(ArgSize, DL, PtrVT)); // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V)); // Load the actual argument out of the pointer VAList if (NeedFPTrunc) { // Load the value as an f64. SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo()); // Round the value down to an f32. SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), DAG.getIntPtrConstant(1, DL)); SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; // Merge the rounded value with the chain output of the load. return DAG.getMergeValues(Ops, DL); } return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo()); } SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); MFI.setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { unsigned Reg = StringSwitch(RegName) .Case("sp", AArch64::SP) .Case("x18", AArch64::X18) .Case("w18", AArch64::W18) .Case("x20", AArch64::X20) .Case("w20", AArch64::W20) .Default(0); if (((Reg == AArch64::X18 || Reg == AArch64::W18) && !Subtarget->isX18Reserved()) || ((Reg == AArch64::X20 || Reg == AArch64::W20) && !Subtarget->isX20Reserved())) Reg = 0; if (Reg) return Reg; report_fatal_error(Twine("Invalid register name \"" + StringRef(RegName) + "\".")); } SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setReturnAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout())); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo()); } // Return LR, which contains the return address. Mark it an implicit live-in. unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } /// LowerShiftRightParts - Lower SRA_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which // is "undef". We wanted 0, so CSEL it directly. SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETEQ, dl, DAG); SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); HiBitsForLo = DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), HiBitsForLo, CCVal, Cmp); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); SDValue LoForNormalShift = DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo); Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, dl, DAG); CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, LoForNormalShift, CCVal, Cmp); // AArch64 shifts larger than the register width are wrapped rather than // clamped, so we can't just emit "hi >> x". SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); SDValue HiForBigShift = Opc == ISD::SRA ? DAG.getNode(Opc, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, dl, MVT::i64)) : DAG.getConstant(0, dl, VT); SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, HiForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and take a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, dl, MVT::i64), ShAmt); SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which // is "undef". We wanted 0, so CSEL it directly. SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETEQ, dl, DAG); SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32); LoBitsForHi = DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64), LoBitsForHi, CCVal, Cmp); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, dl, MVT::i64)); SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); SDValue HiForNormalShift = DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi); SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE, dl, DAG); CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32); SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift, HiForNormalShift, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. SDValue LoForBigShift = DAG.getConstant(0, dl, VT); SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift, LoForNormalShift, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } bool AArch64TargetLowering::isOffsetFoldingLegal( const GlobalAddressSDNode *GA) const { // Offsets are folded in the DAG combine rather than here so that we can // intelligently choose an offset based on the uses. return false; } bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. // FIXME: We should be able to handle f128 as well with a clever lowering. if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasFullFP16()))) { LLVM_DEBUG( dbgs() << "Legal fp imm: materialize 0 using the zero register\n"); return true; } StringRef FPType; bool IsLegal = false; SmallString<128> ImmStrVal; Imm.toString(ImmStrVal); if (VT == MVT::f64) { FPType = "f64"; IsLegal = AArch64_AM::getFP64Imm(Imm) != -1; } else if (VT == MVT::f32) { FPType = "f32"; IsLegal = AArch64_AM::getFP32Imm(Imm) != -1; } else if (VT == MVT::f16 && Subtarget->hasFullFP16()) { FPType = "f16"; IsLegal = AArch64_AM::getFP16Imm(Imm) != -1; } if (IsLegal) { LLVM_DEBUG(dbgs() << "Legal " << FPType << " imm value: " << ImmStrVal << "\n"); return true; } if (!FPType.empty()) LLVM_DEBUG(dbgs() << "Illegal " << FPType << " imm value: " << ImmStrVal << "\n"); else LLVM_DEBUG(dbgs() << "Illegal fp imm " << ImmStrVal << ": unsupported fp type\n"); return false; } //===----------------------------------------------------------------------===// // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode, SDValue Operand, SelectionDAG &DAG, int &ExtraSteps) { EVT VT = Operand.getValueType(); if (ST->hasNEON() && (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 || VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 || VT == MVT::v4f32)) { if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) // For the reciprocal estimates, convergence is quadratic, so the number // of digits is doubled after each iteration. In ARMv8, the accuracy of // the initial estimate is 2^-8. Thus the number of extra steps to refine // the result for float (23 mantissa bits) is 2 and for double (52 // mantissa bits) is 3. ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2; return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand); } return SDValue(); } SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const { if (Enabled == ReciprocalEstimate::Enabled || (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, DAG, ExtraSteps)) { SDLoc DL(Operand); EVT VT = Operand.getValueType(); SDNodeFlags Flags; Flags.setAllowReassociation(true); // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2) // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N) for (int i = ExtraSteps; i > 0; --i) { SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, Flags); Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags); Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } if (!Reciprocal) { EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); SDValue FPZero = DAG.getConstantFP(0.0, DL, VT); SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ); Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags); // Correct the result if the operand is 0.0. Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, Eq, Operand, Estimate); } ExtraSteps = 0; return Estimate; } return SDValue(); } SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps) const { if (Enabled == ReciprocalEstimate::Enabled) if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, DAG, ExtraSteps)) { SDLoc DL(Operand); EVT VT = Operand.getValueType(); SDNodeFlags Flags; Flags.setAllowReassociation(true); // Newton reciprocal iteration: E * (2 - X * E) // AArch64 reciprocal iteration instruction: (2 - M * N) for (int i = ExtraSteps; i > 0; --i) { SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, Estimate, Flags); Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags); } ExtraSteps = 0; return Estimate; } return SDValue(); } //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// // Table of Constraints // TODO: This is the current set of constraints supported by ARM for the // compiler, not all of them may make sense. // // r - A general register // w - An FP/SIMD register of some size in the range v0-v31 // x - An FP/SIMD register of some size in the range v0-v15 // I - Constant that can be used with an ADD instruction // J - Constant that can be used with a SUB instruction // K - Constant that can be used with a 32-bit logical instruction // L - Constant that can be used with a 64-bit logical instruction // M - Constant that can be used as a 32-bit MOV immediate // N - Constant that can be used as a 64-bit MOV immediate // Q - A memory reference with base register and no offset // S - A symbolic address // Y - Floating point constant zero // Z - Integer constant zero // // Note that general register operands will be output using their 64-bit x // register name, whatever the size of the variable, unless the asm operand // is prefixed by the %w modifier. Floating-point and SIMD register operands // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or // %q modifier. const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const { // At this point, we have to lower this constraint to something else, so we // lower it to an "r" or "w". However, by doing this we will force the result // to be in register, while the X constraint is much more permissive. // // Although we are correct (we are free to emit anything, without // constraints), we might break use cases that would expect us to be more // efficient and emit something else. if (!Subtarget->hasFPARMv8()) return "r"; if (ConstraintVT.isFloatingPoint()) return "w"; if (ConstraintVT.isVector() && (ConstraintVT.getSizeInBits() == 64 || ConstraintVT.getSizeInBits() == 128)) return "w"; return "r"; } /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. AArch64TargetLowering::ConstraintType AArch64TargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'z': return C_Other; case 'x': case 'w': return C_RegisterClass; // An address with a single base register. Due to the way we // currently handle addresses it is the same as 'r'. case 'Q': return C_Memory; case 'S': // A symbolic address return C_Other; } } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight AArch64TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); break; case 'x': case 'w': if (type->isFloatingPointTy() || type->isVectorTy()) weight = CW_Register; break; case 'z': weight = CW_Constant; break; } return weight; } std::pair AArch64TargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': if (VT.getSizeInBits() == 64) return std::make_pair(0U, &AArch64::GPR64commonRegClass); return std::make_pair(0U, &AArch64::GPR32commonRegClass); case 'w': if (VT.getSizeInBits() == 16) return std::make_pair(0U, &AArch64::FPR16RegClass); if (VT.getSizeInBits() == 32) return std::make_pair(0U, &AArch64::FPR32RegClass); if (VT.getSizeInBits() == 64) return std::make_pair(0U, &AArch64::FPR64RegClass); if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128RegClass); break; // The instructions that this constraint is designed for can // only take 128-bit registers so just use that regclass. case 'x': if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128_loRegClass); break; } } if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); // Not found as a standard register? if (!Res.second) { unsigned Size = Constraint.size(); if ((Size == 4 || Size == 5) && Constraint[0] == '{' && tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { int RegNo; bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo); if (!Failed && RegNo >= 0 && RegNo <= 31) { // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. if (VT != MVT::Other && VT.getSizeInBits() == 64) { Res.first = AArch64::FPR64RegClass.getRegister(RegNo); Res.second = &AArch64::FPR64RegClass; } else { Res.first = AArch64::FPR128RegClass.getRegister(RegNo); Res.second = &AArch64::FPR128RegClass; } } } } return Res; } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void AArch64TargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const { SDValue Result; // Currently only support length 1 constraints. if (Constraint.length() != 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; // This set of constraints deal with valid constants for various instructions. // Validate and return a target constant for them if we can. case 'z': { // 'z' maps to xzr or wzr so it needs an input of 0. if (!isNullConstant(Op)) return; if (Op.getValueType() == MVT::i64) Result = DAG.getRegister(AArch64::XZR, MVT::i64); else Result = DAG.getRegister(AArch64::WZR, MVT::i32); break; } case 'S': { // An absolute symbolic address or label reference. if (const GlobalAddressSDNode *GA = dyn_cast(Op)) { Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op), GA->getValueType(0)); } else if (const BlockAddressSDNode *BA = dyn_cast(Op)) { Result = DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0)); } else if (const ExternalSymbolSDNode *ES = dyn_cast(Op)) { Result = DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0)); } else return; break; } case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': ConstantSDNode *C = dyn_cast(Op); if (!C) return; // Grab the value and do some validation. uint64_t CVal = C->getZExtValue(); switch (ConstraintLetter) { // The I constraint applies only to simple ADD or SUB immediate operands: // i.e. 0 to 4095 with optional shift by 12 // The J constraint applies only to ADD or SUB immediates that would be // valid when negated, i.e. if [an add pattern] were to be output as a SUB // instruction [or vice versa], in other words -1 to -4095 with optional // left shift by 12. case 'I': if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) break; return; case 'J': { uint64_t NVal = -C->getSExtValue(); if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { CVal = C->getSExtValue(); break; } return; } // The K and L constraints apply *only* to logical immediates, including // what used to be the MOVI alias for ORR (though the MOVI alias has now // been removed and MOV should be used). So these constraints have to // distinguish between bit patterns that are valid 32-bit or 64-bit // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice // versa. case 'K': if (AArch64_AM::isLogicalImmediate(CVal, 32)) break; return; case 'L': if (AArch64_AM::isLogicalImmediate(CVal, 64)) break; return; // The M and N constraints are a superset of K and L respectively, for use // with the MOV (immediate) alias. As well as the logical immediates they // also match 32 or 64-bit immediates that can be loaded either using a // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca // (M) or 64-bit 0x1234000000000000 (N) etc. // As a note some of this code is liberally stolen from the asm parser. case 'M': { if (!isUInt<32>(CVal)) return; if (AArch64_AM::isLogicalImmediate(CVal, 32)) break; if ((CVal & 0xFFFF) == CVal) break; if ((CVal & 0xFFFF0000ULL) == CVal) break; uint64_t NCVal = ~(uint32_t)CVal; if ((NCVal & 0xFFFFULL) == NCVal) break; if ((NCVal & 0xFFFF0000ULL) == NCVal) break; return; } case 'N': { if (AArch64_AM::isLogicalImmediate(CVal, 64)) break; if ((CVal & 0xFFFFULL) == CVal) break; if ((CVal & 0xFFFF0000ULL) == CVal) break; if ((CVal & 0xFFFF00000000ULL) == CVal) break; if ((CVal & 0xFFFF000000000000ULL) == CVal) break; uint64_t NCVal = ~CVal; if ((NCVal & 0xFFFFULL) == NCVal) break; if ((NCVal & 0xFFFF0000ULL) == NCVal) break; if ((NCVal & 0xFFFF00000000ULL) == NCVal) break; if ((NCVal & 0xFFFF000000000000ULL) == NCVal) break; return; } default: return; } // All assembler immediates are 64-bit integers. Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64); break; } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } //===----------------------------------------------------------------------===// // AArch64 Advanced SIMD Support //===----------------------------------------------------------------------===// /// WidenVector - Given a value in the V64 register class, produce the /// equivalent value in the V128 register class. static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { EVT VT = V64Reg.getValueType(); unsigned NarrowSize = VT.getVectorNumElements(); MVT EltTy = VT.getVectorElementType().getSimpleVT(); MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); SDLoc DL(V64Reg); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), V64Reg, DAG.getConstant(0, DL, MVT::i32)); } /// getExtFactor - Determine the adjustment factor for the position when /// generating an "extract from vector registers" instruction. static unsigned getExtFactor(SDValue &V) { EVT EltType = V.getValueType().getVectorElementType(); return EltType.getSizeInBits() / 8; } /// NarrowVector - Given a value in the V128 register class, produce the /// equivalent value in the V64 register class. static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { EVT VT = V128Reg.getValueType(); unsigned WideSize = VT.getVectorNumElements(); MVT EltTy = VT.getVectorElementType().getSimpleVT(); MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); SDLoc DL(V128Reg); return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); } // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); struct ShuffleSourceInfo { SDValue Vec; unsigned MinElt; unsigned MaxElt; // We may insert some combination of BITCASTs and VEXT nodes to force Vec to // be compatible with the shuffle we intend to construct. As a result // ShuffleVec will be some sliding window into the original Vec. SDValue ShuffleVec; // Code should guarantee that element i in Vec starts at element "WindowBase // + i * WindowScale in ShuffleVec". int WindowBase; int WindowScale; ShuffleSourceInfo(SDValue Vec) : Vec(Vec), MinElt(std::numeric_limits::max()), MaxElt(0), ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } }; // First gather all vectors used as an immediate source for this BUILD_VECTOR // node. SmallVector Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.isUndef()) continue; else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa(V.getOperand(1))) { LLVM_DEBUG( dbgs() << "Reshuffle failed: " "a shuffle can only come from building a vector from " "various elements of other vectors, provided their " "indices are constant\n"); return SDValue(); } // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); auto Source = find(Sources, SourceVec); if (Source == Sources.end()) Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); // Update the minimum and maximum lane number seen. unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); Source->MinElt = std::min(Source->MinElt, EltNo); Source->MaxElt = std::max(Source->MaxElt, EltNo); } if (Sources.size() > 2) { LLVM_DEBUG( dbgs() << "Reshuffle failed: currently only do something sane when at " "most two source vectors are involved\n"); return SDValue(); } // Find out the smallest element size among result and two sources, and use // it as element size to build the shuffle_vector. EVT SmallestEltTy = VT.getVectorElementType(); for (auto &Source : Sources) { EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); if (SrcEltTy.bitsLT(SmallestEltTy)) { SmallestEltTy = SrcEltTy; } } unsigned ResMultiplier = VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); // If the source vector is too wide or too narrow, we may nevertheless be able // to construct a compatible shuffle either by concatenating it with UNDEF or // extracting a suitable range of elements. for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); if (SrcVT.getSizeInBits() == VT.getSizeInBits()) continue; // This stage of the search produces a source with the same element type as // the original, but with a total width matching the BUILD_VECTOR output. EVT EltVT = SrcVT.getVectorElementType(); unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); // We can pad out the smaller vector for free, so if it's part of a // shuffle... Src.ShuffleVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; } assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); if (Src.MaxElt - Src.MinElt >= NumSrcElts) { LLVM_DEBUG( dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n"); return SDValue(); } if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i64)); Src.WindowBase = -NumSrcElts; } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i64)); } else { // An actual VEXT is needed SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i64)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i64)); unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, VEXTSrc2, DAG.getConstant(Imm, dl, MVT::i32)); Src.WindowBase = -Src.MinElt; } } // Another possible incompatibility occurs from the vector element types. We // can fix this by bitcasting the source vectors to the same type we intend // for the shuffle. for (auto &Src : Sources) { EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); Src.WindowBase *= Src.WindowScale; } // Final sanity check before we try to actually produce a shuffle. LLVM_DEBUG(for (auto Src : Sources) assert(Src.ShuffleVec.getValueType() == ShuffleVT);); // The stars all align, our next step is to produce the mask for the shuffle. SmallVector Mask(ShuffleVT.getVectorNumElements(), -1); int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); if (Entry.isUndef()) continue; auto Src = find(Sources, Entry.getOperand(0)); int EltNo = cast(Entry.getOperand(1))->getSExtValue(); // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); int BitsDefined = std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; // This source is expected to fill ResMultiplier lanes of the final shuffle, // starting at the appropriate offset. int *LaneMask = &Mask[i * ResMultiplier]; int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; ExtractBase += NumElts * (Src - Sources.begin()); for (int j = 0; j < LanesDefined; ++j) LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... if (!isShuffleMaskLegal(Mask, ShuffleVT)) { LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n"); return SDValue(); } SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; for (unsigned i = 0; i < Sources.size(); ++i) ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], ShuffleOps[1], Mask); SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump(); dbgs() << "Reshuffle, creating node: "; V.dump();); return V; } // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are the same. static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); // Assume that the first shuffle index is not UNDEF. Fail if it is. if (M[0] < 0) return false; Imm = M[0]; // If this is a VEXT shuffle, the immediate value is the index of the first // element. The other shuffle indices must be the successive elements after // the first one. unsigned ExpectedElt = Imm; for (unsigned i = 1; i < NumElts; ++i) { // Increment the expected index. If it wraps around, just follow it // back to index zero and keep going. ++ExpectedElt; if (ExpectedElt == NumElts) ExpectedElt = 0; if (M[i] < 0) continue; // ignore UNDEF indices if (ExpectedElt != static_cast(M[i])) return false; } return true; } // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are different. static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, unsigned &Imm) { // Look for the first non-undef element. const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; }); // Benefit form APInt to handle overflow when calculating expected element. unsigned NumElts = VT.getVectorNumElements(); unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); // The following shuffle indices must be the successive elements after the // first real element. const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); if (FirstWrongElt != M.end()) return false; // The index of an EXT is the first element if it is not UNDEF. // Watch out for the beginning UNDEFs. The EXT index should be the expected // value of the first element. E.g. // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. // ExpectedElt is the last mask index plus 1. Imm = ExpectedElt.getZExtValue(); // There are two difference cases requiring to reverse input vectors. // For example, for vector <4 x i32> we have the following cases, // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) // For both cases, we finally use mask <5, 6, 7, 0>, which requires // to reverse two input vectors. if (Imm < NumElts) ReverseEXT = true; else Imm -= NumElts; return true; } /// isREVMask - Check if a vector shuffle corresponds to a REV /// instruction with the specified blocksize. (The order of the elements /// within each block of the vector is reversed.) static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && "Only possible block sizes for REV are: 16, 32, 64"); unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); unsigned BlockElts = M[0] + 1; // If the first shuffle index is UNDEF, be optimistic. if (M[0] < 0) BlockElts = BlockSize / EltSz; if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) return false; for (unsigned i = 0; i < NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) return false; } return true; } static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != Idx) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) return false; Idx += 1; } return true; } static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i != NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned)M[i] != 2 * i + WhichResult) return false; } return true; } static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) return false; } return true; } /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != Idx) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) return false; Idx += 1; } return true; } /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned Half = VT.getVectorNumElements() / 2; WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned j = 0; j != 2; ++j) { unsigned Idx = WhichResult; for (unsigned i = 0; i != Half; ++i) { int MIdx = M[i + j * Half]; if (MIdx >= 0 && (unsigned)MIdx != Idx) return false; Idx += 2; } } return true; } /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) return false; } return true; } static bool isINSMask(ArrayRef M, int NumInputElements, bool &DstIsLeft, int &Anomaly) { if (M.size() != static_cast(NumInputElements)) return false; int NumLHSMatch = 0, NumRHSMatch = 0; int LastLHSMismatch = -1, LastRHSMismatch = -1; for (int i = 0; i < NumInputElements; ++i) { if (M[i] == -1) { ++NumLHSMatch; ++NumRHSMatch; continue; } if (M[i] == i) ++NumLHSMatch; else LastLHSMismatch = i; if (M[i] == i + NumInputElements) ++NumRHSMatch; else LastRHSMismatch = i; } if (NumLHSMatch == NumInputElements - 1) { DstIsLeft = true; Anomaly = LastLHSMismatch; return true; } else if (NumRHSMatch == NumInputElements - 1) { DstIsLeft = false; Anomaly = LastRHSMismatch; return true; } return false; } static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { if (VT.getSizeInBits() != 128) return false; unsigned NumElts = VT.getVectorNumElements(); for (int I = 0, E = NumElts / 2; I != E; I++) { if (Mask[I] != I) return false; } int Offset = NumElts / 2; for (int I = NumElts / 2, E = NumElts; I != E; I++) { if (Mask[I] != I + SplitLHS * Offset) return false; } return true; } static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue V0 = Op.getOperand(0); SDValue V1 = Op.getOperand(1); ArrayRef Mask = cast(Op)->getMask(); if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || VT.getVectorElementType() != V1.getValueType().getVectorElementType()) return SDValue(); bool SplitV0 = V0.getValueSizeInBits() == 128; if (!isConcatMask(Mask, VT, SplitV0)) return SDValue(); EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), VT.getVectorNumElements() / 2); if (SplitV0) { V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, DAG.getConstant(0, DL, MVT::i64)); } if (V1.getValueSizeInBits() == 128) { V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, DAG.getConstant(0, DL, MVT::i64)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit /// the specified operations to build the shuffle. static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); enum { OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> OP_VREV, OP_VDUP0, OP_VDUP1, OP_VDUP2, OP_VDUP3, OP_VEXT1, OP_VEXT2, OP_VEXT3, OP_VUZPL, // VUZP, left result OP_VUZPR, // VUZP, right result OP_VZIPL, // VZIP, left result OP_VZIPR, // VZIP, right result OP_VTRNL, // VTRN, left result OP_VTRNR // VTRN, right result }; if (OpNum == OP_COPY) { if (LHSID == (1 * 9 + 2) * 9 + 3) return LHS; assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); return RHS; } SDValue OpLHS, OpRHS; OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); EVT VT = OpLHS.getValueType(); switch (OpNum) { default: llvm_unreachable("Unknown shuffle opcode!"); case OP_VREV: // VREV divides the vector in half and swaps within the half. if (VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::f32) return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); // vrev <4 x i16> -> REV32 if (VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::f16) return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); // vrev <4 x i8> -> REV16 assert(VT.getVectorElementType() == MVT::i8); return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); case OP_VDUP0: case OP_VDUP1: case OP_VDUP2: case OP_VDUP3: { EVT EltTy = VT.getVectorElementType(); unsigned Opcode; if (EltTy == MVT::i8) Opcode = AArch64ISD::DUPLANE8; else if (EltTy == MVT::i16 || EltTy == MVT::f16) Opcode = AArch64ISD::DUPLANE16; else if (EltTy == MVT::i32 || EltTy == MVT::f32) Opcode = AArch64ISD::DUPLANE32; else if (EltTy == MVT::i64 || EltTy == MVT::f64) Opcode = AArch64ISD::DUPLANE64; else llvm_unreachable("Invalid vector element type?"); if (VT.getSizeInBits() == 64) OpLHS = WidenVector(OpLHS, DAG); SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64); return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); } case OP_VEXT1: case OP_VEXT2: case OP_VEXT3: { unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, DAG.getConstant(Imm, dl, MVT::i32)); } case OP_VUZPL: return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VUZPR: return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VZIPL: return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VZIPR: return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VTRNL: return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VTRNR: return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); } } static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, SelectionDAG &DAG) { // Check to see if we can use the TBL instruction. SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc DL(Op); EVT EltVT = Op.getValueType().getVectorElementType(); unsigned BytesPerElt = EltVT.getSizeInBits() / 8; SmallVector TBLMask; for (int Val : ShuffleMask) { for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { unsigned Offset = Byte + Val * BytesPerElt; TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32)); } } MVT IndexVT = MVT::v8i8; unsigned IndexLen = 8; if (Op.getValueSizeInBits() == 128) { IndexVT = MVT::v16i8; IndexLen = 16; } SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; if (V2.getNode()->isUndef()) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, DAG.getBuildVector(IndexVT, DL, makeArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst, DAG.getBuildVector(IndexVT, DL, makeArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we // cannot currently represent the register constraints on the input // table registers. // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, // DAG.getBuildVector(IndexVT, DL, &TBLMask[0], // IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst, V2Cst, DAG.getBuildVector(IndexVT, DL, makeArrayRef(TBLMask.data(), IndexLen))); } } return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); } static unsigned getDUPLANEOp(EVT EltType) { if (EltType == MVT::i8) return AArch64ISD::DUPLANE8; if (EltType == MVT::i16 || EltType == MVT::f16) return AArch64ISD::DUPLANE16; if (EltType == MVT::i32 || EltType == MVT::f32) return AArch64ISD::DUPLANE32; if (EltType == MVT::i64 || EltType == MVT::f64) return AArch64ISD::DUPLANE64; llvm_unreachable("Invalid vector element type?"); } SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); // Convert shuffles that are directly supported on NEON to target-specific // DAG nodes, instead of keeping them as shuffles and matching them again // during code selection. This is more efficient and avoids the possibility // of inconsistencies between legalization and selection. ArrayRef ShuffleMask = SVN->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. if (Lane == -1) Lane = 0; if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), V1.getOperand(0)); // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- // constant. If so, we can just reference the lane's definition directly. if (V1.getOpcode() == ISD::BUILD_VECTOR && !isa(V1.getOperand(Lane))) return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); // Otherwise, duplicate from the lane of the input vector. unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); // SelectionDAGBuilder may have "helpfully" already extracted or conatenated // to make a vector of the same size as this SHUFFLE. We can ignore the // extract entirely, and canonicalise the concat using WidenVector. if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { Lane += cast(V1.getOperand(1))->getZExtValue(); V1 = V1.getOperand(0); } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; Lane -= Idx * VT.getVectorNumElements() / 2; V1 = WidenVector(V1.getOperand(Idx), DAG); } else if (VT.getSizeInBits() == 64) V1 = WidenVector(V1, DAG); return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64)); } if (isREVMask(ShuffleMask, VT, 64)) return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); if (isREVMask(ShuffleMask, VT, 32)) return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); if (isREVMask(ShuffleMask, VT, 16)) return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); bool ReverseEXT = false; unsigned Imm; if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { if (ReverseEXT) std::swap(V1, V2); Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) { Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); } unsigned WhichResult; if (isZIPMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isUZPMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isTRNMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG)) return Concat; bool DstIsLeft; int Anomaly; int NumInputElements = V1.getValueType().getVectorNumElements(); if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { SDValue DstVec = DstIsLeft ? V1 : V2; SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64); SDValue SrcVec = V1; int SrcLane = ShuffleMask[Anomaly]; if (SrcLane >= NumInputElements) { SrcVec = V2; SrcLane -= VT.getVectorNumElements(); } SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64); EVT ScalarVT = VT.getVectorElementType(); if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) ScalarVT = MVT::i32; return DAG.getNode( ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), DstLaneV); } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. unsigned NumElts = VT.getVectorNumElements(); if (NumElts == 4) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (ShuffleMask[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = ShuffleMask[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } return GenerateTBL(Op, ShuffleMask, DAG); } static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; for (unsigned i = 0; i < NumSplats; ++i) { CnstBits <<= SplatBitSize; UndefBits <<= SplatBitSize; CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); } return true; } return false; } // Try 64-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64; if (AArch64_AM::isAdvSIMDModImmType10(Value)) { Value = AArch64_AM::encodeAdvSIMDModImmType10(Value); SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 32-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS = nullptr) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; bool isAdvSIMDModImm = false; uint64_t Shift; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType1(Value); Shift = 0; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType2(Value); Shift = 8; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType3(Value); Shift = 16; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType4(Value); Shift = 24; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov; if (LHS) Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); else Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 16-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits, const SDValue *LHS = nullptr) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; bool isAdvSIMDModImm = false; uint64_t Shift; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType5(Value); Shift = 0; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType6(Value); Shift = 8; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov; if (LHS) Mov = DAG.getNode(NewOp, dl, MovTy, *LHS, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); else Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 32-bit splatted SIMD immediate with shifted ones. static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; bool isAdvSIMDModImm = false; uint64_t Shift; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType7(Value); Shift = 264; } else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType8(Value); Shift = 272; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32), DAG.getConstant(Shift, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try 8-bit splatted SIMD immediate. static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; if (AArch64_AM::isAdvSIMDModImmType9(Value)) { Value = AArch64_AM::encodeAdvSIMDModImmType9(Value); SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } // Try FP splatted SIMD immediate. static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG, const APInt &Bits) { if (Bits.getHiBits(64) == Bits.getLoBits(64)) { uint64_t Value = Bits.zextOrTrunc(64).getZExtValue(); EVT VT = Op.getValueType(); bool isWide = (VT.getSizeInBits() == 128); MVT MovTy; bool isAdvSIMDModImm = false; if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType11(Value); MovTy = isWide ? MVT::v4f32 : MVT::v2f32; } else if (isWide && (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) { Value = AArch64_AM::encodeAdvSIMDModImmType12(Value); MovTy = MVT::v2f64; } if (isAdvSIMDModImm) { SDLoc dl(Op); SDValue Mov = DAG.getNode(NewOp, dl, MovTy, DAG.getConstant(Value, dl, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } return SDValue(); } SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, SelectionDAG &DAG) const { SDValue LHS = Op.getOperand(0); EVT VT = Op.getValueType(); BuildVectorSDNode *BVN = dyn_cast(Op.getOperand(1).getNode()); if (!BVN) { // AND commutes, so try swapping the operands. LHS = Op.getOperand(1); BVN = dyn_cast(Op.getOperand(0).getNode()); } if (!BVN) return Op; APInt DefBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, DefBits, UndefBits)) { SDValue NewOp; // We only have BIC vector immediate instruction, which is and-not. DefBits = ~DefBits; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, Op, DAG, DefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, Op, DAG, DefBits, &LHS))) return NewOp; UndefBits = ~UndefBits; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, Op, DAG, UndefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, Op, DAG, UndefBits, &LHS))) return NewOp; } // We can always fall back to a non-immediate AND. return Op; } // Specialized code to quickly find if PotentialBVec is a BuildVector that // consists of only the same constant int value, returned in reference arg // ConstVal static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal) { BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); if (!Bvec) return false; ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); if (!FirstElt) return false; EVT VT = Bvec->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 1; i < NumElts; ++i) if (dyn_cast(Bvec->getOperand(i)) != FirstElt) return false; ConstVal = FirstElt->getZExtValue(); return true; } static unsigned getIntrinsicID(const SDNode *N) { unsigned Opcode = N->getOpcode(); switch (Opcode) { default: return Intrinsic::not_intrinsic; case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast(N->getOperand(0))->getZExtValue(); if (IID < Intrinsic::num_intrinsics) return IID; return Intrinsic::not_intrinsic; } } } // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a // BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. // Also, logical shift right -> sri, with the same structure. static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); SDLoc DL(N); // Is the first op an AND? const SDValue And = N->getOperand(0); if (And.getOpcode() != ISD::AND) return SDValue(); // Is the second op an shl or lshr? SDValue Shift = N->getOperand(1); // This will have been turned into: AArch64ISD::VSHL vector, #shift // or AArch64ISD::VLSHR vector, #shift unsigned ShiftOpc = Shift.getOpcode(); if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) return SDValue(); bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; // Is the shift amount constant? ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); if (!C2node) return SDValue(); // Is the and mask vector all constant? uint64_t C1; if (!isAllConstantBuildVector(And.getOperand(1), C1)) return SDValue(); // Is C1 == ~C2, taking into account how much one can shift elements of a // particular size? uint64_t C2 = C2node->getZExtValue(); unsigned ElemSizeInBits = VT.getScalarSizeInBits(); if (C2 > ElemSizeInBits) return SDValue(); unsigned ElemMask = (1 << ElemSizeInBits) - 1; if ((C1 & ElemMask) != (~C2 & ElemMask)) return SDValue(); SDValue X = And.getOperand(0); SDValue Y = Shift.getOperand(0); unsigned Intrin = IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; SDValue ResultSLI = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Intrin, DL, MVT::i32), X, Y, Shift.getOperand(1)); LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n"); LLVM_DEBUG(N->dump(&DAG)); LLVM_DEBUG(dbgs() << "into: \n"); LLVM_DEBUG(ResultSLI->dump(&DAG)); ++NumShiftInserts; return ResultSLI; } SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) if (EnableAArch64SlrGeneration) { if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG)) return Res; } EVT VT = Op.getValueType(); SDValue LHS = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast(Op.getOperand(1).getNode()); if (!BVN) { // OR commutes, so try swapping the operands. LHS = Op.getOperand(1); BVN = dyn_cast(Op.getOperand(0).getNode()); } if (!BVN) return Op; APInt DefBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, DefBits, UndefBits)) { SDValue NewOp; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, DefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, DefBits, &LHS))) return NewOp; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, UndefBits, &LHS)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, UndefBits, &LHS))) return NewOp; } // We can always fall back to a non-immediate OR. return Op; } // Normalize the operands of BUILD_VECTOR. The value of constant operands will // be truncated to fit element width. static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); EVT EltTy= VT.getVectorElementType(); if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) return Op; SmallVector Ops; for (SDValue Lane : Op->ops()) { if (auto *CstLane = dyn_cast(Lane)) { APInt LowBits(EltTy.getSizeInBits(), CstLane->getZExtValue()); Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); } Ops.push_back(Lane); } return DAG.getBuildVector(VT, dl, Ops); } static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); APInt DefBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); BuildVectorSDNode *BVN = cast(Op.getNode()); if (resolveBuildVector(BVN, DefBits, UndefBits)) { SDValue NewOp; if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) return NewOp; DefBits = ~DefBits; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) return NewOp; DefBits = UndefBits; if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) return NewOp; DefBits = ~UndefBits; if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) return NewOp; } return SDValue(); } SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); // Try to build a simple constant vector. Op = NormalizeBuildVector(Op, DAG); if (VT.isInteger()) { // Certain vector constants, used to express things like logical NOT and // arithmetic NEG, are passed through unmodified. This allows special // patterns for these operations to match, which will lower these constants // to whatever is proven necessary. BuildVectorSDNode *BVN = cast(Op.getNode()); if (BVN->isConstant()) if (ConstantSDNode *Const = BVN->getConstantSplatNode()) { unsigned BitSize = VT.getVectorElementType().getSizeInBits(); APInt Val(BitSize, Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue()); if (Val.isNullValue() || Val.isAllOnesValue()) return Op; } } if (SDValue V = ConstantBuildVector(Op, DAG)) return V; // Scan through the operands to find some interesting properties we can // exploit: // 1) If only one value is used, we can use a DUP, or // 2) if only the low element is not undef, we can just insert that, or // 3) if only one constant value is used (w/ some non-constant lanes), // we can splat the constant value into the whole vector then fill // in the non-constant lanes. // 4) FIXME: If different constant values are used, but we can intelligently // select the values we'll be overwriting for the non-constant // lanes such that we can directly materialize the vector // some other way (MOVI, e.g.), we can be sneaky. // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP. SDLoc dl(Op); unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; bool usesOnlyOneConstantValue = true; bool isConstant = true; bool AllLanesExtractElt = true; unsigned NumConstantLanes = 0; SDValue Value; SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) AllLanesExtractElt = false; if (V.isUndef()) continue; if (i > 0) isOnlyLowElement = false; if (!isa(V) && !isa(V)) isConstant = false; if (isa(V) || isa(V)) { ++NumConstantLanes; if (!ConstantValue.getNode()) ConstantValue = V; else if (ConstantValue != V) usesOnlyOneConstantValue = false; } if (!Value.getNode()) Value = V; else if (V != Value) usesOnlyOneValue = false; } if (!Value.getNode()) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n"); return DAG.getUNDEF(VT); } if (isOnlyLowElement) { LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 " "SCALAR_TO_VECTOR node\n"); return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); } if (AllLanesExtractElt) { SDNode *Vector = nullptr; bool Even = false; bool Odd = false; // Check whether the extract elements match the Even pattern <0,2,4,...> or // the Odd pattern <1,3,5,...>. for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); const SDNode *N = V.getNode(); if (!isa(N->getOperand(1))) break; SDValue N0 = N->getOperand(0); // All elements are extracted from the same vector. if (!Vector) { Vector = N0.getNode(); // Check that the type of EXTRACT_VECTOR_ELT matches the type of // BUILD_VECTOR. if (VT.getVectorElementType() != N0.getValueType().getVectorElementType()) break; } else if (Vector != N0.getNode()) { Odd = false; Even = false; break; } // Extracted values are either at Even indices <0,2,4,...> or at Odd // indices <1,3,5,...>. uint64_t Val = N->getConstantOperandVal(1); if (Val == 2 * i) { Even = true; continue; } if (Val - 1 == 2 * i) { Odd = true; continue; } // Something does not match: abort. Odd = false; Even = false; break; } if (Even || Odd) { SDValue LHS = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), DAG.getConstant(0, dl, MVT::i64)); SDValue RHS = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0), DAG.getConstant(NumElts, dl, MVT::i64)); if (Even && !Odd) return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS, RHS); if (Odd && !Even) return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS, RHS); } } // Use DUP for non-constant splats. For f32 constant splats, reduce to // i32 and try again. if (usesOnlyOneValue) { if (!isConstant) { if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Value.getValueType() != VT) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n"); return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); } // This is actually a DUPLANExx operation, which keeps everything vectory. SDValue Lane = Value.getOperand(1); Value = Value.getOperand(0); if (Value.getValueSizeInBits() == 64) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, " "widening it\n"); Value = WidenVector(Value, DAG); } unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); return DAG.getNode(Opcode, dl, VT, Value, Lane); } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; EVT EltTy = VT.getVectorElementType(); assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) && "Unsupported floating-point vector type"); LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int " "BITCASTS, and try again\n"); MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits()); for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: "; Val.dump();); Val = LowerBUILD_VECTOR(Val, DAG); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); } } // If there was only one constant value used and for more than one lane, // start by splatting that value, then replace the non-constant lanes. This // is better than the default, which will perform a separate initialization // for each lane. if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { // Firstly, try to materialize the splat constant. SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue), Val = ConstantBuildVector(Vec, DAG); if (!Val) { // Otherwise, materialize the constant and splat it. Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); DAG.ReplaceAllUsesWith(Vec.getNode(), &Val); } // Now insert the non-constant lanes. for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); if (!isa(V) && !isa(V)) // Note that type legalization likely mucked about with the VT of the // source operand, so we may have to convert it here before inserting. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); } return Val; } // This will generate a load from the constant pool. if (isConstant) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default " "expansion\n"); return SDValue(); } // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { if (SDValue shuffle = ReconstructShuffle(Op, DAG)) return shuffle; } // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. For a vector with one or two non-undef values, that's // scalar_to_vector for the elements followed by a shuffle (provided the // shuffle is valid for the target) and materialization element by element // on the stack followed by a load for everything else. if (!isConstant && !usesOnlyOneValue) { LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence " "of INSERT_VECTOR_ELT\n"); SDValue Vec = DAG.getUNDEF(VT); SDValue Op0 = Op.getOperand(0); unsigned i = 0; // Use SCALAR_TO_VECTOR for lane zero to // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the // value is already in an S or D register, and we're forced to emit an // INSERT_SUBREG that we can't fold anywhere. // // We also allow types like i8 and i16 which are illegal scalar but legal // vector element types. After type-legalization the inserted value is // extended (i32) and it is safe to cast them to the vector type by ignoring // the upper bits of the lowest lane (e.g. v8i8, v4i16). if (!Op0.isUndef()) { LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n"); Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0); ++i; } LLVM_DEBUG(if (i < NumElts) dbgs() << "Creating nodes for the other vector elements:\n";); for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.isUndef()) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; } LLVM_DEBUG( dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find " "better alternative\n"); return SDValue(); } SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); // Check for non-constant or out of range lane. EVT VT = Op.getOperand(0).getValueType(); ConstantSDNode *CI = dyn_cast(Op.getOperand(2)); if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) return SDValue(); // For V64 types, we perform insertion by expanding the value // to a V128 type and perform the insertion on that. SDLoc DL(Op); SDValue WideVec = WidenVector(Op.getOperand(0), DAG); EVT WideTy = WideVec.getValueType(); SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, Op.getOperand(1), Op.getOperand(2)); // Re-narrow the resultant vector. return NarrowVector(Node, DAG); } SDValue AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); // Check for non-constant or out of range lane. EVT VT = Op.getOperand(0).getValueType(); ConstantSDNode *CI = dyn_cast(Op.getOperand(1)); if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) return SDValue(); // For V64 types, we perform extraction by expanding the value // to a V128 type and perform the extraction on that. SDLoc DL(Op); SDValue WideVec = WidenVector(Op.getOperand(0), DAG); EVT WideTy = WideVec.getValueType(); EVT ExtrTy = WideTy.getVectorElementType(); if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) ExtrTy = MVT::i32; // For extractions, we just return the result directly. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getOperand(0).getValueType(); SDLoc dl(Op); // Just in case... if (!VT.isVector()) return SDValue(); ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); if (!Cst) return SDValue(); unsigned Val = Cst->getZExtValue(); unsigned Size = Op.getValueSizeInBits(); // This will get lowered to an appropriate EXTRACT_SUBREG in ISel. if (Val == 0) return Op; // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. if (Size == 64 && Val * VT.getScalarSizeInBits() == 64) return Op; return SDValue(); } bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (M[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = M[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return true; } bool DummyBool; int DummyInt; unsigned DummyUnsigned; return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || isEXTMask(M, VT, DummyBool, DummyUnsigned) || // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || isZIPMask(M, VT, DummyUnsigned) || isTRN_v_undef_Mask(M, VT, DummyUnsigned) || isUZP_v_undef_Mask(M, VT, DummyUnsigned) || isZIP_v_undef_Mask(M, VT, DummyUnsigned) || isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || isConcatMask(M, VT, VT.getSizeInBits() == 128)); } /// getVShiftImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift operation, where all the elements of the /// build_vector must have the same constant integer value. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { // Ignore bit_converts. while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, ElementBits) || SplatBitSize > ElementBits) return false; Cnt = SplatBits.getSExtValue(); return true; } /// isVShiftLImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift left operation. That value must be in the range: /// 0 <= Value < ElementBits for a left shift; or /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getScalarSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift right operation. The value must be in the range: /// 1 <= Value <= ElementBits for a right shift; or static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getScalarSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); } SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); int64_t Cnt; if (!Op.getOperand(1).getValueType().isVector()) return Op; unsigned EltSize = VT.getScalarSizeInBits(); switch (Op.getOpcode()) { default: llvm_unreachable("unexpected shift opcode"); case ISD::SHL: if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0), DAG.getConstant(Cnt, DL, MVT::i32)); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32), Op.getOperand(0), Op.getOperand(1)); case ISD::SRA: case ISD::SRL: // Right shift immediate if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) { unsigned Opc = (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; return DAG.getNode(Opc, DL, VT, Op.getOperand(0), DAG.getConstant(Cnt, DL, MVT::i32)); } // Right shift register. Note, there is not a shift right register // instruction, but the shift left register instruction takes a signed // value, where negative numbers specify a right shift. unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl : Intrinsic::aarch64_neon_ushl; // negate the shift amount SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1)); SDValue NegShiftLeft = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0), NegShift); return NegShiftLeft; } return SDValue(); } static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, AArch64CC::CondCode CC, bool NoNans, EVT VT, const SDLoc &dl, SelectionDAG &DAG) { EVT SrcVT = LHS.getValueType(); assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && "function only supposed to emit natural comparisons"); BuildVectorSDNode *BVN = dyn_cast(RHS.getNode()); APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); bool IsZero = IsCnst && (CnstBits == 0); if (SrcVT.getVectorElementType().isFloatingPoint()) { switch (CC) { default: return SDValue(); case AArch64CC::NE: { SDValue Fcmeq; if (IsZero) Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); else Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq); } case AArch64CC::EQ: if (IsZero) return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); case AArch64CC::GE: if (IsZero) return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); case AArch64CC::GT: if (IsZero) return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); case AArch64CC::LS: if (IsZero) return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); case AArch64CC::LT: if (!NoNans) return SDValue(); // If we ignore NaNs then we can use to the MI implementation. LLVM_FALLTHROUGH; case AArch64CC::MI: if (IsZero) return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); } } switch (CC) { default: return SDValue(); case AArch64CC::NE: { SDValue Cmeq; if (IsZero) Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); else Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); } case AArch64CC::EQ: if (IsZero) return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); case AArch64CC::GE: if (IsZero) return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); case AArch64CC::GT: if (IsZero) return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); case AArch64CC::LE: if (IsZero) return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); case AArch64CC::LS: return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); case AArch64CC::LO: return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); case AArch64CC::LT: if (IsZero) return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); case AArch64CC::HI: return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); case AArch64CC::HS: return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); } } SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); SDLoc dl(Op); if (LHS.getValueType().getVectorElementType().isInteger()) { assert(LHS.getValueType() == RHS.getValueType()); AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); SDValue Cmp = EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } const bool FullFP16 = static_cast(DAG.getSubtarget()).hasFullFP16(); // Make v4f16 (only) fcmp operations utilise vector instructions // v8f16 support will be a litle more complicated if (LHS.getValueType().getVectorElementType() == MVT::f16) { if (!FullFP16 && LHS.getValueType().getVectorNumElements() == 4) { LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS); RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS); SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC); DAG.ReplaceAllUsesWith(Op, NewSetcc); CmpVT = MVT::v4i32; } else return SDValue(); } assert(LHS.getValueType().getVectorElementType() == MVT::f32 || LHS.getValueType().getVectorElementType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. AArch64CC::CondCode CC1, CC2; bool ShouldInvert; changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; SDValue Cmp = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); if (!Cmp.getNode()) return SDValue(); if (CC2 != AArch64CC::AL) { SDValue Cmp2 = EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); if (!Cmp2.getNode()) return SDValue(); Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); } Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); if (ShouldInvert) return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); return Cmp; } static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp, SelectionDAG &DAG) { SDValue VecOp = ScalarOp.getOperand(0); auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx, DAG.getConstant(0, DL, MVT::i64)); } SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); switch (Op.getOpcode()) { case ISD::VECREDUCE_ADD: return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG); case ISD::VECREDUCE_SMAX: return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG); case ISD::VECREDUCE_SMIN: return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG); case ISD::VECREDUCE_UMAX: return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG); case ISD::VECREDUCE_UMIN: return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG); case ISD::VECREDUCE_FMAX: { assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag"); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32), Op.getOperand(0)); } case ISD::VECREDUCE_FMIN: { assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag"); return DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32), Op.getOperand(0)); } default: llvm_unreachable("Unhandled reduction"); } } SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const { auto &Subtarget = static_cast(DAG.getSubtarget()); if (!Subtarget.hasLSE()) return SDValue(); // LSE has an atomic load-add instruction, but not a load-sub. SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); SDValue RHS = Op.getOperand(2); AtomicSDNode *AN = cast(Op.getNode()); RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(), Op.getOperand(0), Op.getOperand(1), RHS, AN->getMemOperand()); } SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const { auto &Subtarget = static_cast(DAG.getSubtarget()); if (!Subtarget.hasLSE()) return SDValue(); // LSE has an atomic load-clear instruction, but not a load-and. SDLoc dl(Op); MVT VT = Op.getSimpleValueType(); SDValue RHS = Op.getOperand(2); AtomicSDNode *AN = cast(Op.getNode()); RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS); return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(), Op.getOperand(0), Op.getOperand(1), RHS, AN->getMemOperand()); } SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC( SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const { SDLoc dl(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0); const uint32_t *Mask = Subtarget->getRegisterInfo()->getWindowsStackProbePreservedMask(); Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size, DAG.getConstant(4, dl, MVT::i64)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue()); Chain = DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue), Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64), DAG.getRegisterMask(Mask), Chain.getValue(1)); // To match the actual intent better, we should read the output from X15 here // again (instead of potentially spilling it to the stack), but rereading Size // from X15 here doesn't work at -O0, since it thinks that X15 is undefined // here. Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size, DAG.getConstant(4, dl, MVT::i64)); return Chain; } SDValue AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "Only Windows alloca probing supported"); SDLoc dl(Op); // Get the inputs. SDNode *Node = Op.getNode(); SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); unsigned Align = cast(Op.getOperand(2))->getZExtValue(); EVT VT = Node->getValueType(0); if (DAG.getMachineFunction().getFunction().hasFnAttribute( "no-stack-arg-probe")) { SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); } Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG); SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), DAG.getConstant(-(uint64_t)Align, dl, VT)); Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true), DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); SDValue Ops[2] = {SP, Chain}; return DAG.getMergeValues(Ops, dl); } /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { auto &DL = I.getModule()->getDataLayout(); switch (Intrinsic) { case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: case Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: case Intrinsic::aarch64_neon_ld1x4: case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: case Intrinsic::aarch64_neon_ld2r: case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align = 0; // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::aarch64_neon_st2: case Intrinsic::aarch64_neon_st3: case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: case Intrinsic::aarch64_neon_st1x4: case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: { Info.opc = ISD::INTRINSIC_VOID; // Conservatively set memVT to the entire set of vectors stored. unsigned NumElts = 0; for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align = 0; // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; } case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } case Intrinsic::aarch64_ldaxp: case Intrinsic::aarch64_ldxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = 16; Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; case Intrinsic::aarch64_stlxp: case Intrinsic::aarch64_stxp: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; Info.align = 16; Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; default: break; } return false; } bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const { // If we're reducing the load width in order to avoid having to use an extra // instruction to do extension then it's probably a good idea. if (ExtTy != ISD::NON_EXTLOAD) return true; // Don't reduce load width if it would prevent us from combining a shift into // the offset. MemSDNode *Mem = dyn_cast(Load); assert(Mem); const SDValue &Base = Mem->getBasePtr(); if (Base.getOpcode() == ISD::ADD && Base.getOperand(1).getOpcode() == ISD::SHL && Base.getOperand(1).hasOneUse() && Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) { // The shift can be combined if it matches the size of the value being // loaded (and so reducing the width would make it not match). uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1); uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8; if (ShiftAmount == Log2_32(LoadBytes)) return false; } // We have no reason to disallow reducing the load width, so allow it. return true; } // Truncations from 64-bit GPR to 32-bit GPR is free. bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 > NumBits2; } bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 > NumBits2; } /// Check if it is profitable to hoist instruction in then/else to if. /// Not profitable if I and it's user can form a FMA instruction /// because we prefer FMSUB/FMADD. bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { if (I->getOpcode() != Instruction::FMul) return true; if (!I->hasOneUse()) return true; Instruction *User = I->user_back(); if (User && !(User->getOpcode() == Instruction::FSub || User->getOpcode() == Instruction::FAdd)) return true; const TargetOptions &Options = getTargetMachine().Options; const DataLayout &DL = I->getModule()->getDataLayout(); EVT VT = getValueType(DL, User->getOperand(0)->getType()); return !(isFMAFasterThanFMulAndFAdd(VT) && isOperationLegalOrCustom(ISD::FMA, VT) && (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); } // All 32-bit GPR operations implicitly zero the high-half of the corresponding // 64-bit GPR. bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) { return true; } if (Val.getOpcode() != ISD::LOAD) return false; // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && VT1.getSizeInBits() <= 32); } bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const { if (isa(Ext)) return false; // Vector types are not free. if (Ext->getType()->isVectorTy()) return false; for (const Use &U : Ext->uses()) { // The extension is free if we can fold it with a left shift in an // addressing mode or an arithmetic operation: add, sub, and cmp. // Is there a shift? const Instruction *Instr = cast(U.getUser()); // Is this a constant shift? switch (Instr->getOpcode()) { case Instruction::Shl: if (!isa(Instr->getOperand(1))) return false; break; case Instruction::GetElementPtr: { gep_type_iterator GTI = gep_type_begin(Instr); auto &DL = Ext->getModule()->getDataLayout(); std::advance(GTI, U.getOperandNo()-1); Type *IdxTy = GTI.getIndexedType(); // This extension will end up with a shift because of the scaling factor. // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0. // Get the shift amount based on the scaling factor: // log2(sizeof(IdxTy)) - log2(8). uint64_t ShiftAmt = countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3; // Is the constant foldable in the shift of the addressing mode? // I.e., shift amount is between 1 and 4 inclusive. if (ShiftAmt == 0 || ShiftAmt > 4) return false; break; } case Instruction::Trunc: // Check if this is a noop. // trunc(sext ty1 to ty2) to ty1. if (Instr->getType() == Ext->getOperand(0)->getType()) continue; LLVM_FALLTHROUGH; default: return false; } // At this point we can use the bfm family, so this extension is free // for that use. } return true; } bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const { if (!LoadedType.isSimple() || (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) return false; // Cyclone supports unaligned accesses. RequiredAligment = 0; unsigned NumBits = LoadedType.getSizeInBits(); return NumBits == 32 || NumBits == 64; } /// A helper function for determining the number of interleaved accesses we /// will generate when lowering accesses of the given type. unsigned AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const { return (DL.getTypeSizeInBits(VecTy) + 127) / 128; } MachineMemOperand::Flags AArch64TargetLowering::getMMOFlags(const Instruction &I) const { if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) return MOStridedAccess; return MachineMemOperand::MONone; } bool AArch64TargetLowering::isLegalInterleavedAccessType( VectorType *VecTy, const DataLayout &DL) const { unsigned VecSize = DL.getTypeSizeInBits(VecTy); unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); // Ensure the number of vector elements is greater than 1. if (VecTy->getNumElements() < 2) return false; // Ensure the element type is legal. if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64) return false; // Ensure the total vector size is 64 or a multiple of 128. Types larger than // 128 will be split into multiple interleaved accesses. return VecSize == 64 || VecSize % 128 == 0; } /// Lower an interleaved load into a ldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements /// /// Into: /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr) /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1 bool AArch64TargetLowering::lowerInterleavedLoad( LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); const DataLayout &DL = LI->getModule()->getDataLayout(); VectorType *VecTy = Shuffles[0]->getType(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) return false; unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isPointerTy()) VecTy = VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); IRBuilder<> Builder(LI); // The base address of the load. Value *BaseAddr = LI->getPointerOperand(); if (NumLoads > 1) { // If we're going to generate more than one load, reset the sub-vector type // to something legal. VecTy = VectorType::get(VecTy->getVectorElementType(), VecTy->getVectorNumElements() / NumLoads); // We will compute the pointer operand of each load from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( BaseAddr, VecTy->getVectorElementType()->getPointerTo( LI->getPointerAddressSpace())); } Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace()); Type *Tys[2] = {VecTy, PtrTy}; static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2, Intrinsic::aarch64_neon_ld3, Intrinsic::aarch64_neon_ld4}; Function *LdNFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); // Holds sub-vectors extracted from the load intrinsic return values. The // sub-vectors are associated with the shufflevector instructions they will // replace. DenseMap> SubVecs; for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) BaseAddr = Builder.CreateConstGEP1_32( BaseAddr, VecTy->getVectorNumElements() * Factor); CallInst *LdN = Builder.CreateCall( LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN"); // Extract and store the sub-vectors returned by the load intrinsic. for (unsigned i = 0; i < Shuffles.size(); i++) { ShuffleVectorInst *SVI = Shuffles[i]; unsigned Index = Indices[i]; Value *SubVec = Builder.CreateExtractValue(LdN, Index); // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr( SubVec, VectorType::get(SVI->getType()->getVectorElementType(), VecTy->getVectorNumElements())); SubVecs[SVI].push_back(SubVec); } } // Replace uses of the shufflevector instructions with the sub-vectors // returned by the load intrinsic. If a shufflevector instruction is // associated with more than one sub-vector, those sub-vectors will be // concatenated into a single wide vector. for (ShuffleVectorInst *SVI : Shuffles) { auto &SubVec = SubVecs[SVI]; auto *WideVec = SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; SVI->replaceAllUsesWith(WideVec); } return true; } /// Lower an interleaved store into a stN intrinsic. /// /// E.g. Lower an interleaved store (Factor = 3): /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> /// store <12 x i32> %i.vec, <12 x i32>* %ptr /// /// Into: /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) /// /// Note that the new shufflevectors will be removed and we'll only generate one /// st3 instruction in CodeGen. /// /// Example for a more general valid mask (Factor 3). Lower: /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> /// store <12 x i32> %i.vec, <12 x i32>* %ptr /// /// Into: /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr) bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); VectorType *VecTy = SVI->getType(); assert(VecTy->getVectorNumElements() % Factor == 0 && "Invalid interleaved store"); unsigned LaneLen = VecTy->getVectorNumElements() / Factor; Type *EltTy = VecTy->getVectorElementType(); VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) return false; unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); Value *Op0 = SVI->getOperand(0); Value *Op1 = SVI->getOperand(1); IRBuilder<> Builder(SI); // StN intrinsics don't support pointer vectors as arguments. Convert pointer // vectors to integer vectors. if (EltTy->isPointerTy()) { Type *IntTy = DL.getIntPtrType(EltTy); unsigned NumOpElts = Op0->getType()->getVectorNumElements(); // Convert to the corresponding integer vector. Type *IntVecTy = VectorType::get(IntTy, NumOpElts); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); SubVecTy = VectorType::get(IntTy, LaneLen); } // The base address of the store. Value *BaseAddr = SI->getPointerOperand(); if (NumStores > 1) { // If we're going to generate more than one store, reset the lane length // and sub-vector type to something legal. LaneLen /= NumStores; SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); // We will compute the pointer operand of each store from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( SI->getPointerAddressSpace())); } auto Mask = SVI->getShuffleMask(); Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace()); Type *Tys[2] = {SubVecTy, PtrTy}; static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2, Intrinsic::aarch64_neon_st3, Intrinsic::aarch64_neon_st4}; Function *StNFunc = Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { SmallVector Ops; // Split the shufflevector operands into sub vectors for the new stN call. for (unsigned i = 0; i < Factor; i++) { unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { Ops.push_back(Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); } else { unsigned StartMask = 0; for (unsigned j = 1; j < LaneLen; j++) { unsigned IdxJ = StoreCount * LaneLen * Factor + j; if (Mask[IdxJ * Factor + IdxI] >= 0) { StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; break; } } // Note: Filling undef gaps with random elements is ok, since // those elements were being written anyway (with undefs). // In the case of all undefs we're defaulting to using elems from 0 // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask Ops.push_back(Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } } // If we generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor); Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); Builder.CreateCall(StNFunc, Ops); } return true; } static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, unsigned AlignCheck) { return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && (DstAlign == 0 || DstAlign % AlignCheck == 0)); } EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { // Don't use AdvSIMD to implement 16-byte memset. It would have taken one // instruction to materialize the v2i64 zero and one store (with restrictive // addressing mode). Just do two i64 store of zero-registers. bool Fast; const Function &F = MF.getFunction(); if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && !F.hasFnAttribute(Attribute::NoImplicitFloat) && (memOpAlign(SrcAlign, DstAlign, 16) || (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) return MVT::f128; if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) || (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast))) return MVT::i64; if (Size >= 4 && (memOpAlign(SrcAlign, DstAlign, 4) || (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast))) return MVT::i32; return MVT::Other; } // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { if (Immed == std::numeric_limits::min()) { LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed << ": avoid UB for INT64_MIN\n"); return false; } // Same encoding for add/sub, just flip the sign. Immed = std::abs(Immed); bool IsLegal = ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)); LLVM_DEBUG(dbgs() << "Is " << Immed << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n"); return IsLegal; } // Integer comparisons are implemented with ADDS/SUBS, so the range of valid // immediates is the same as for an add or a sub. bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { return isLegalAddImmediate(Immed); } /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const { // AArch64 has five basic addressing modes: // reg // reg + 9-bit signed offset // reg + SIZE_IN_BYTES * 12-bit unsigned offset // reg1 + reg2 // reg + SIZE_IN_BYTES * reg // No global is ever allowed as a base. if (AM.BaseGV) return false; // No reg+reg+imm addressing. if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) return false; // check reg + imm case: // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 uint64_t NumBytes = 0; if (Ty->isSized()) { uint64_t NumBits = DL.getTypeSizeInBits(Ty); NumBytes = NumBits / 8; if (!isPowerOf2_64(NumBits)) NumBytes = 0; } if (!AM.Scale) { int64_t Offset = AM.BaseOffs; // 9-bit signed offset if (isInt<9>(Offset)) return true; // 12-bit unsigned offset unsigned shift = Log2_64(NumBytes); if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && // Must be a multiple of NumBytes (NumBytes is a power of 2) (Offset >> shift) << shift == Offset) return true; return false; } // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes); } bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const { // Consider splitting large offset of struct or array. return true; } int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { // Scaling factors are not free at all. // Operands | Rt Latency // ------------------------------------------- // Rt, [Xn, Xm] | 4 // ------------------------------------------- // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 // Rt, [Xn, Wm, #imm] | if (isLegalAddressingMode(DL, AM, Ty, AS)) // Scale represents reg2 * scale, thus account for 1 if // it is not equal to 0 or 1. return AM.Scale != 0 && AM.Scale != 1; return -1; } bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { VT = VT.getScalarType(); if (!VT.isSimple()) return false; switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: case MVT::f64: return true; default: break; } return false; } const MCPhysReg * AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { // LR is a callee-save register, but we must treat it as clobbered by any call // site. Hence we include LR in the scratch registers, which are in turn added // as implicit-defs for stackmaps and patchpoints. static const MCPhysReg ScratchRegs[] = { AArch64::X16, AArch64::X17, AArch64::LR, 0 }; return ScratchRegs; } bool -AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { +AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N, + CombineLevel Level) const { + N = N->getOperand(0).getNode(); EVT VT = N->getValueType(0); // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine // it with shift to let it be lowered to UBFX. if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && isa(N->getOperand(1))) { uint64_t TruncMask = N->getConstantOperandVal(1); if (isMask_64(TruncMask) && N->getOperand(0).getOpcode() == ISD::SRL && isa(N->getOperand(0)->getOperand(1))) return false; } return true; } bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0) return false; int64_t Val = Imm.getSExtValue(); if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) return true; if ((int64_t)Val < 0) Val = ~Val; if (BitSize == 32) Val &= (1LL << 32) - 1; unsigned LZ = countLeadingZeros((uint64_t)Val); unsigned Shift = (63 - LZ) / 16; // MOVZ is free so return true for one or fewer MOVK. return Shift < 3; } bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; return (Index == 0 || Index == ResVT.getVectorNumElements()); } /// Turn vector tests of the signbit in the form of: /// xor (sra X, elt_size(X)-1), -1 /// into: /// cmge X, X, #0 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { EVT VT = N->getValueType(0); if (!Subtarget->hasNEON() || !VT.isVector()) return SDValue(); // There must be a shift right algebraic before the xor, and the xor must be a // 'not' operation. SDValue Shift = N->getOperand(0); SDValue Ones = N->getOperand(1); if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() || !ISD::isBuildVectorAllOnes(Ones.getNode())) return SDValue(); // The shift should be smearing the sign bit across each vector element. auto *ShiftAmt = dyn_cast(Shift.getOperand(1)); EVT ShiftEltTy = Shift.getValueType().getVectorElementType(); if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1) return SDValue(); return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0)); } // Generate SUBS and CSEL for integer abs. static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) // and change it to SUB and CSEL. if (VT.isInteger() && N->getOpcode() == ISD::XOR && N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0.getOperand(0)); // Generate SUBS & CSEL. SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), N0.getOperand(0), DAG.getConstant(0, DL, VT)); return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, DAG.getConstant(AArch64CC::PL, DL, MVT::i32), SDValue(Cmp.getNode(), 1)); } return SDValue(); } static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) return Cmp; return performIntegerAbsCombine(N, DAG); } SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const { AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); if (isIntDivCheap(N->getValueType(0), Attr)) return SDValue(N,0); // Lower SDIV as SDIV // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if ((VT != MVT::i32 && VT != MVT::i64) || !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); unsigned Lg2 = Divisor.countTrailingZeros(); SDValue Zero = DAG.getConstant(0, DL, VT); SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT); // Add (N0 < 0) ? Pow2 - 1 : 0; SDValue CCVal; SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); Created.push_back(Cmp.getNode()); Created.push_back(Add.getNode()); Created.push_back(CSel.getNode()); // Divide by pow2. SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64)); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. if (Divisor.isNonNegative()) return SRA; Created.push_back(SRA.getNode()); return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); } static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); // The below optimizations require a constant RHS. if (!isa(N->getOperand(1))) return SDValue(); ConstantSDNode *C = cast(N->getOperand(1)); const APInt &ConstValue = C->getAPIntValue(); // Multiplication of a power of two plus/minus one can be done more // cheaply as as shift+add/sub. For now, this is true unilaterally. If // future CPUs have a cheaper MADD instruction, this may need to be // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and // 64-bit is 5 cycles, so this is always a win. // More aggressively, some multiplications N0 * C can be lowered to // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M, // e.g. 6=3*2=(2+1)*2. // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45 // which equals to (1+2)*16-(1+2). SDValue N0 = N->getOperand(0); // TrailingZeroes is used to test if the mul can be lowered to // shift+add+shift. unsigned TrailingZeroes = ConstValue.countTrailingZeros(); if (TrailingZeroes) { // Conservatively do not lower to shift+add+shift if the mul might be // folded into smul or umul. if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) || isZeroExtended(N0.getNode(), DAG))) return SDValue(); // Conservatively do not lower to shift+add+shift if the mul might be // folded into madd or msub. if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD || N->use_begin()->getOpcode() == ISD::SUB)) return SDValue(); } // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub // and shift+add+shift. APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes); unsigned ShiftAmt, AddSubOpc; // Is the shifted value the LHS operand of the add/sub? bool ShiftValUseIsN0 = true; // Do we need to negate the result? bool NegateResult = false; if (ConstValue.isNonNegative()) { // (mul x, 2^N + 1) => (add (shl x, N), x) // (mul x, 2^N - 1) => (sub (shl x, N), x) // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M) APInt SCVMinus1 = ShiftedConstValue - 1; APInt CVPlus1 = ConstValue + 1; if (SCVMinus1.isPowerOf2()) { ShiftAmt = SCVMinus1.logBase2(); AddSubOpc = ISD::ADD; } else if (CVPlus1.isPowerOf2()) { ShiftAmt = CVPlus1.logBase2(); AddSubOpc = ISD::SUB; } else return SDValue(); } else { // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) // (mul x, -(2^N + 1)) => - (add (shl x, N), x) APInt CVNegPlus1 = -ConstValue + 1; APInt CVNegMinus1 = -ConstValue - 1; if (CVNegPlus1.isPowerOf2()) { ShiftAmt = CVNegPlus1.logBase2(); AddSubOpc = ISD::SUB; ShiftValUseIsN0 = false; } else if (CVNegMinus1.isPowerOf2()) { ShiftAmt = CVNegMinus1.logBase2(); AddSubOpc = ISD::ADD; NegateResult = true; } else return SDValue(); } SDLoc DL(N); EVT VT = N->getValueType(0); SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShiftAmt, DL, MVT::i64)); SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0; SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal; SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1); assert(!(NegateResult && TrailingZeroes) && "NegateResult and TrailingZeroes cannot both be true for now."); // Negate the result. if (NegateResult) return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res); // Shift the result. if (TrailingZeroes) return DAG.getNode(ISD::SHL, DL, VT, Res, DAG.getConstant(TrailingZeroes, DL, MVT::i64)); return Res; } static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, SelectionDAG &DAG) { // Take advantage of vector comparisons producing 0 or -1 in each lane to // optimize away operation when it's from a constant. // // The general transformation is: // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> // AND(VECTOR_CMP(x,y), constant2) // constant2 = UNARYOP(constant) // Early exit if this isn't a vector operation, the operand of the // unary operation isn't a bitwise AND, or if the sizes of the operations // aren't the same. EVT VT = N->getValueType(0); if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) return SDValue(); // Now check that the other operand of the AND is a constant. We could // make the transformation for non-constant splats as well, but it's unclear // that would be a benefit as it would not eliminate any operations, just // perform one more step in scalar code before moving to the vector unit. if (BuildVectorSDNode *BV = dyn_cast(N->getOperand(0)->getOperand(1))) { // Bail out if the vector isn't a constant. if (!BV->isConstant()) return SDValue(); // Everything checks out. Build up the new and improved node. SDLoc DL(N); EVT IntVT = BV->getValueType(0); // Create a new constant of the appropriate type for the transformed // DAG. SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); // The AND node needs bitcasts to/from an integer vector type around it. SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, N->getOperand(0)->getOperand(0), MaskConst); SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); return Res; } return SDValue(); } static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { // First try to optimize away the conversion when it's conditionally from // a constant. Vectors only. if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG)) return Res; EVT VT = N->getValueType(0); if (VT != MVT::f32 && VT != MVT::f64) return SDValue(); // Only optimize when the source and destination types have the same width. if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits()) return SDValue(); // If the result of an integer load is only used by an integer-to-float // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead. // This eliminates an "integer-to-vector-move" UOP and improves throughput. SDValue N0 = N->getOperand(0); if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && // Do not change the width of a volatile load. !cast(N0)->isVolatile()) { LoadSDNode *LN0 = cast(N0); SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), LN0->getPointerInfo(), LN0->getAlignment(), LN0->getMemOperand()->getFlags()); // Make sure successors of the original load stay after it by updating them // to use the new Chain. DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); unsigned Opcode = (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; return DAG.getNode(Opcode, SDLoc(N), VT, Load); } return SDValue(); } /// Fold a floating-point multiply by power of two into floating-point to /// fixed-point conversion. static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL) return SDValue(); SDValue ConstVec = Op->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); uint32_t FloatBits = FloatTy.getSizeInBits(); if (FloatBits != 32 && FloatBits != 64) return SDValue(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); if (IntBits != 16 && IntBits != 32 && IntBits != 64) return SDValue(); // Avoid conversions where iN is larger than the float (e.g., float -> i64). if (IntBits > FloatBits) return SDValue(); BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t Bits = IntBits == 64 ? 64 : 32; int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1); if (C == -1 || C == 0 || C > Bits) return SDValue(); MVT ResTy; unsigned NumLanes = Op.getValueType().getVectorNumElements(); switch (NumLanes) { default: return SDValue(); case 2: ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) return SDValue(); assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) && "Illegal vector type after legalization"); SDLoc DL(N); bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs : Intrinsic::aarch64_neon_vcvtfp2fxu; SDValue FixConv = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy, DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32)); // We can handle smaller integers by generating an extra trunc. if (IntBits < FloatBits) FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv); return FixConv; } /// Fold a floating-point divide by power of two into fixed-point to /// floating-point conversion. static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); unsigned Opc = Op->getOpcode(); if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || !Op.getOperand(0).getValueType().isSimple() || (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP)) return SDValue(); SDValue ConstVec = N->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); int32_t IntBits = IntTy.getSizeInBits(); if (IntBits != 16 && IntBits != 32 && IntBits != 64) return SDValue(); MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); int32_t FloatBits = FloatTy.getSizeInBits(); if (FloatBits != 32 && FloatBits != 64) return SDValue(); // Avoid conversions where iN is larger than the float (e.g., i64 -> float). if (IntBits > FloatBits) return SDValue(); BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1); if (C == -1 || C == 0 || C > FloatBits) return SDValue(); MVT ResTy; unsigned NumLanes = Op.getValueType().getVectorNumElements(); switch (NumLanes) { default: return SDValue(); case 2: ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64; break; case 4: ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64; break; } if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps()) return SDValue(); SDLoc DL(N); SDValue ConvInput = Op.getOperand(0); bool IsSigned = Opc == ISD::SINT_TO_FP; if (IntBits < FloatBits) ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, ResTy, ConvInput); unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp : Intrinsic::aarch64_neon_vcvtfxu2fp; return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput, DAG.getConstant(C, DL, MVT::i32)); } /// An EXTR instruction is made up of two shifts, ORed together. This helper /// searches for and classifies those shifts. static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, bool &FromHi) { if (N.getOpcode() == ISD::SHL) FromHi = false; else if (N.getOpcode() == ISD::SRL) FromHi = true; else return false; if (!isa(N.getOperand(1))) return false; ShiftAmount = N->getConstantOperandVal(1); Src = N->getOperand(0); return true; } /// EXTR instruction extracts a contiguous chunk of bits from two existing /// registers viewed as a high/low pair. This function looks for the pattern: /// (or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N)) and replaces it /// with an EXTR. Can't quite be done in TableGen because the two immediates /// aren't independent. static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); EVT VT = N->getValueType(0); assert(N->getOpcode() == ISD::OR && "Unexpected root"); if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); SDValue LHS; uint32_t ShiftLHS = 0; bool LHSFromHi = false; if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) return SDValue(); SDValue RHS; uint32_t ShiftRHS = 0; bool RHSFromHi = false; if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) return SDValue(); // If they're both trying to come from the high part of the register, they're // not really an EXTR. if (LHSFromHi == RHSFromHi) return SDValue(); if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) return SDValue(); if (LHSFromHi) { std::swap(LHS, RHS); std::swap(ShiftLHS, ShiftRHS); } return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, DAG.getConstant(ShiftRHS, DL, MVT::i64)); } static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); if (!VT.isVector()) return SDValue(); SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::AND) return SDValue(); SDValue N1 = N->getOperand(1); if (N1.getOpcode() != ISD::AND) return SDValue(); // We only have to look for constant vectors here since the general, variable // case can be handled in TableGen. unsigned Bits = VT.getScalarSizeInBits(); uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); for (int i = 1; i >= 0; --i) for (int j = 1; j >= 0; --j) { BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); if (!BVN0 || !BVN1) continue; bool FoundMatch = true; for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); if (!CN0 || !CN1 || CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { FoundMatch = false; break; } } if (FoundMatch) return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), N0->getOperand(1 - i), N1->getOperand(1 - j)); } return SDValue(); } static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); if (SDValue Res = tryCombineToEXTR(N, DCI)) return Res; if (SDValue Res = tryCombineToBSL(N, DCI)) return Res; return SDValue(); } static SDValue performSRLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); // Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the // high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32) // to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. SDValue N0 = N->getOperand(0); if (N0.getOpcode() == ISD::BSWAP) { SDLoc DL(N); SDValue N1 = N->getOperand(1); SDValue N00 = N0.getOperand(0); if (ConstantSDNode *C = dyn_cast(N1)) { uint64_t ShiftAmt = C->getZExtValue(); if (VT == MVT::i32 && ShiftAmt == 16 && DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16))) return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); if (VT == MVT::i64 && ShiftAmt == 32 && DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32))) return DAG.getNode(ISD::ROTR, DL, VT, N0, N1); } } return SDValue(); } static SDValue performBitcastCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // Remove extraneous bitcasts around an extract_subvector. // For example, // (v4i16 (bitconvert // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) // becomes // (extract_subvector ((v8i16 ...), (i64 4))) // Only interested in 64-bit vectors as the ultimate result. EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); if (VT.getSimpleVT().getSizeInBits() != 64) return SDValue(); // Is the operand an extract_subvector starting at the beginning or halfway // point of the vector? A low half may also come through as an // EXTRACT_SUBREG, so look for that, too. SDValue Op0 = N->getOperand(0); if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && !(Op0->isMachineOpcode() && Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) return SDValue(); uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) return SDValue(); } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { if (idx != AArch64::dsub) return SDValue(); // The dsub reference is equivalent to a lane zero subvector reference. idx = 0; } // Look through the bitcast of the input to the extract. if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) return SDValue(); SDValue Source = Op0->getOperand(0)->getOperand(0); // If the source type has twice the number of elements as our destination // type, we know this is an extract of the high or low half of the vector. EVT SVT = Source->getValueType(0); if (!SVT.isVector() || SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) return SDValue(); LLVM_DEBUG( dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); // Create the simplified form to just extract the low or high half of the // vector directly rather than bothering with the bitcasts. SDLoc dl(N); unsigned NumElements = VT.getVectorNumElements(); if (idx) { SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); } else { SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, Source, SubReg), 0); } } static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { SDLoc dl(N); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0), N1 = N->getOperand(1); // Optimize concat_vectors of truncated vectors, where the intermediate // type is illegal, to avoid said illegality, e.g., // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))), // (v2i16 (truncate (v2i64))))) // -> // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))), // (v4i32 (bitcast (v2i64))), // <0, 2, 4, 6>))) // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed // on both input and result type, so we might generate worse code. // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8. if (N->getNumOperands() == 2 && N0->getOpcode() == ISD::TRUNCATE && N1->getOpcode() == ISD::TRUNCATE) { SDValue N00 = N0->getOperand(0); SDValue N10 = N1->getOperand(0); EVT N00VT = N00.getValueType(); if (N00VT == N10.getValueType() && (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) && N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) { MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16); SmallVector Mask(MidVT.getVectorNumElements()); for (size_t i = 0; i < Mask.size(); ++i) Mask[i] = i * 2; return DAG.getNode(ISD::TRUNCATE, dl, VT, DAG.getVectorShuffle( MidVT, dl, DAG.getNode(ISD::BITCAST, dl, MidVT, N00), DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask)); } } // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. if (N0 == N1 && VT.getVectorNumElements() == 2) { assert(VT.getScalarSizeInBits() == 64); return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG), DAG.getConstant(0, dl, MVT::i64)); } // Canonicalise concat_vectors so that the right-hand vector has as few // bit-casts as possible before its real operation. The primary matching // destination for these operations will be the narrowing "2" instructions, // which depend on the operation being performed on this right-hand vector. // For example, // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) if (N1->getOpcode() != ISD::BITCAST) return SDValue(); SDValue RHS = N1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); // If the RHS is not a vector, this is not the pattern we're looking for. if (!RHSTy.isVector()) return SDValue(); LLVM_DEBUG( dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), RHSTy.getVectorNumElements() * 2); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, DAG.getNode(ISD::BITCAST, dl, RHSTy, N0), RHS)); } static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Wait until after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // Transform a scalar conversion of a value from a lane extract into a // lane extract of a vector conversion. E.g., from foo1 to foo2: // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } // // The second form interacts better with instruction selection and the // register allocator to avoid cross-class register copies that aren't // coalescable due to a lane reference. // Check the operand and see if it originates from a lane extract. SDValue Op1 = N->getOperand(1); if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { // Yep, no additional predication needed. Perform the transform. SDValue IID = N->getOperand(0); SDValue Shift = N->getOperand(2); SDValue Vec = Op1.getOperand(0); SDValue Lane = Op1.getOperand(1); EVT ResTy = N->getValueType(0); EVT VecResTy; SDLoc DL(N); // The vector width should be 128 bits by the time we get here, even // if it started as 64 bits (the extract_vector handling will have // done so). assert(Vec.getValueSizeInBits() == 128 && "unexpected vector size on extract_vector_elt!"); if (Vec.getValueType() == MVT::v4i32) VecResTy = MVT::v4f32; else if (Vec.getValueType() == MVT::v2i64) VecResTy = MVT::v2f64; else llvm_unreachable("unexpected vector type!"); SDValue Convert = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); } return SDValue(); } // AArch64 high-vector "long" operations are formed by performing the non-high // version on an extract_subvector of each operand which gets the high half: // // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) // // However, there are cases which don't have an extract_high explicitly, but // have another operation that can be made compatible with one for free. For // example: // // (dupv64 scalar) --> (extract_high (dup128 scalar)) // // This routine does the actual conversion of such DUPs, once outer routines // have determined that everything else is in order. // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold // similarly here. static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { switch (N.getOpcode()) { case AArch64ISD::DUP: case AArch64ISD::DUPLANE8: case AArch64ISD::DUPLANE16: case AArch64ISD::DUPLANE32: case AArch64ISD::DUPLANE64: case AArch64ISD::MOVI: case AArch64ISD::MOVIshift: case AArch64ISD::MOVIedit: case AArch64ISD::MOVImsl: case AArch64ISD::MVNIshift: case AArch64ISD::MVNImsl: break; default: // FMOV could be supported, but isn't very useful, as it would only occur // if you passed a bitcast' floating point immediate to an eligible long // integer op (addl, smull, ...). return SDValue(); } MVT NarrowTy = N.getSimpleValueType(); if (!NarrowTy.is64BitVector()) return SDValue(); MVT ElementTy = NarrowTy.getVectorElementType(); unsigned NumElems = NarrowTy.getVectorNumElements(); MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2); SDLoc dl(N); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy, DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()), DAG.getConstant(NumElems, dl, MVT::i64)); } static bool isEssentiallyExtractSubvector(SDValue N) { if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) return true; return N.getOpcode() == ISD::BITCAST && N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; } /// Helper structure to keep track of ISD::SET_CC operands. struct GenericSetCCInfo { const SDValue *Opnd0; const SDValue *Opnd1; ISD::CondCode CC; }; /// Helper structure to keep track of a SET_CC lowered into AArch64 code. struct AArch64SetCCInfo { const SDValue *Cmp; AArch64CC::CondCode CC; }; /// Helper structure to keep track of SetCC information. union SetCCInfo { GenericSetCCInfo Generic; AArch64SetCCInfo AArch64; }; /// Helper structure to be able to read SetCC information. If set to /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a /// GenericSetCCInfo. struct SetCCInfoAndKind { SetCCInfo Info; bool IsAArch64; }; /// Check whether or not \p Op is a SET_CC operation, either a generic or /// an /// AArch64 lowered one. /// \p SetCCInfo is filled accordingly. /// \post SetCCInfo is meanginfull only when this function returns true. /// \return True when Op is a kind of SET_CC operation. static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { // If this is a setcc, this is straight forward. if (Op.getOpcode() == ISD::SETCC) { SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); SetCCInfo.Info.Generic.CC = cast(Op.getOperand(2))->get(); SetCCInfo.IsAArch64 = false; return true; } // Otherwise, check if this is a matching csel instruction. // In other words: // - csel 1, 0, cc // - csel 0, 1, !cc if (Op.getOpcode() != AArch64ISD::CSEL) return false; // Set the information about the operands. // TODO: we want the operands of the Cmp not the csel SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); SetCCInfo.IsAArch64 = true; SetCCInfo.Info.AArch64.CC = static_cast( cast(Op.getOperand(2))->getZExtValue()); // Check that the operands matches the constraints: // (1) Both operands must be constants. // (2) One must be 1 and the other must be 0. ConstantSDNode *TValue = dyn_cast(Op.getOperand(0)); ConstantSDNode *FValue = dyn_cast(Op.getOperand(1)); // Check (1). if (!TValue || !FValue) return false; // Check (2). if (!TValue->isOne()) { // Update the comparison when we are interested in !cc. std::swap(TValue, FValue); SetCCInfo.Info.AArch64.CC = AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); } return TValue->isOne() && FValue->isNullValue(); } // Returns true if Op is setcc or zext of setcc. static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { if (isSetCC(Op, Info)) return true; return ((Op.getOpcode() == ISD::ZERO_EXTEND) && isSetCC(Op->getOperand(0), Info)); } // The folding we want to perform is: // (add x, [zext] (setcc cc ...) ) // --> // (csel x, (add x, 1), !cc ...) // // The latter will get matched to a CSINC instruction. static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); SDValue LHS = Op->getOperand(0); SDValue RHS = Op->getOperand(1); SetCCInfoAndKind InfoAndKind; // If neither operand is a SET_CC, give up. if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { std::swap(LHS, RHS); if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) return SDValue(); } // FIXME: This could be generatized to work for FP comparisons. EVT CmpVT = InfoAndKind.IsAArch64 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() : InfoAndKind.Info.Generic.Opnd0->getValueType(); if (CmpVT != MVT::i32 && CmpVT != MVT::i64) return SDValue(); SDValue CCVal; SDValue Cmp; SDLoc dl(Op); if (InfoAndKind.IsAArch64) { CCVal = DAG.getConstant( AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl, MVT::i32); Cmp = *InfoAndKind.Info.AArch64.Cmp; } else Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1, ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), CCVal, DAG, dl); EVT VT = Op->getValueType(0); LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT)); return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); } // The basic add/sub long vector instructions have variants with "2" on the end // which act on the high-half of their inputs. They are normally matched by // patterns like: // // (add (zeroext (extract_high LHS)), // (zeroext (extract_high RHS))) // -> uaddl2 vD, vN, vM // // However, if one of the extracts is something like a duplicate, this // instruction can still be used profitably. This function puts the DAG into a // more appropriate form for those patterns to trigger. static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT VT = N->getSimpleValueType(0); if (!VT.is128BitVector()) { if (N->getOpcode() == ISD::ADD) return performSetccAddFolding(N, DAG); return SDValue(); } // Make sure both branches are extended in the same way. SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if ((LHS.getOpcode() != ISD::ZERO_EXTEND && LHS.getOpcode() != ISD::SIGN_EXTEND) || LHS.getOpcode() != RHS.getOpcode()) return SDValue(); unsigned ExtType = LHS.getOpcode(); // It's not worth doing if at least one of the inputs isn't already an // extract, but we don't know which it'll be so we have to try both. if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); if (!RHS.getNode()) return SDValue(); RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); if (!LHS.getNode()) return SDValue(); LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); } return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); } // Massage DAGs which we can use the high-half "long" operations on into // something isel will recognize better. E.g. // // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> // (aarch64_neon_umull (extract_high (v2i64 vec))) // (extract_high (v2i64 (dup128 scalar))))) // static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); assert(LHS.getValueType().is64BitVector() && RHS.getValueType().is64BitVector() && "unexpected shape for long operation"); // Either node could be a DUP, but it's not worth doing both of them (you'd // just as well use the non-high version) so look for a corresponding extract // operation on the other "wing". if (isEssentiallyExtractSubvector(LHS)) { RHS = tryExtendDUPToExtractHigh(RHS, DAG); if (!RHS.getNode()) return SDValue(); } else if (isEssentiallyExtractSubvector(RHS)) { LHS = tryExtendDUPToExtractHigh(LHS, DAG); if (!LHS.getNode()) return SDValue(); } return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), N->getOperand(0), LHS, RHS); } static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { MVT ElemTy = N->getSimpleValueType(0).getScalarType(); unsigned ElemBits = ElemTy.getSizeInBits(); int64_t ShiftAmount; if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, ElemBits) || SplatBitSize != ElemBits) return SDValue(); ShiftAmount = SplatValue.getSExtValue(); } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { ShiftAmount = CVN->getSExtValue(); } else return SDValue(); unsigned Opcode; bool IsRightShift; switch (IID) { default: llvm_unreachable("Unknown shift intrinsic"); case Intrinsic::aarch64_neon_sqshl: Opcode = AArch64ISD::SQSHL_I; IsRightShift = false; break; case Intrinsic::aarch64_neon_uqshl: Opcode = AArch64ISD::UQSHL_I; IsRightShift = false; break; case Intrinsic::aarch64_neon_srshl: Opcode = AArch64ISD::SRSHR_I; IsRightShift = true; break; case Intrinsic::aarch64_neon_urshl: Opcode = AArch64ISD::URSHR_I; IsRightShift = true; break; case Intrinsic::aarch64_neon_sqshlu: Opcode = AArch64ISD::SQSHLU_I; IsRightShift = false; break; } if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) { SDLoc dl(N); return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), DAG.getConstant(-ShiftAmount, dl, MVT::i32)); } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) { SDLoc dl(N); return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1), DAG.getConstant(ShiftAmount, dl, MVT::i32)); } return SDValue(); } // The CRC32[BH] instructions ignore the high bits of their data operand. Since // the intrinsics must be legal and take an i32, this means there's almost // certainly going to be a zext in the DAG which we can eliminate. static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { SDValue AndN = N->getOperand(2); if (AndN.getOpcode() != ISD::AND) return SDValue(); ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); if (!CMask || CMask->getZExtValue() != Mask) return SDValue(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); } static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, SelectionDAG &DAG) { SDLoc dl(N); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), DAG.getNode(Opc, dl, N->getOperand(1).getSimpleValueType(), N->getOperand(1)), DAG.getConstant(0, dl, MVT::i64)); } static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; unsigned IID = getIntrinsicID(N); switch (IID) { default: break; case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); case Intrinsic::aarch64_neon_saddv: return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG); case Intrinsic::aarch64_neon_uaddv: return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG); case Intrinsic::aarch64_neon_sminv: return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG); case Intrinsic::aarch64_neon_uminv: return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG); case Intrinsic::aarch64_neon_smaxv: return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG); case Intrinsic::aarch64_neon_umaxv: return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG); case Intrinsic::aarch64_neon_fmax: return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmin: return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmaxnm: return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fminnm: return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: case Intrinsic::aarch64_neon_pmull: case Intrinsic::aarch64_neon_sqdmull: return tryCombineLongOpWithDup(IID, N, DCI, DAG); case Intrinsic::aarch64_neon_sqshl: case Intrinsic::aarch64_neon_uqshl: case Intrinsic::aarch64_neon_sqshlu: case Intrinsic::aarch64_neon_srshl: case Intrinsic::aarch64_neon_urshl: return tryCombineShiftImm(IID, N, DAG); case Intrinsic::aarch64_crc32b: case Intrinsic::aarch64_crc32cb: return tryCombineCRC32(0xff, N, DAG); case Intrinsic::aarch64_crc32h: case Intrinsic::aarch64_crc32ch: return tryCombineCRC32(0xffff, N, DAG); } return SDValue(); } static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then // we can convert that DUP into another extract_high (of a bigger DUP), which // helps the backend to decide that an sabdl2 would be useful, saving a real // extract_high operation. if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { SDNode *ABDNode = N->getOperand(0).getNode(); unsigned IID = getIntrinsicID(ABDNode); if (IID == Intrinsic::aarch64_neon_sabd || IID == Intrinsic::aarch64_neon_uabd) { SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); if (!NewABD.getNode()) return SDValue(); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); } } // This is effectively a custom type legalization for AArch64. // // Type legalization will split an extend of a small, legal, type to a larger // illegal type by first splitting the destination type, often creating // illegal source types, which then get legalized in isel-confusing ways, // leading to really terrible codegen. E.g., // %result = v8i32 sext v8i8 %value // becomes // %losrc = extract_subreg %value, ... // %hisrc = extract_subreg %value, ... // %lo = v4i32 sext v4i8 %losrc // %hi = v4i32 sext v4i8 %hisrc // Things go rapidly downhill from there. // // For AArch64, the [sz]ext vector instructions can only go up one element // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 // take two instructions. // // This implies that the most efficient way to do the extend from v8i8 // to two v4i32 values is to first extend the v8i8 to v8i16, then do // the normal splitting to happen for the v8i16->v8i32. // This is pre-legalization to catch some cases where the default // type legalization will create ill-tempered code. if (!DCI.isBeforeLegalizeOps()) return SDValue(); // We're only interested in cleaning things up for non-legal vector types // here. If both the source and destination are legal, things will just // work naturally without any fiddling. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT ResVT = N->getValueType(0); if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) return SDValue(); // If the vector type isn't a simple VT, it's beyond the scope of what // we're worried about here. Let legalization do its thing and hope for // the best. SDValue Src = N->getOperand(0); EVT SrcVT = Src->getValueType(0); if (!ResVT.isSimple() || !SrcVT.isSimple()) return SDValue(); // If the source VT is a 64-bit vector, we can play games and get the // better results we want. if (SrcVT.getSizeInBits() != 64) return SDValue(); unsigned SrcEltSize = SrcVT.getScalarSizeInBits(); unsigned ElementCount = SrcVT.getVectorNumElements(); SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); SDLoc DL(N); Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); // Now split the rest of the operation into two halves, each with a 64 // bit source. EVT LoVT, HiVT; SDValue Lo, Hi; unsigned NumElements = ResVT.getVectorNumElements(); assert(!(NumElements & 1) && "Splitting vector, but not in half!"); LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), NumElements / 2); EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), LoVT.getVectorNumElements()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, DAG.getConstant(0, DL, MVT::i64)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64)); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); // Now combine the parts back together so we still have a single result // like the combiner expects. return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); } static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, SDValue SplatVal, unsigned NumVecElts) { unsigned OrigAlignment = St.getAlignment(); unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8; // Create scalar stores. This is at least as good as the code sequence for a // split unaligned store which is a dup.s, ext.b, and two stores. // Most of the time the three stores should be replaced by store pair // instructions (stp). SDLoc DL(&St); SDValue BasePtr = St.getBasePtr(); uint64_t BaseOffset = 0; const MachinePointerInfo &PtrInfo = St.getPointerInfo(); SDValue NewST1 = DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, OrigAlignment, St.getMemOperand()->getFlags()); // As this in ISel, we will not merge this add which may degrade results. if (BasePtr->getOpcode() == ISD::ADD && isa(BasePtr->getOperand(1))) { BaseOffset = cast(BasePtr->getOperand(1))->getSExtValue(); BasePtr = BasePtr->getOperand(0); } unsigned Offset = EltOffset; while (--NumVecElts) { unsigned Alignment = MinAlign(OrigAlignment, Offset); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, PtrInfo.getWithOffset(Offset), Alignment, St.getMemOperand()->getFlags()); Offset += EltOffset; } return NewST1; } /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The /// load store optimizer pass will merge them to store pair stores. This should /// be better than a movi to create the vector zero followed by a vector store /// if the zero constant is not re-used, since one instructions and one register /// live range will be removed. /// /// For example, the final generated code should be: /// /// stp xzr, xzr, [x0] /// /// instead of: /// /// movi v0.2d, #0 /// str q0, [x0] /// static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) { SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or // 2, 3 or 4 i32 elements. int NumVecElts = VT.getVectorNumElements(); if (!(((NumVecElts == 2 || NumVecElts == 3) && VT.getVectorElementType().getSizeInBits() == 64) || ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) && VT.getVectorElementType().getSizeInBits() == 32))) return SDValue(); if (StVal.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // If the zero constant has more than one use then the vector store could be // better since the constant mov will be amortized and stp q instructions // should be able to be formed. if (!StVal.hasOneUse()) return SDValue(); // If the immediate offset of the address operand is too large for the stp // instruction, then bail out. if (DAG.isBaseWithConstantOffset(St.getBasePtr())) { int64_t Offset = St.getBasePtr()->getConstantOperandVal(1); if (Offset < -512 || Offset > 504) return SDValue(); } for (int I = 0; I < NumVecElts; ++I) { SDValue EltVal = StVal.getOperand(I); if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal)) return SDValue(); } // Use a CopyFromReg WZR/XZR here to prevent // DAGCombiner::MergeConsecutiveStores from undoing this transformation. SDLoc DL(&St); unsigned ZeroReg; EVT ZeroVT; if (VT.getVectorElementType().getSizeInBits() == 32) { ZeroReg = AArch64::WZR; ZeroVT = MVT::i32; } else { ZeroReg = AArch64::XZR; ZeroVT = MVT::i64; } SDValue SplatVal = DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT); return splitStoreSplat(DAG, St, SplatVal, NumVecElts); } /// Replace a splat of a scalar to a vector store by scalar stores of the scalar /// value. The load store optimizer pass will merge them to store pair stores. /// This has better performance than a splat of the scalar followed by a split /// vector store. Even if the stores are not merged it is four stores vs a dup, /// followed by an ext.b and two stores. static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) { SDValue StVal = St.getValue(); EVT VT = StVal.getValueType(); // Don't replace floating point stores, they possibly won't be transformed to // stp because of the store pair suppress pass. if (VT.isFloatingPoint()) return SDValue(); // We can express a splat as store pair(s) for 2 or 4 elements. unsigned NumVecElts = VT.getVectorNumElements(); if (NumVecElts != 4 && NumVecElts != 2) return SDValue(); // Check that this is a splat. // Make sure that each of the relevant vector element locations are inserted // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32. std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1); SDValue SplatVal; for (unsigned I = 0; I < NumVecElts; ++I) { // Check for insert vector elements. if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) return SDValue(); // Check that same value is inserted at each vector element. if (I == 0) SplatVal = StVal.getOperand(1); else if (StVal.getOperand(1) != SplatVal) return SDValue(); // Check insert element index. ConstantSDNode *CIndex = dyn_cast(StVal.getOperand(2)); if (!CIndex) return SDValue(); uint64_t IndexVal = CIndex->getZExtValue(); if (IndexVal >= NumVecElts) return SDValue(); IndexNotInserted.reset(IndexVal); StVal = StVal.getOperand(0); } // Check that all vector element locations were inserted to. if (IndexNotInserted.any()) return SDValue(); return splitStoreSplat(DAG, St, SplatVal, NumVecElts); } static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { StoreSDNode *S = cast(N); if (S->isVolatile() || S->isIndexed()) return SDValue(); SDValue StVal = S->getValue(); EVT VT = StVal.getValueType(); if (!VT.isVector()) return SDValue(); // If we get a splat of zeros, convert this vector store to a store of // scalars. They will be merged into store pairs of xzr thereby removing one // instruction and one register. if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S)) return ReplacedZeroSplat; // FIXME: The logic for deciding if an unaligned store should be split should // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be // a call to that function here. if (!Subtarget->isMisaligned128StoreSlow()) return SDValue(); // Don't split at -Oz. if (DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); // Don't split v2i64 vectors. Memcpy lowering produces those and splitting // those up regresses performance on micro-benchmarks and olden/bh. if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64) return SDValue(); // Split unaligned 16B stores. They are terrible for performance. // Don't split stores with alignment of 1 or 2. Code that uses clang vector // extensions can use this to mark that it does not want splitting to happen // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of // eliminating alignment hazards is only 1 in 8 for alignment of 2. if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || S->getAlignment() <= 2) return SDValue(); // If we get a splat of a scalar convert this vector store to a store of // scalars. They will be merged into store pairs thereby removing two // instructions. if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S)) return ReplacedSplat; SDLoc DL(S); unsigned NumElts = VT.getVectorNumElements() / 2; // Split VT into two. EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(0, DL, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(NumElts, DL, MVT::i64)); SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), S->getAlignment(), S->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(8, DL, MVT::i64)); return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, S->getPointerInfo(), S->getAlignment(), S->getMemOperand()->getFlags()); } /// Target-specific DAG combine function for post-increment LD1 (lane) and /// post-increment LD1R. static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); unsigned LoadIdx = IsLaneOp ? 1 : 0; SDNode *LD = N->getOperand(LoadIdx).getNode(); // If it is not LOAD, can not do such combine. if (LD->getOpcode() != ISD::LOAD) return SDValue(); // The vector lane must be a constant in the LD1LANE opcode. SDValue Lane; if (IsLaneOp) { Lane = N->getOperand(2); auto *LaneC = dyn_cast(Lane); if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); } LoadSDNode *LoadSDN = cast(LD); EVT MemVT = LoadSDN->getMemoryVT(); // Check if memory operand is the same type as the vector element. if (MemVT != VT.getVectorElementType()) return SDValue(); // Check if there are other uses. If so, do not combine as it will introduce // an extra load. for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; ++UI) { if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. continue; if (*UI != N) return SDValue(); } SDValue Addr = LD->getOperand(1); SDValue Vector = N->getOperand(0); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // Check that the add is independent of the load. Otherwise, folding it // would create a cycle. if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) continue; // Also check that add is not used in the vector operand. This would also // create a cycle. if (User->isPredecessorOf(Vector.getNode())) continue; // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { uint32_t IncVal = CInc->getZExtValue(); unsigned NumBytes = VT.getScalarSizeInBits() / 8; if (IncVal != NumBytes) continue; Inc = DAG.getRegister(AArch64::XZR, MVT::i64); } // Finally, check that the vector doesn't depend on the load. // Again, this would create a cycle. // The load depending on the vector is fine, as that's the case for the // LD1*post we'll eventually generate anyway. if (LoadSDN->isPredecessorOf(Vector.getNode())) continue; SmallVector Ops; Ops.push_back(LD->getOperand(0)); // Chain if (IsLaneOp) { Ops.push_back(Vector); // The vector to be inserted Ops.push_back(Lane); // The lane to be inserted in the vector } Ops.push_back(Addr); Ops.push_back(Inc); EVT Tys[3] = { VT, MVT::i64, MVT::Other }; SDVTList SDTys = DAG.getVTList(Tys); unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, MemVT, LoadSDN->getMemOperand()); // Update the uses. SDValue NewResults[] = { SDValue(LD, 0), // The result of load SDValue(UpdN.getNode(), 2) // Chain }; DCI.CombineTo(LD, NewResults); DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register break; } return SDValue(); } /// Simplify ``Addr`` given that the top byte of it is ignored by HW during /// address translation. static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { APInt DemandedMask = APInt::getLowBitsSet(64, 56); KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) { DCI.CommitTargetLoweringOpt(TLO); return true; } return false; } static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { if (SDValue Split = splitStores(N, DCI, DAG, Subtarget)) return Split; if (Subtarget->supportsAddressTopByteIgnored() && performTBISimplification(N->getOperand(2), DCI, DAG)) return SDValue(N, 0); return SDValue(); } /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); unsigned AddrOpIdx = N->getNumOperands() - 1; SDValue Addr = N->getOperand(AddrOpIdx); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // Check that the add is independent of the load/store. Otherwise, folding // it would create a cycle. if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) continue; // Find the new opcode for the updating load/store. bool IsStore = false; bool IsLaneOp = false; bool IsDupOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default: llvm_unreachable("unexpected intrinsic for Neon base update"); case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; NumVecs = 2; break; case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; NumVecs = 3; break; case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; NumVecs = 4; break; case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; NumVecs = 2; IsStore = true; break; case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; NumVecs = 3; IsStore = true; break; case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; NumVecs = 4; IsStore = true; break; case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; NumVecs = 2; break; case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; NumVecs = 3; break; case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; NumVecs = 4; break; case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; NumVecs = 2; IsStore = true; break; case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; NumVecs = 3; IsStore = true; break; case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; NumVecs = 4; IsStore = true; break; case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; NumVecs = 2; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; NumVecs = 3; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; NumVecs = 4; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; NumVecs = 2; IsLaneOp = true; break; case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; NumVecs = 3; IsLaneOp = true; break; case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; NumVecs = 4; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; NumVecs = 2; IsStore = true; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; NumVecs = 3; IsStore = true; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; NumVecs = 4; IsStore = true; IsLaneOp = true; break; } EVT VecTy; if (IsStore) VecTy = N->getOperand(2).getValueType(); else VecTy = N->getValueType(0); // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { uint32_t IncVal = CInc->getZExtValue(); unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (IsLaneOp || IsDupOp) NumBytes /= VecTy.getVectorNumElements(); if (IncVal != NumBytes) continue; Inc = DAG.getRegister(AArch64::XZR, MVT::i64); } SmallVector Ops; Ops.push_back(N->getOperand(0)); // Incoming chain // Load lane and store have vector list as input. if (IsLaneOp || IsStore) for (unsigned i = 2; i < AddrOpIdx; ++i) Ops.push_back(N->getOperand(i)); Ops.push_back(Addr); // Base register Ops.push_back(Inc); // Return Types. EVT Tys[6]; unsigned NumResultVecs = (IsStore ? 0 : NumVecs); unsigned n; for (n = 0; n < NumResultVecs; ++n) Tys[n] = VecTy; Tys[n++] = MVT::i64; // Type of write back register Tys[n] = MVT::Other; // Type of the chain SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); MemIntrinsicSDNode *MemInt = cast(N); SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, MemInt->getMemoryVT(), MemInt->getMemOperand()); // Update the uses. std::vector NewResults; for (unsigned i = 0; i < NumResultVecs; ++i) { NewResults.push_back(SDValue(UpdN.getNode(), i)); } NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); break; } return SDValue(); } // Checks to see if the value is the prescribed width and returns information // about its extension mode. static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { ExtType = ISD::NON_EXTLOAD; switch(V.getNode()->getOpcode()) { default: return false; case ISD::LOAD: { LoadSDNode *LoadNode = cast(V.getNode()); if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { ExtType = LoadNode->getExtensionType(); return true; } return false; } case ISD::AssertSext: { VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); if ((TypeNode->getVT() == MVT::i8 && width == 8) || (TypeNode->getVT() == MVT::i16 && width == 16)) { ExtType = ISD::SEXTLOAD; return true; } return false; } case ISD::AssertZext: { VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); if ((TypeNode->getVT() == MVT::i8 && width == 8) || (TypeNode->getVT() == MVT::i16 && width == 16)) { ExtType = ISD::ZEXTLOAD; return true; } return false; } case ISD::Constant: case ISD::TargetConstant: { return std::abs(cast(V.getNode())->getSExtValue()) < 1LL << (width - 1); } } return true; } // This function does a whole lot of voodoo to determine if the tests are // equivalent without and with a mask. Essentially what happens is that given a // DAG resembling: // // +-------------+ +-------------+ +-------------+ +-------------+ // | Input | | AddConstant | | CompConstant| | CC | // +-------------+ +-------------+ +-------------+ +-------------+ // | | | | // V V | +----------+ // +-------------+ +----+ | | // | ADD | |0xff| | | // +-------------+ +----+ | | // | | | | // V V | | // +-------------+ | | // | AND | | | // +-------------+ | | // | | | // +-----+ | | // | | | // V V V // +-------------+ // | CMP | // +-------------+ // // The AND node may be safely removed for some combinations of inputs. In // particular we need to take into account the extension type of the Input, // the exact values of AddConstant, CompConstant, and CC, along with the nominal // width of the input (this can work for any width inputs, the above graph is // specific to 8 bits. // // The specific equations were worked out by generating output tables for each // AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The // problem was simplified by working with 4 bit inputs, which means we only // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8 // patterns present in both extensions (0,7). For every distinct set of // AddConstant and CompConstants bit patterns we can consider the masked and // unmasked versions to be equivalent if the result of this function is true for // all 16 distinct bit patterns of for the current extension type of Input (w0). // // sub w8, w0, w1 // and w10, w8, #0x0f // cmp w8, w2 // cset w9, AArch64CC // cmp w10, w2 // cset w11, AArch64CC // cmp w9, w11 // cset w0, eq // ret // // Since the above function shows when the outputs are equivalent it defines // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and // would be expensive to run during compiles. The equations below were written // in a test harness that confirmed they gave equivalent outputs to the above // for all inputs function, so they can be used determine if the removal is // legal instead. // // isEquivalentMaskless() is the code for testing if the AND can be removed // factored out of the DAG recognition as the DAG can take several forms. static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, int AddConstant, int CompConstant) { // By being careful about our equations and only writing the in term // symbolic values and well known constants (0, 1, -1, MaxUInt) we can // make them generally applicable to all bit widths. int MaxUInt = (1 << width); // For the purposes of these comparisons sign extending the type is // equivalent to zero extending the add and displacing it by half the integer // width. Provided we are careful and make sure our equations are valid over // the whole range we can just adjust the input and avoid writing equations // for sign extended inputs. if (ExtType == ISD::SEXTLOAD) AddConstant -= (1 << (width-1)); switch(CC) { case AArch64CC::LE: case AArch64CC::GT: if ((AddConstant == 0) || (CompConstant == MaxUInt - 1 && AddConstant < 0) || (AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) return true; break; case AArch64CC::LT: case AArch64CC::GE: if ((AddConstant == 0) || (AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) return true; break; case AArch64CC::HI: case AArch64CC::LS: if ((AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant >= -1 && CompConstant < AddConstant + MaxUInt)) return true; break; case AArch64CC::PL: case AArch64CC::MI: if ((AddConstant == 0) || (AddConstant > 0 && CompConstant <= 0) || (AddConstant < 0 && CompConstant <= AddConstant)) return true; break; case AArch64CC::LO: case AArch64CC::HS: if ((AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant >= 0 && CompConstant <= AddConstant + MaxUInt)) return true; break; case AArch64CC::EQ: case AArch64CC::NE: if ((AddConstant > 0 && CompConstant < 0) || (AddConstant < 0 && CompConstant >= 0 && CompConstant < AddConstant + MaxUInt) || (AddConstant >= 0 && CompConstant >= 0 && CompConstant >= AddConstant) || (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) return true; break; case AArch64CC::VS: case AArch64CC::VC: case AArch64CC::AL: case AArch64CC::NV: return true; case AArch64CC::Invalid: break; } return false; } static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex) { unsigned CC = cast(N->getOperand(CCIndex))->getSExtValue(); SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); unsigned CondOpcode = SubsNode->getOpcode(); if (CondOpcode != AArch64ISD::SUBS) return SDValue(); // There is a SUBS feeding this condition. Is it fed by a mask we can // use? SDNode *AndNode = SubsNode->getOperand(0).getNode(); unsigned MaskBits = 0; if (AndNode->getOpcode() != ISD::AND) return SDValue(); if (ConstantSDNode *CN = dyn_cast(AndNode->getOperand(1))) { uint32_t CNV = CN->getZExtValue(); if (CNV == 255) MaskBits = 8; else if (CNV == 65535) MaskBits = 16; } if (!MaskBits) return SDValue(); SDValue AddValue = AndNode->getOperand(0); if (AddValue.getOpcode() != ISD::ADD) return SDValue(); // The basic dag structure is correct, grab the inputs and validate them. SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); SDValue SubsInputValue = SubsNode->getOperand(1); // The mask is present and the provenance of all the values is a smaller type, // lets see if the mask is superfluous. if (!isa(AddInputValue2.getNode()) || !isa(SubsInputValue.getNode())) return SDValue(); ISD::LoadExtType ExtType; if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || !checkValueWidth(AddInputValue2, MaskBits, ExtType) || !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) return SDValue(); if(!isEquivalentMaskless(CC, MaskBits, ExtType, cast(AddInputValue2.getNode())->getSExtValue(), cast(SubsInputValue.getNode())->getSExtValue())) return SDValue(); // The AND is not necessary, remove it. SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), SubsNode->getValueType(1)); SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); return SDValue(N, 0); } // Optimize compare with zero and branch. static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3)) N = NV.getNode(); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); SDValue CCVal = N->getOperand(2); SDValue Cmp = N->getOperand(3); assert(isa(CCVal) && "Expected a ConstantSDNode here!"); unsigned CC = cast(CCVal)->getZExtValue(); if (CC != AArch64CC::EQ && CC != AArch64CC::NE) return SDValue(); unsigned CmpOpc = Cmp.getOpcode(); if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) return SDValue(); // Only attempt folding if there is only one use of the flag and no use of the // value. if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) return SDValue(); SDValue LHS = Cmp.getOperand(0); SDValue RHS = Cmp.getOperand(1); assert(LHS.getValueType() == RHS.getValueType() && "Expected the value type to be the same for both operands!"); if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return SDValue(); if (isNullConstant(LHS)) std::swap(LHS, RHS); if (!isNullConstant(RHS)) return SDValue(); if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || LHS.getOpcode() == ISD::SRL) return SDValue(); // Fold the compare into the branch instruction. SDValue BR; if (CC == AArch64CC::EQ) BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); else BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, BR, false); return SDValue(); } // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test // as well as whether the test should be inverted. This code is required to // catch these cases (as opposed to standard dag combines) because // AArch64ISD::TBZ is matched during legalization. static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert, SelectionDAG &DAG) { if (!Op->hasOneUse()) return Op; // We don't handle undef/constant-fold cases below, as they should have // already been taken care of (e.g. and of 0, test of undefined shifted bits, // etc.) // (tbz (trunc x), b) -> (tbz x, b) // This case is just here to enable more of the below cases to be caught. if (Op->getOpcode() == ISD::TRUNCATE && Bit < Op->getValueType(0).getSizeInBits()) { return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } if (Op->getNumOperands() != 2) return Op; auto *C = dyn_cast(Op->getOperand(1)); if (!C) return Op; switch (Op->getOpcode()) { default: return Op; // (tbz (and x, m), b) -> (tbz x, b) case ISD::AND: if ((C->getZExtValue() >> Bit) & 1) return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); return Op; // (tbz (shl x, c), b) -> (tbz x, b-c) case ISD::SHL: if (C->getZExtValue() <= Bit && (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { Bit = Bit - C->getZExtValue(); return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } return Op; // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x case ISD::SRA: Bit = Bit + C->getZExtValue(); if (Bit >= Op->getValueType(0).getSizeInBits()) Bit = Op->getValueType(0).getSizeInBits() - 1; return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); // (tbz (srl x, c), b) -> (tbz x, b+c) case ISD::SRL: if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) { Bit = Bit + C->getZExtValue(); return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } return Op; // (tbz (xor x, -1), b) -> (tbnz x, b) case ISD::XOR: if ((C->getZExtValue() >> Bit) & 1) Invert = !Invert; return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG); } } // Optimize test single bit zero/non-zero and branch. static SDValue performTBZCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { unsigned Bit = cast(N->getOperand(2))->getZExtValue(); bool Invert = false; SDValue TestSrc = N->getOperand(1); SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG); if (TestSrc == NewTestSrc) return SDValue(); unsigned NewOpc = N->getOpcode(); if (Invert) { if (NewOpc == AArch64ISD::TBZ) NewOpc = AArch64ISD::TBNZ; else { assert(NewOpc == AArch64ISD::TBNZ); NewOpc = AArch64ISD::TBZ; } } SDLoc DL(N); return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc, DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3)); } // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine // such VSELECT. static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); EVT CCVT = N0.getValueType(); if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || CCVT.getVectorElementType() != MVT::i1) return SDValue(); EVT ResVT = N->getValueType(0); EVT CmpVT = N0.getOperand(0).getValueType(); // Only combine when the result type is of the same size as the compared // operands. if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) return SDValue(); SDValue IfTrue = N->getOperand(1); SDValue IfFalse = N->getOperand(2); SDValue SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, IfTrue, IfFalse); } /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with /// the compare-mask instructions rather than going via NZCV, even if LHS and /// RHS are really scalar. This replaces any scalar setcc in the above pattern /// with a vector one followed by a DUP shuffle on the result. static SDValue performSelectCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0); EVT ResVT = N->getValueType(0); if (N0.getOpcode() != ISD::SETCC) return SDValue(); // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered // scalar SetCCResultType. We also don't expect vectors, because we assume // that selects fed by vector SETCCs are canonicalized to VSELECT. assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) && "Scalar-SETCC feeding SELECT has unexpected result type!"); // If NumMaskElts == 0, the comparison is larger than select result. The // largest real NEON comparison is 64-bits per lane, which means the result is // at most 32-bits and an illegal vector. Just bail out for now. EVT SrcVT = N0.getOperand(0).getValueType(); // Don't try to do this optimization when the setcc itself has i1 operands. // There are no legal vectors of i1, so this would be pointless. if (SrcVT == MVT::i1) return SDValue(); int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); if (!ResVT.isVector() || NumMaskElts == 0) return SDValue(); SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); // Also bail out if the vector CCVT isn't the same size as ResVT. // This can happen if the SETCC operand size doesn't divide the ResVT size // (e.g., f64 vs v3f32). if (CCVT.getSizeInBits() != ResVT.getSizeInBits()) return SDValue(); // Make sure we didn't create illegal types, if we're not supposed to. assert(DCI.isBeforeLegalize() || DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)); // First perform a vector comparison, where lane 0 is the one we're interested // in. SDLoc DL(N0); SDValue LHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); SDValue RHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); // Now duplicate the comparison mask we want across all other lanes. SmallVector DUPMask(CCVT.getVectorNumElements(), 0); SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask); Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), Mask); return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } /// Get rid of unnecessary NVCASTs (that don't change the type). static SDValue performNVCASTCombine(SDNode *N) { if (N->getValueType(0) == N->getOperand(0).getValueType()) return N->getOperand(0); return SDValue(); } // If all users of the globaladdr are of the form (globaladdr + constant), find // the smallest constant, fold it into the globaladdr's offset and rewrite the // globaladdr as (globaladdr + constant) - constant. static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, const AArch64Subtarget *Subtarget, const TargetMachine &TM) { auto *GN = dyn_cast(N); if (!GN || Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) != AArch64II::MO_NO_FLAG) return SDValue(); uint64_t MinOffset = -1ull; for (SDNode *N : GN->uses()) { if (N->getOpcode() != ISD::ADD) return SDValue(); auto *C = dyn_cast(N->getOperand(0)); if (!C) C = dyn_cast(N->getOperand(1)); if (!C) return SDValue(); MinOffset = std::min(MinOffset, C->getZExtValue()); } uint64_t Offset = MinOffset + GN->getOffset(); // Require that the new offset is larger than the existing one. Otherwise, we // can end up oscillating between two possible DAGs, for example, // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1). if (Offset <= uint64_t(GN->getOffset())) return SDValue(); // Check whether folding this offset is legal. It must not go out of bounds of // the referenced object to avoid violating the code model, and must be // smaller than 2^21 because this is the largest offset expressible in all // object formats. // // This check also prevents us from folding negative offsets, which will end // up being treated in the same way as large positive ones. They could also // cause code model violations, and aren't really common enough to matter. if (Offset >= (1 << 21)) return SDValue(); const GlobalValue *GV = GN->getGlobal(); Type *T = GV->getValueType(); if (!T->isSized() || Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T)) return SDValue(); SDLoc DL(GN); SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset); return DAG.getNode(ISD::SUB, DL, MVT::i64, Result, DAG.getConstant(MinOffset, DL, MVT::i64)); } SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: LLVM_DEBUG(dbgs() << "Custom combining: skipping\n"); break; case ISD::ADD: case ISD::SUB: return performAddSubLongCombine(N, DCI, DAG); case ISD::XOR: return performXorCombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMulCombine(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG, Subtarget); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return performFpToIntCombine(N, DAG, DCI, Subtarget); case ISD::FDIV: return performFDivCombine(N, DAG, DCI, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::SRL: return performSRLCombine(N, DCI); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: return performExtendCombine(N, DCI, DAG); case ISD::BITCAST: return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::SELECT: return performSelectCombine(N, DCI); case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::LOAD: if (performTBISimplification(N->getOperand(1), DCI, DAG)) return SDValue(N, 0); break; case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); case AArch64ISD::TBNZ: case AArch64ISD::TBZ: return performTBZCombine(N, DCI, DAG); case AArch64ISD::CSEL: return performCONDCombine(N, DCI, DAG, 2, 3); case AArch64ISD::DUP: return performPostLD1Combine(N, DCI, false); case AArch64ISD::NVCAST: return performNVCASTCombine(N); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: case Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: case Intrinsic::aarch64_neon_ld1x4: case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: case Intrinsic::aarch64_neon_ld2r: case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: case Intrinsic::aarch64_neon_st2: case Intrinsic::aarch64_neon_st3: case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: case Intrinsic::aarch64_neon_st1x4: case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: return performNEONPostLDSTCombine(N, DCI, DAG); default: break; } case ISD::GlobalAddress: return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine()); } return SDValue(); } // Check if the return value is used as only a return value, as otherwise // we can't perform a tail-call. In particular, we need to check for // target ISD nodes that are returns and any other "odd" constructs // that the generic analysis code won't necessarily catch. bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode *Node : Copy->uses()) { if (Node->getOpcode() != AArch64ISD::RET_FLAG) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } // Return whether the an instruction can potentially be optimized to a tail // call. This will cause the optimizers to attempt to move, or duplicate, // return instructions to help enable tail call optimizations for this // instruction. bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { return CI->isTailCall(); } bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const { if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) return false; Base = Op->getOperand(0); // All of the indexed addressing mode instructions take a signed // 9 bit immediate offset. if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { int64_t RHSC = RHS->getSExtValue(); if (Op->getOpcode() == ISD::SUB) RHSC = -(uint64_t)RHSC; if (!isInt<9>(RHSC)) return false; IsInc = (Op->getOpcode() == ISD::ADD); Offset = Op->getOperand(1); return true; } return false; } bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); } else return false; bool IsInc; if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) return false; AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; return true; } bool AArch64TargetLowering::getPostIndexedAddressParts( SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); } else return false; bool IsInc; if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) return false; // Post-indexing updates the base, so it's not a valid transform // if that's not the same as the load's pointer. if (Ptr != Base) return false; AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; return true; } static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) { SDLoc DL(N); SDValue Op = N->getOperand(0); if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) return; Op = SDValue( DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, DAG.getUNDEF(MVT::i32), Op, DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)), 0); Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); } static void ReplaceReductionResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, unsigned InterOp, unsigned AcrossOp) { EVT LoVT, HiVT; SDValue Lo, Hi; SDLoc dl(N); std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi); SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal); Results.push_back(SplitVal); } static std::pair splitInt128(SDValue N, SelectionDAG &DAG) { SDLoc DL(N); SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N); SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, DAG.getNode(ISD::SRL, DL, MVT::i128, N, DAG.getConstant(64, DL, MVT::i64))); return std::make_pair(Lo, Hi); } // Create an even/odd pair of X registers holding integer value V. static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { SDLoc dl(V.getNode()); SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64); SDValue VHi = DAG.getAnyExtOrTrunc( DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)), dl, MVT::i64); if (DAG.getDataLayout().isBigEndian()) std::swap (VLo, VHi); SDValue RegClass = DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32); SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32); SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32); const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; return SDValue( DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); } static void ReplaceCMP_SWAP_128Results(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { assert(N->getValueType(0) == MVT::i128 && "AtomicCmpSwap on types less than 128 should be legal"); if (Subtarget->hasLSE()) { // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type, // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG. SDValue Ops[] = { createGPRPairNode(DAG, N->getOperand(2)), // Compare value createGPRPairNode(DAG, N->getOperand(3)), // Store value N->getOperand(1), // Ptr N->getOperand(0), // Chain in }; MachineFunction &MF = DAG.getMachineFunction(); MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); unsigned Opcode; switch (MemOp[0]->getOrdering()) { case AtomicOrdering::Monotonic: Opcode = AArch64::CASPX; break; case AtomicOrdering::Acquire: Opcode = AArch64::CASPAX; break; case AtomicOrdering::Release: Opcode = AArch64::CASPLX; break; case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: Opcode = AArch64::CASPALX; break; default: llvm_unreachable("Unexpected ordering!"); } MachineSDNode *CmpSwap = DAG.getMachineNode( Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops); CmpSwap->setMemRefs(MemOp, MemOp + 1); unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64; if (DAG.getDataLayout().isBigEndian()) std::swap(SubReg1, SubReg2); Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64, SDValue(CmpSwap, 0))); Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64, SDValue(CmpSwap, 0))); Results.push_back(SDValue(CmpSwap, 1)); // Chain out return; } auto Desired = splitInt128(N->getOperand(2), DAG); auto New = splitInt128(N->getOperand(3), DAG); SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second, New.first, New.second, N->getOperand(0)}; SDNode *CmpSwap = DAG.getMachineNode( AArch64::CMP_SWAP_128, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops); MachineFunction &MF = DAG.getMachineFunction(); MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); cast(CmpSwap)->setMemRefs(MemOp, MemOp + 1); Results.push_back(SDValue(CmpSwap, 0)); Results.push_back(SDValue(CmpSwap, 1)); Results.push_back(SDValue(CmpSwap, 3)); } void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this"); case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; case ISD::VECREDUCE_ADD: case ISD::VECREDUCE_SMAX: case ISD::VECREDUCE_SMIN: case ISD::VECREDUCE_UMAX: case ISD::VECREDUCE_UMIN: Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG)); return; case AArch64ISD::SADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV); return; case AArch64ISD::UADDV: ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV); return; case AArch64ISD::SMINV: ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV); return; case AArch64ISD::UMINV: ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV); return; case AArch64ISD::SMAXV: ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV); return; case AArch64ISD::UMAXV: ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV); return; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); // Let normal code take care of it by not adding anything to Results. return; case ISD::ATOMIC_CMP_SWAP: ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget); return; } } bool AArch64TargetLowering::useLoadStackGuardNode() const { if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) return TargetLowering::useLoadStackGuardNode(); return true; } unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const { // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are three or more FDIVs. return 3; } TargetLoweringBase::LegalizeTypeAction AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { MVT SVT = VT.getSimpleVT(); // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, // v4i16, v2i32 instead of to promote. if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 || SVT == MVT::v1f32) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); return Size == 128; } // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, TargetLowering::AtomicExpansionKind AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); if (Size > 128) return AtomicExpansionKind::None; // Nand not supported in LSE. if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; // Leave 128 bits to LLSC. return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { // If subtarget has LSE, leave cmpxchg intact for codegen. if (Subtarget->hasLSE()) return false; // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement cmpxchg without spilling. If the address being exchanged is also // on the stack and close enough to the spill slot, this can lead to a // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. return getTargetMachine().getOptLevel() != 0; } Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast(Addr->getType())->getElementType(); bool IsAcquire = isAcquireOrStronger(Ord); // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i64, i64} and we have to recombine them into a // single i128 here. if (ValTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; Function *Ldxr = Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); return Builder.CreateOr( Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); } Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateTruncOrBitCast( Builder.CreateCall(Ldxr, Addr), cast(Addr->getType())->getElementType()); } void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilder<> &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); bool IsRelease = isReleaseOrStronger(Ord); // Since the intrinsics must have legal type, the i128 intrinsics take two // parameters: "i64, i64". We must marshal Val into the appropriate form // before the call. if (Val->getType()->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; Function *Stxr = Intrinsic::getDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); } Intrinsic::ID Int = IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; Type *Tys[] = { Addr->getType() }; Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateCall(Stxr, {Builder.CreateZExtOrBitCast( Val, Stxr->getFunctionType()->getParamType(0)), Addr}); } bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return Ty->isArrayTy(); } bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, EVT) const { return false; } static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), Offset), Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0)); } Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { // Android provides a fixed TLS slot for the stack cookie. See the definition // of TLS_SLOT_STACK_GUARD in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h if (Subtarget->isTargetAndroid()) return UseTlsOffset(IRB, 0x28); // Fuchsia is similar. // defines ZX_TLS_STACK_GUARD_OFFSET with this value. if (Subtarget->isTargetFuchsia()) return UseTlsOffset(IRB, -0x10); return TargetLowering::getIRStackGuard(IRB); } Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { // Android provides a fixed TLS slot for the SafeStack pointer. See the // definition of TLS_SLOT_SAFESTACK in // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h if (Subtarget->isTargetAndroid()) return UseTlsOffset(IRB, 0x48); // Fuchsia is similar. // defines ZX_TLS_UNSAFE_SP_OFFSET with this value. if (Subtarget->isTargetFuchsia()) return UseTlsOffset(IRB, -0x8); return TargetLowering::getSafeStackPointerLocation(IRB); } bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial( const Instruction &AndI) const { // Only sink 'and' mask to cmp use block if it is masking a single bit, since // this is likely to be fold the and/cmp/br into a single tbz instruction. It // may be beneficial to sink in other cases, but we would have to check that // the cmp would not get folded into the br to form a cbz for these to be // beneficial. ConstantInt* Mask = dyn_cast(AndI.getOperand(1)); if (!Mask) return false; return Mask->getValue().isPowerOf2(); } void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in AArch64unctionInfo. AArch64FunctionInfo *AFI = Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } void AArch64TargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (AArch64::GPR64RegClass.contains(*I)) RC = &AArch64::GPR64RegClass; else if (AArch64::FPR64RegClass.contains(*I)) RC = &AArch64::FPR64RegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { // Integer division on AArch64 is expensive. However, when aggressively // optimizing for code size, we prefer to use a div instruction, as it is // usually smaller than the alternative sequence. // The exception to this is vector division. Since AArch64 doesn't have vector // integer division, leaving the division as-is is a loss even in terms of // size, because it will have to be scalarized, while the alternative code // sequence can be performed in vector form. bool OptSize = Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize); return OptSize && !VT.isVector(); } bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const { return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint(); } unsigned AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) return getPointerTy(DL).getSizeInBits(); return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; } void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { MF.getFrameInfo().computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } Index: head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h (revision 344055) +++ head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h (revision 344056) @@ -1,711 +1,712 @@ //==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that AArch64 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H #include "AArch64.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Instruction.h" namespace llvm { namespace AArch64ISD { enum NodeType : unsigned { FIRST_NUMBER = ISD::BUILTIN_OP_END, WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. CALL, // Function call. // Produces the full sequence of instructions for getting the thread pointer // offset of a variable into X0, using the TLSDesc model. TLSDESC_CALLSEQ, ADRP, // Page address of a TargetGlobalAddress operand. ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. LOADgot, // Load from automatically generated descriptor (e.g. Global // Offset Table, TLS record). RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. BRCOND, // Conditional branch instruction; "b.cond". CSEL, FCSEL, // Conditional move instruction. CSINV, // Conditional select invert. CSNEG, // Conditional select negate. CSINC, // Conditional select increment. // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on // ELF. THREAD_POINTER, ADC, SBC, // adc, sbc instructions // Arithmetic instructions which write flags. ADDS, SUBS, ADCS, SBCS, ANDS, // Conditional compares. Operands: left,right,falsecc,cc,flags CCMP, CCMN, FCCMP, // Floating point comparison FCMP, // Scalar extract EXTR, // Scalar-to-vector duplication DUP, DUPLANE8, DUPLANE16, DUPLANE32, DUPLANE64, // Vector immedate moves MOVI, MOVIshift, MOVIedit, MOVImsl, FMOV, MVNIshift, MVNImsl, // Vector immediate ops BICi, ORRi, // Vector bit select: similar to ISD::VSELECT but not all bits within an // element must be identical. BSL, // Vector arithmetic negation NEG, // Vector shuffles ZIP1, ZIP2, UZP1, UZP2, TRN1, TRN2, REV16, REV32, REV64, EXT, // Vector shift by scalar VSHL, VLSHR, VASHR, // Vector shift by scalar (again) SQSHL_I, UQSHL_I, SQSHLU_I, SRSHR_I, URSHR_I, // Vector comparisons CMEQ, CMGE, CMGT, CMHI, CMHS, FCMEQ, FCMGE, FCMGT, // Vector zero comparisons CMEQz, CMGEz, CMGTz, CMLEz, CMLTz, FCMEQz, FCMGEz, FCMGTz, FCMLEz, FCMLTz, // Vector across-lanes addition // Only the lower result lane is defined. SADDV, UADDV, // Vector across-lanes min/max // Only the lower result lane is defined. SMINV, UMINV, SMAXV, UMAXV, // Vector bitwise negation NOT, // Vector bitwise selection BIT, // Compare-and-branch CBZ, CBNZ, TBZ, TBNZ, // Tail calls TC_RETURN, // Custom prefetch handling PREFETCH, // {s|u}int to FP within a FP register. SITOF, UITOF, /// Natural vector cast. ISD::BITCAST is not natural in the big-endian /// world w.r.t vectors; which causes additional REV instructions to be /// generated to compensate for the byte-swapping. But sometimes we do /// need to re-interpret the data in SIMD vector registers in big-endian /// mode without emitting such REV instructions. NVCAST, SMULL, UMULL, // Reciprocal estimates and steps. FRECPE, FRECPS, FRSQRTE, FRSQRTS, // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, LD4post, ST2post, ST3post, ST4post, LD1x2post, LD1x3post, LD1x4post, ST1x2post, ST1x3post, ST1x4post, LD1DUPpost, LD2DUPpost, LD3DUPpost, LD4DUPpost, LD1LANEpost, LD2LANEpost, LD3LANEpost, LD4LANEpost, ST2LANEpost, ST3LANEpost, ST4LANEpost }; } // end namespace AArch64ISD namespace { // Any instruction that defines a 32-bit result zeros out the high half of the // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may // be copying from a truncate. But any other 32-bit operation will zero-extend // up to 64 bits. // FIXME: X86 also checks for CMOV here. Do we need something similar? static inline bool isDef32(const SDNode &N) { unsigned Opc = N.getOpcode(); return Opc != ISD::TRUNCATE && Opc != TargetOpcode::EXTRACT_SUBREG && Opc != ISD::CopyFromReg; } } // end anonymous namespace class AArch64Subtarget; class AArch64TargetMachine; class AArch64TargetLowering : public TargetLowering { public: explicit AArch64TargetLowering(const TargetMachine &TM, const AArch64Subtarget &STI); /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC) const; /// Determine which of the bits specified in Mask are known to be either zero /// or one and return them in the KnownZero/KnownOne bitsets. void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const override; MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override; /// Returns true if the target allows unaligned memory accesses of the /// specified type. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, bool *Fast = nullptr) const override; /// Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; const char *getTargetNodeName(unsigned Opcode) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Addrspacecasts are always noops. return true; } /// This method returns a target specific FastISel object, or null if the /// target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; /// Return true if the given shuffle mask can be codegen'd directly, or if it /// should be stack expanded. bool isShuffleMaskLegal(ArrayRef M, EVT VT) const override; /// Return the ISD::SETCC ValueType. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; MachineBasicBlock *EmitF128CSEL(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override; bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT) const override; bool isTruncateFree(Type *Ty1, Type *Ty2) const override; bool isTruncateFree(EVT VT1, EVT VT2) const override; bool isProfitableToHoist(Instruction *I) const override; bool isZExtFree(Type *Ty1, Type *Ty2) const override; bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalICmpImmediate(int64_t) const override; bool shouldConsiderGEPOffsetSplit() const override; EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override; /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I = nullptr) const override; /// Return the cost of the scaling factor used in the addressing /// mode represented by AM for this target, for a load/store /// of the specified type. /// If the AM is supported, the return value must be >= 0. /// If the AM is not supported, it returns a negative value. int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; /// Return true if an FMA operation is faster than a pair of fmul and fadd /// instructions. fmuladd intrinsics will be expanded to FMAs when this method /// returns true, otherwise fmuladd is expanded to fmul + fadd. bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; /// Returns false if N is a bit extraction pattern of (X >> C) & Mask. - bool isDesirableToCommuteWithShift(const SDNode *N) const override; + bool isDesirableToCommuteWithShift(const SDNode *N, + CombineLevel Level) const override; /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; /// Return true if EXTRACT_SUBVECTOR is cheap for this result type /// with this index. bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; bool useLoadStackGuardNode() const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; /// If the target has a standard location for the stack protector cookie, /// returns the address of that location. Otherwise, returns nullptr. Value *getIRStackGuard(IRBuilder<> &IRB) const override; /// If the target has a standard location for the unsafe stack pointer, /// returns the address of that location. Otherwise, returns nullptr. Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. unsigned getExceptionPointerRegister(const Constant *PersonalityFn) const override { // FIXME: This is a guess. Has this been defined yet? return AArch64::X0; } /// If a physical register, this returns the register that receives the /// exception typeid on entry to a landing pad. unsigned getExceptionSelectorRegister(const Constant *PersonalityFn) const override { // FIXME: This is a guess. Has this been defined yet? return AArch64::X1; } bool isIntDivCheap(EVT VT, AttributeList Attr) const override; bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const SelectionDAG &DAG) const override { // Do not merge to float value size (128 bytes) if no implicit // float attribute is set. bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute( Attribute::NoImplicitFloat); if (NoFloat) return (MemVT.getSizeInBits() <= 64); return true; } bool isCheapToSpeculateCttz() const override { return true; } bool isCheapToSpeculateCtlz() const override { return true; } bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override; bool hasAndNotCompare(SDValue V) const override { // We can use bics for any scalar. return V.getValueType().isScalarInteger(); } bool hasAndNot(SDValue Y) const override { EVT VT = Y.getValueType(); if (!VT.isVector()) return hasAndNotCompare(Y); return VT.getSizeInBits() >= 64; // vector 'bic' } bool shouldTransformSignedTruncationCheck(EVT XVT, unsigned KeptBits) const override { // For vectors, we don't have a preference.. if (XVT.isVector()) return false; auto VTIsOk = [](EVT VT) -> bool { return VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64; }; // We are ok with KeptBitsVT being byte/word/dword, what SXT supports. // XVT will be larger than KeptBitsVT. MVT KeptBitsVT = MVT::getIntegerVT(KeptBits); return VTIsOk(XVT) && VTIsOk(KeptBitsVT); } bool hasBitPreservingFPLogic(EVT VT) const override { // FIXME: Is this always true? It should be true for vectors at least. return VT == MVT::f32 || VT == MVT::f64; } bool supportSplitCSR(MachineFunction *MF) const override { return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getFunction().hasFnAttribute(Attribute::NoUnwind); } void initializeSplitCSR(MachineBasicBlock *Entry) const override; void insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const override; bool supportSwiftError() const override { return true; } /// Enable aggressive FMA fusion on targets that want it. bool enableAggressiveFMAFusion(EVT VT) const override; /// Returns the size of the platform's va_list object. unsigned getVaListSizeInBits(const DataLayout &DL) const override; /// Returns true if \p VecTy is a legal interleaved access type. This /// function checks the vector element type and the overall width of the /// vector. bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL) const; /// Returns the number of interleaved accesses that will be generated when /// lowering accesses of the given type. unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const; MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; private: /// Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. const AArch64Subtarget *Subtarget; bool isExtFreeImpl(const Instruction *Ext) const override; void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; SDValue LowerCall(CallLoweringInfo & /*CLI*/, SmallVectorImpl &InVals) const override; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; bool isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const; /// Finds the incoming stack arguments which overlap the given fixed stack /// object and incorporates their load into the current chain. This prevents /// an upcoming store from clobbering the stack argument before it's used. SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const; bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL, SDValue &Chain) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override; SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const; SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const; SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const; SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const; template SDValue getGOT(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; template SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; template SDValue getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL, SelectionDAG &DAG) const; SDValue LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS, SDValue TVal, SDValue FVal, const SDLoc &dl, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, SmallVectorImpl &Created) const override; SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override; SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps) const override; unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override; std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; const char *LowerXConstraint(EVT ConstraintVT) const override; void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const override; unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "Q") return InlineAsm::Constraint_Q; // FIXME: clang has code for 'Ump', 'Utf', 'Usa', and 'Ush' but these are // followed by llvm_unreachable so we'll leave them unimplemented in // the backend for now. return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const; bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; void finalizeLowering(MachineFunction &MF) const override; }; namespace AArch64 { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); } // end namespace AArch64 } // end namespace llvm #endif Index: head/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- head/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp (revision 344055) +++ head/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp (revision 344056) @@ -1,14994 +1,15011 @@ //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that ARM uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "ARMISelLowering.h" #include "ARMBaseInstrInfo.h" #include "ARMBaseRegisterInfo.h" #include "ARMCallingConv.h" #include "ARMConstantPoolValue.h" #include "ARMMachineFunctionInfo.h" #include "ARMPerfectShuffle.h" #include "ARMRegisterInfo.h" #include "ARMSelectionDAGInfo.h" #include "ARMSubtarget.h" #include "MCTargetDesc/ARMAddressingModes.h" #include "MCTargetDesc/ARMBaseInfo.h" #include "Utils/ARMBaseInfo.h" #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" #include "llvm/MC/MCInstrDesc.h" #include "llvm/MC/MCInstrItineraries.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/MC/MCSchedule.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include #include #include #include #include #include #include #include #include #include using namespace llvm; #define DEBUG_TYPE "arm-isel" STATISTIC(NumTailCalls, "Number of tail calls"); STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt"); STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments"); STATISTIC(NumConstpoolPromoted, "Number of constants with their storage promoted into constant pools"); static cl::opt ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true)); static cl::opt EnableConstpoolPromotion( "arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false)); // FIXME: set to true by default once PR32780 is fixed static cl::opt ConstpoolPromotionMaxSize( "arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64)); static cl::opt ConstpoolPromotionMaxTotal( "arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128)); // The APCS parameter registers. static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 }; void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT) { if (VT != PromotedLdStVT) { setOperationAction(ISD::LOAD, VT, Promote); AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT); setOperationAction(ISD::STORE, VT, Promote); AddPromotedToType (ISD::STORE, VT, PromotedLdStVT); } MVT ElemTy = VT.getVectorElementType(); if (ElemTy != MVT::f64) setOperationAction(ISD::SETCC, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); if (ElemTy == MVT::i32) { setOperationAction(ISD::SINT_TO_FP, VT, Custom); setOperationAction(ISD::UINT_TO_FP, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); } else { setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); } setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Legal); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); setOperationAction(ISD::SELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand); if (VT.isInteger()) { setOperationAction(ISD::SHL, VT, Custom); setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::SRL, VT, Custom); } // Promote all bit-wise operations. if (VT.isInteger() && VT != PromotedBitwiseVT) { setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, PromotedBitwiseVT); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, PromotedBitwiseVT); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, PromotedBitwiseVT); } // Neon does not support vector divide/remainder operations. setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64) for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX}) setOperationAction(Opcode, VT, Legal); } void ARMTargetLowering::addDRTypeForNEON(MVT VT) { addRegisterClass(VT, &ARM::DPRRegClass); addTypeForNEON(VT, MVT::f64, MVT::v2i32); } void ARMTargetLowering::addQRTypeForNEON(MVT VT) { addRegisterClass(VT, &ARM::DPairRegClass); addTypeForNEON(VT, MVT::v2f64, MVT::v4i32); } ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { RegInfo = Subtarget->getRegisterInfo(); Itins = Subtarget->getInstrItineraryData(); setBooleanContents(ZeroOrOneBooleanContent); setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() && !Subtarget->isTargetWatchOS()) { bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard; for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID) setLibcallCallingConv(static_cast(LCID), IsHFTarget ? CallingConv::ARM_AAPCS_VFP : CallingConv::ARM_AAPCS); } if (Subtarget->isTargetMachO()) { // Uses VFP for Thumb libfuncs if available. if (Subtarget->isThumb() && Subtarget->hasVFP2() && Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) { static const struct { const RTLIB::Libcall Op; const char * const Name; const ISD::CondCode Cond; } LibraryCalls[] = { // Single-precision floating-point arithmetic. { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID }, { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID }, { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID }, { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID }, // Double-precision floating-point arithmetic. { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID }, { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID }, { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID }, { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID }, // Single-precision comparisons. { RTLIB::OEQ_F32, "__eqsf2vfp", ISD::SETNE }, { RTLIB::UNE_F32, "__nesf2vfp", ISD::SETNE }, { RTLIB::OLT_F32, "__ltsf2vfp", ISD::SETNE }, { RTLIB::OLE_F32, "__lesf2vfp", ISD::SETNE }, { RTLIB::OGE_F32, "__gesf2vfp", ISD::SETNE }, { RTLIB::OGT_F32, "__gtsf2vfp", ISD::SETNE }, { RTLIB::UO_F32, "__unordsf2vfp", ISD::SETNE }, { RTLIB::O_F32, "__unordsf2vfp", ISD::SETEQ }, // Double-precision comparisons. { RTLIB::OEQ_F64, "__eqdf2vfp", ISD::SETNE }, { RTLIB::UNE_F64, "__nedf2vfp", ISD::SETNE }, { RTLIB::OLT_F64, "__ltdf2vfp", ISD::SETNE }, { RTLIB::OLE_F64, "__ledf2vfp", ISD::SETNE }, { RTLIB::OGE_F64, "__gedf2vfp", ISD::SETNE }, { RTLIB::OGT_F64, "__gtdf2vfp", ISD::SETNE }, { RTLIB::UO_F64, "__unorddf2vfp", ISD::SETNE }, { RTLIB::O_F64, "__unorddf2vfp", ISD::SETEQ }, // Floating-point to integer conversions. // i64 conversions are done via library routines even when generating VFP // instructions, so use the same ones. { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp", ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp", ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID }, // Conversions between floating types. { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp", ISD::SETCC_INVALID }, { RTLIB::FPEXT_F32_F64, "__extendsfdf2vfp", ISD::SETCC_INVALID }, // Integer to floating-point conversions. // i64 conversions are done via library routines even when generating VFP // instructions, so use the same ones. // FIXME: There appears to be some naming inconsistency in ARM libgcc: // e.g., __floatunsidf vs. __floatunssidfvfp. { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp", ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp", ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } } } // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); // RTLIB if (Subtarget->isAAPCS_ABI() && (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) { static const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; const ISD::CondCode Cond; } LibraryCalls[] = { // Double-precision floating-point arithmetic helper functions // RTABI chapter 4.1.2, Table 2 { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Double-precision floating-point comparison helper functions // RTABI chapter 4.1.2, Table 3 { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UO_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::O_F64, "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, // Single-precision floating-point arithmetic helper functions // RTABI chapter 4.1.2, Table 4 { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Single-precision floating-point comparison helper functions // RTABI chapter 4.1.2, Table 5 { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ }, { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::UO_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE }, { RTLIB::O_F32, "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ }, // Floating-point to integer conversions. // RTABI chapter 4.1.2, Table 6 { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Conversions between floating types. // RTABI chapter 4.1.2, Table 7 { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::FPEXT_F32_F64, "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Integer to floating-point conversions. // RTABI chapter 4.1.2, Table 8 { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Long long helper functions // RTABI chapter 4.2, Table 9 { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, // Integer division functions // RTABI chapter 4.3.1 { RTLIB::SDIV_I8, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SDIV_I16, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SDIV_I32, "__aeabi_idiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::SDIV_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I8, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I16, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I32, "__aeabi_uidiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } // EABI dependent RTLIB if (TM.Options.EABIVersion == EABI::EABI4 || TM.Options.EABIVersion == EABI::EABI5) { static const struct { const RTLIB::Libcall Op; const char *const Name; const CallingConv::ID CC; const ISD::CondCode Cond; } MemOpsLibraryCalls[] = { // Memory operations // RTABI chapter 4.3.4 { RTLIB::MEMCPY, "__aeabi_memcpy", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, { RTLIB::MEMSET, "__aeabi_memset", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID }, }; for (const auto &LC : MemOpsLibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); if (LC.Cond != ISD::SETCC_INVALID) setCmpLibcallCC(LC.Op, LC.Cond); } } } if (Subtarget->isTargetWindows()) { static const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; } LibraryCalls[] = { { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } } // Use divmod compiler-rt calls for iOS 5.0 and later. if (Subtarget->isTargetMachO() && !(Subtarget->isTargetIOS() && Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } // The half <-> float conversion functions are always soft-float on // non-watchos platforms, but are needed for some targets which use a // hard-float calling convention by default. if (!Subtarget->isTargetWatchABI()) { if (Subtarget->isAAPCS_ABI()) { setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS); setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS); } else { setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS); setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS); setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS); } } // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have // a __gnu_ prefix (which is the default). if (Subtarget->isTargetAEABI()) { static const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; } LibraryCalls[] = { { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS }, { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS }, { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } } if (Subtarget->isThumb1Only()) addRegisterClass(MVT::i32, &ARM::tGPRRegClass); else addRegisterClass(MVT::i32, &ARM::GPRRegClass); if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { addRegisterClass(MVT::f32, &ARM::SPRRegClass); addRegisterClass(MVT::f64, &ARM::DPRRegClass); } if (Subtarget->hasFullFP16()) { addRegisterClass(MVT::f16, &ARM::HPRRegClass); setOperationAction(ISD::BITCAST, MVT::i16, Custom); setOperationAction(ISD::BITCAST, MVT::i32, Custom); setOperationAction(ISD::BITCAST, MVT::f16, Custom); setOperationAction(ISD::FMINNUM, MVT::f16, Legal); setOperationAction(ISD::FMAXNUM, MVT::f16, Legal); } for (MVT VT : MVT::vector_valuetypes()) { for (MVT InnerVT : MVT::vector_valuetypes()) { setTruncStoreAction(VT, InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand); } setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); } setOperationAction(ISD::ConstantFP, MVT::f32, Custom); setOperationAction(ISD::ConstantFP, MVT::f64, Custom); setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom); setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom); if (Subtarget->hasNEON()) { addDRTypeForNEON(MVT::v2f32); addDRTypeForNEON(MVT::v8i8); addDRTypeForNEON(MVT::v4i16); addDRTypeForNEON(MVT::v2i32); addDRTypeForNEON(MVT::v1i64); addQRTypeForNEON(MVT::v4f32); addQRTypeForNEON(MVT::v2f64); addQRTypeForNEON(MVT::v16i8); addQRTypeForNEON(MVT::v8i16); addQRTypeForNEON(MVT::v4i32); addQRTypeForNEON(MVT::v2i64); if (Subtarget->hasFullFP16()) { addQRTypeForNEON(MVT::v8f16); addDRTypeForNEON(MVT::v4f16); } // v2f64 is legal so that QR subregs can be extracted as f64 elements, but // neither Neon nor VFP support any arithmetic operations on it. // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively // supported for v4f32. setOperationAction(ISD::FADD, MVT::v2f64, Expand); setOperationAction(ISD::FSUB, MVT::v2f64, Expand); setOperationAction(ISD::FMUL, MVT::v2f64, Expand); // FIXME: Code duplication: FDIV and FREM are expanded always, see // ARMTargetLowering::addTypeForNEON method for details. setOperationAction(ISD::FDIV, MVT::v2f64, Expand); setOperationAction(ISD::FREM, MVT::v2f64, Expand); // FIXME: Create unittest. // In another words, find a way when "copysign" appears in DAG with vector // operands. setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand); // FIXME: Code duplication: SETCC has custom operation action, see // ARMTargetLowering::addTypeForNEON method for details. setOperationAction(ISD::SETCC, MVT::v2f64, Expand); // FIXME: Create unittest for FNEG and for FABS. setOperationAction(ISD::FNEG, MVT::v2f64, Expand); setOperationAction(ISD::FABS, MVT::v2f64, Expand); setOperationAction(ISD::FSQRT, MVT::v2f64, Expand); setOperationAction(ISD::FSIN, MVT::v2f64, Expand); setOperationAction(ISD::FCOS, MVT::v2f64, Expand); setOperationAction(ISD::FPOW, MVT::v2f64, Expand); setOperationAction(ISD::FLOG, MVT::v2f64, Expand); setOperationAction(ISD::FLOG2, MVT::v2f64, Expand); setOperationAction(ISD::FLOG10, MVT::v2f64, Expand); setOperationAction(ISD::FEXP, MVT::v2f64, Expand); setOperationAction(ISD::FEXP2, MVT::v2f64, Expand); // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR. setOperationAction(ISD::FCEIL, MVT::v2f64, Expand); setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand); setOperationAction(ISD::FRINT, MVT::v2f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand); setOperationAction(ISD::FMA, MVT::v2f64, Expand); setOperationAction(ISD::FSQRT, MVT::v4f32, Expand); setOperationAction(ISD::FSIN, MVT::v4f32, Expand); setOperationAction(ISD::FCOS, MVT::v4f32, Expand); setOperationAction(ISD::FPOW, MVT::v4f32, Expand); setOperationAction(ISD::FLOG, MVT::v4f32, Expand); setOperationAction(ISD::FLOG2, MVT::v4f32, Expand); setOperationAction(ISD::FLOG10, MVT::v4f32, Expand); setOperationAction(ISD::FEXP, MVT::v4f32, Expand); setOperationAction(ISD::FEXP2, MVT::v4f32, Expand); setOperationAction(ISD::FCEIL, MVT::v4f32, Expand); setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand); setOperationAction(ISD::FRINT, MVT::v4f32, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand); setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand); // Mark v2f32 intrinsics. setOperationAction(ISD::FSQRT, MVT::v2f32, Expand); setOperationAction(ISD::FSIN, MVT::v2f32, Expand); setOperationAction(ISD::FCOS, MVT::v2f32, Expand); setOperationAction(ISD::FPOW, MVT::v2f32, Expand); setOperationAction(ISD::FLOG, MVT::v2f32, Expand); setOperationAction(ISD::FLOG2, MVT::v2f32, Expand); setOperationAction(ISD::FLOG10, MVT::v2f32, Expand); setOperationAction(ISD::FEXP, MVT::v2f32, Expand); setOperationAction(ISD::FEXP2, MVT::v2f32, Expand); setOperationAction(ISD::FCEIL, MVT::v2f32, Expand); setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand); setOperationAction(ISD::FRINT, MVT::v2f32, Expand); setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand); setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand); // Neon does not support some operations on v1i64 and v2i64 types. setOperationAction(ISD::MUL, MVT::v1i64, Expand); // Custom handling for some quad-vector types to detect VMULL. setOperationAction(ISD::MUL, MVT::v8i16, Custom); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); // Custom handling for some vector types to avoid expensive expansions setOperationAction(ISD::SDIV, MVT::v4i16, Custom); setOperationAction(ISD::SDIV, MVT::v8i8, Custom); setOperationAction(ISD::UDIV, MVT::v4i16, Custom); setOperationAction(ISD::UDIV, MVT::v8i8, Custom); // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with // a destination type that is wider than the source, and nor does // it have a FP_TO_[SU]INT instruction with a narrower destination than // source. setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand); // NEON does not have single instruction CTPOP for vectors with element // types wider than 8-bits. However, custom lowering can leverage the // v8i8/v16i8 vcnt instruction. setOperationAction(ISD::CTPOP, MVT::v2i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i32, Custom); setOperationAction(ISD::CTPOP, MVT::v4i16, Custom); setOperationAction(ISD::CTPOP, MVT::v8i16, Custom); setOperationAction(ISD::CTPOP, MVT::v1i64, Expand); setOperationAction(ISD::CTPOP, MVT::v2i64, Expand); setOperationAction(ISD::CTLZ, MVT::v1i64, Expand); setOperationAction(ISD::CTLZ, MVT::v2i64, Expand); // NEON does not have single instruction CTTZ for vectors. setOperationAction(ISD::CTTZ, MVT::v8i8, Custom); setOperationAction(ISD::CTTZ, MVT::v4i16, Custom); setOperationAction(ISD::CTTZ, MVT::v2i32, Custom); setOperationAction(ISD::CTTZ, MVT::v1i64, Custom); setOperationAction(ISD::CTTZ, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ, MVT::v4i32, Custom); setOperationAction(ISD::CTTZ, MVT::v2i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom); // NEON only has FMA instructions as of VFP4. if (!Subtarget->hasVFP4()) { setOperationAction(ISD::FMA, MVT::v2f32, Expand); setOperationAction(ISD::FMA, MVT::v4f32, Expand); } setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::BUILD_VECTOR); setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::FP_TO_UINT); setTargetDAGCombine(ISD::FDIV); setTargetDAGCombine(ISD::LOAD); // It is legal to extload from v4i8 to v4i16 or v4i32. for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16, MVT::v2i32}) { for (MVT VT : MVT::integer_vector_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal); setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal); } } } if (Subtarget->isFPOnlySP()) { // When targeting a floating-point unit with only single-precision // operations, f64 is legal for the few double-precision instructions which // are present However, no double-precision operations other than moves, // loads and stores are provided by the hardware. setOperationAction(ISD::FADD, MVT::f64, Expand); setOperationAction(ISD::FSUB, MVT::f64, Expand); setOperationAction(ISD::FMUL, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FDIV, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FGETSIGN, MVT::f64, Expand); setOperationAction(ISD::FNEG, MVT::f64, Expand); setOperationAction(ISD::FABS, MVT::f64, Expand); setOperationAction(ISD::FSQRT, MVT::f64, Expand); setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FLOG, MVT::f64, Expand); setOperationAction(ISD::FLOG2, MVT::f64, Expand); setOperationAction(ISD::FLOG10, MVT::f64, Expand); setOperationAction(ISD::FEXP, MVT::f64, Expand); setOperationAction(ISD::FEXP2, MVT::f64, Expand); setOperationAction(ISD::FCEIL, MVT::f64, Expand); setOperationAction(ISD::FTRUNC, MVT::f64, Expand); setOperationAction(ISD::FRINT, MVT::f64, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand); setOperationAction(ISD::FFLOOR, MVT::f64, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom); setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom); setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); } computeRegisterProperties(Subtarget->getRegisterInfo()); // ARM does not have floating-point extending loads. for (MVT VT : MVT::fp_valuetypes()) { setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand); setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); } // ... or truncating stores setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f32, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f16, Expand); // ARM does not have i1 sign extending load. for (MVT VT : MVT::integer_valuetypes()) setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); // ARM supports all 4 flavors of integer indexed load / store. if (!Subtarget->isThumb1Only()) { for (unsigned im = (unsigned)ISD::PRE_INC; im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) { setIndexedLoadAction(im, MVT::i1, Legal); setIndexedLoadAction(im, MVT::i8, Legal); setIndexedLoadAction(im, MVT::i16, Legal); setIndexedLoadAction(im, MVT::i32, Legal); setIndexedStoreAction(im, MVT::i1, Legal); setIndexedStoreAction(im, MVT::i8, Legal); setIndexedStoreAction(im, MVT::i16, Legal); setIndexedStoreAction(im, MVT::i32, Legal); } } else { // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}. setIndexedLoadAction(ISD::POST_INC, MVT::i32, Legal); setIndexedStoreAction(ISD::POST_INC, MVT::i32, Legal); } setOperationAction(ISD::SADDO, MVT::i32, Custom); setOperationAction(ISD::UADDO, MVT::i32, Custom); setOperationAction(ISD::SSUBO, MVT::i32, Custom); setOperationAction(ISD::USUBO, MVT::i32, Custom); setOperationAction(ISD::ADDCARRY, MVT::i32, Custom); setOperationAction(ISD::SUBCARRY, MVT::i32, Custom); // i64 operation support. setOperationAction(ISD::MUL, MVT::i64, Expand); setOperationAction(ISD::MULHU, MVT::i32, Expand); if (Subtarget->isThumb1Only()) { setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); } if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops() || (Subtarget->isThumb2() && !Subtarget->hasDSP())) setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom); setOperationAction(ISD::SRL, MVT::i64, Custom); setOperationAction(ISD::SRA, MVT::i64, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom); // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1. if (Subtarget->isThumb1Only()) { setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand); setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand); setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand); } if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); // ARM does not have ROTL. setOperationAction(ISD::ROTL, MVT::i32, Expand); for (MVT VT : MVT::vector_valuetypes()) { setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); } setOperationAction(ISD::CTTZ, MVT::i32, Custom); setOperationAction(ISD::CTPOP, MVT::i32, Expand); if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) setOperationAction(ISD::CTLZ, MVT::i32, Expand); // @llvm.readcyclecounter requires the Performance Monitors extension. // Default to the 0 expansion on unsupported platforms. // FIXME: Technically there are older ARM CPUs that have // implementation-specific ways of obtaining this information. if (Subtarget->hasPerfMon()) setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom); // Only ARMv6 has BSWAP. if (!Subtarget->hasV6Ops()) setOperationAction(ISD::BSWAP, MVT::i32, Expand); bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() : Subtarget->hasDivideInARMMode(); if (!hasDivide) { // These are expanded into libcalls if the cpu doesn't have HW divider. setOperationAction(ISD::SDIV, MVT::i32, LibCall); setOperationAction(ISD::UDIV, MVT::i32, LibCall); } if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) { setOperationAction(ISD::SDIV, MVT::i32, Custom); setOperationAction(ISD::UDIV, MVT::i32, Custom); setOperationAction(ISD::SDIV, MVT::i64, Custom); setOperationAction(ISD::UDIV, MVT::i64, Custom); } setOperationAction(ISD::SREM, MVT::i32, Expand); setOperationAction(ISD::UREM, MVT::i32, Expand); // Register based DivRem for AEABI (RTABI 4.2) if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || Subtarget->isTargetWindows()) { setOperationAction(ISD::SREM, MVT::i64, Custom); setOperationAction(ISD::UREM, MVT::i64, Custom); HasStandaloneRem = false; if (Subtarget->isTargetWindows()) { const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; } LibraryCalls[] = { { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } } else { const struct { const RTLIB::Libcall Op; const char * const Name; const CallingConv::ID CC; } LibraryCalls[] = { { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS }, { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS }, { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS }, }; for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } } setOperationAction(ISD::SDIVREM, MVT::i32, Custom); setOperationAction(ISD::UDIVREM, MVT::i32, Custom); setOperationAction(ISD::SDIVREM, MVT::i64, Custom); setOperationAction(ISD::UDIVREM, MVT::i64, Custom); } else { setOperationAction(ISD::SDIVREM, MVT::i32, Expand); setOperationAction(ISD::UDIVREM, MVT::i32, Expand); } if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT()) for (auto &VT : {MVT::f32, MVT::f64}) setOperationAction(ISD::FPOWI, VT, Custom); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); // Use the default implementation. setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Expand); setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::VAEND, MVT::Other, Expand); setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); if (Subtarget->isTargetWindows()) setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom); else setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use // the default expansion. InsertFencesForAtomic = false; if (Subtarget->hasAnyDataBarrier() && (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) { // ATOMIC_FENCE needs custom lowering; the others should have been expanded // to ldrex/strex loops already. setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom); if (!Subtarget->isThumb() || !Subtarget->isMClass()) setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom); // On v8, we have particularly efficient implementations of atomic fences // if they can be combined with nearby atomic loads and stores. if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) { // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc. InsertFencesForAtomic = true; } } else { // If there's anything we can use as a barrier, go through custom lowering // for ATOMIC_FENCE. // If target has DMB in thumb, Fences can be inserted. if (Subtarget->hasDataBarrier()) InsertFencesForAtomic = true; setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Subtarget->hasAnyDataBarrier() ? Custom : Expand); // Set them all for expansion, which will force libcalls. setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand); // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the // Unordered/Monotonic case. if (!InsertFencesForAtomic) { setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom); } } setOperationAction(ISD::PREFETCH, MVT::Other, Custom); // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes. if (!Subtarget->hasV6Ops()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand); } setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR // iff target supports vfp2. setOperationAction(ISD::BITCAST, MVT::i64, Custom); setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom); } // We want to custom lower some of our intrinsics. setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); if (Subtarget->useSjLjEH()) setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); setOperationAction(ISD::SETCC, MVT::i32, Expand); setOperationAction(ISD::SETCC, MVT::f32, Expand); setOperationAction(ISD::SETCC, MVT::f64, Expand); setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); setOperationAction(ISD::SELECT_CC, MVT::f64, Custom); if (Subtarget->hasFullFP16()) { setOperationAction(ISD::SETCC, MVT::f16, Expand); setOperationAction(ISD::SELECT, MVT::f16, Custom); setOperationAction(ISD::SELECT_CC, MVT::f16, Custom); } setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom); setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i32, Custom); if (Subtarget->hasFullFP16()) setOperationAction(ISD::BR_CC, MVT::f16, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Custom); setOperationAction(ISD::BR_JT, MVT::Other, Custom); // We don't support sin/cos/fmod/copysign/pow setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FSIN, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f32, Expand); setOperationAction(ISD::FCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f32, Expand); if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) { setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); } setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); if (!Subtarget->hasVFP4()) { setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); } // Various VFP goodness if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) { // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded. if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) { setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); } // fp16 is a special v7 extension that adds f16 <-> f32 conversions. if (!Subtarget->hasFP16()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } } // Use __sincos_stret if available. if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } // FP-ARMv8 implements a lot of rounding-like FP operations. if (Subtarget->hasFPARMv8()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); setOperationAction(ISD::FMINNUM, MVT::f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal); setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); if (!Subtarget->isFPOnlySP()) { setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FROUND, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); setOperationAction(ISD::FMINNUM, MVT::f64, Legal); setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); } } if (Subtarget->hasNEON()) { // vmin and vmax aren't available in a scalar form, so we use // a NEON instruction with an undef lane instead. setOperationAction(ISD::FMINNAN, MVT::f16, Legal); setOperationAction(ISD::FMAXNAN, MVT::f16, Legal); setOperationAction(ISD::FMINNAN, MVT::f32, Legal); setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal); setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal); setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); } // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::XOR); if (Subtarget->hasV6Ops()) setTargetDAGCombine(ISD::SRL); setStackPointerRegisterToSaveRestore(ARM::SP); if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() || !Subtarget->hasVFP2()) setSchedulingPreference(Sched::RegPressure); else setSchedulingPreference(Sched::Hybrid); //// temporary - rewrite interface to use type MaxStoresPerMemset = 8; MaxStoresPerMemsetOptSize = 4; MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = 2; MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 2; // On ARM arguments smaller than 4 bytes are extended, so all arguments // are at least 4 bytes aligned. setMinStackArgumentAlignment(4); // Prefer likely predicted branches to selects on out-of-order cores. PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder(); setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2); } bool ARMTargetLowering::useSoftFloat() const { return Subtarget->useSoftFloat(); } // FIXME: It might make sense to define the representative register class as the // nearest super-register that has a non-null superset. For example, DPR_VFP2 is // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently, // SPR's representative would be DPR_VFP2. This should work well if register // pressure tracking were modified such that a register use would increment the // pressure of the register class's representative and all of it's super // classes' representatives transitively. We have not implemented this because // of the difficulty prior to coalescing of modeling operand register classes // due to the common occurrence of cross class copies and subregister insertions // and extractions. std::pair ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const { const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(TRI, VT); // Use DPR as representative register class for all floating point // and vector types. Since there are 32 SPR registers and 32 DPR registers so // the cost is 1 for both f32 and f64. case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16: case MVT::v2i32: case MVT::v1i64: case MVT::v2f32: RRC = &ARM::DPRRegClass; // When NEON is used for SP, only half of the register file is available // because operations that define both SP and DP results will be constrained // to the VFP2 class (D0-D15). We currently model this constraint prior to // coalescing by double-counting the SP regs. See the FIXME above. if (Subtarget->useNEONForSinglePrecisionFP()) Cost = 2; break; case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: RRC = &ARM::DPRRegClass; Cost = 2; break; case MVT::v4i64: RRC = &ARM::DPRRegClass; Cost = 4; break; case MVT::v8i64: RRC = &ARM::DPRRegClass; Cost = 8; break; } return std::make_pair(RRC, Cost); } const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((ARMISD::NodeType)Opcode) { case ARMISD::FIRST_NUMBER: break; case ARMISD::Wrapper: return "ARMISD::Wrapper"; case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL"; case ARMISD::CALL: return "ARMISD::CALL"; case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; case ARMISD::BRCOND: return "ARMISD::BRCOND"; case ARMISD::BR_JT: return "ARMISD::BR_JT"; case ARMISD::BR2_JT: return "ARMISD::BR2_JT"; case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG"; case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG"; case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD"; case ARMISD::CMP: return "ARMISD::CMP"; case ARMISD::CMN: return "ARMISD::CMN"; case ARMISD::CMPZ: return "ARMISD::CMPZ"; case ARMISD::CMPFP: return "ARMISD::CMPFP"; case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0"; case ARMISD::BCC_i64: return "ARMISD::BCC_i64"; case ARMISD::FMSTAT: return "ARMISD::FMSTAT"; case ARMISD::CMOV: return "ARMISD::CMOV"; case ARMISD::SSAT: return "ARMISD::SSAT"; case ARMISD::USAT: return "ARMISD::USAT"; case ARMISD::SRL_FLAG: return "ARMISD::SRL_FLAG"; case ARMISD::SRA_FLAG: return "ARMISD::SRA_FLAG"; case ARMISD::RRX: return "ARMISD::RRX"; case ARMISD::ADDC: return "ARMISD::ADDC"; case ARMISD::ADDE: return "ARMISD::ADDE"; case ARMISD::SUBC: return "ARMISD::SUBC"; case ARMISD::SUBE: return "ARMISD::SUBE"; case ARMISD::VMOVRRD: return "ARMISD::VMOVRRD"; case ARMISD::VMOVDRR: return "ARMISD::VMOVDRR"; case ARMISD::VMOVhr: return "ARMISD::VMOVhr"; case ARMISD::VMOVrh: return "ARMISD::VMOVrh"; case ARMISD::VMOVSR: return "ARMISD::VMOVSR"; case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP"; case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH"; case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER"; case ARMISD::DYN_ALLOC: return "ARMISD::DYN_ALLOC"; case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR"; case ARMISD::PRELOAD: return "ARMISD::PRELOAD"; case ARMISD::WIN__CHKSTK: return "ARMISD::WIN__CHKSTK"; case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK"; case ARMISD::VCEQ: return "ARMISD::VCEQ"; case ARMISD::VCEQZ: return "ARMISD::VCEQZ"; case ARMISD::VCGE: return "ARMISD::VCGE"; case ARMISD::VCGEZ: return "ARMISD::VCGEZ"; case ARMISD::VCLEZ: return "ARMISD::VCLEZ"; case ARMISD::VCGEU: return "ARMISD::VCGEU"; case ARMISD::VCGT: return "ARMISD::VCGT"; case ARMISD::VCGTZ: return "ARMISD::VCGTZ"; case ARMISD::VCLTZ: return "ARMISD::VCLTZ"; case ARMISD::VCGTU: return "ARMISD::VCGTU"; case ARMISD::VTST: return "ARMISD::VTST"; case ARMISD::VSHL: return "ARMISD::VSHL"; case ARMISD::VSHRs: return "ARMISD::VSHRs"; case ARMISD::VSHRu: return "ARMISD::VSHRu"; case ARMISD::VRSHRs: return "ARMISD::VRSHRs"; case ARMISD::VRSHRu: return "ARMISD::VRSHRu"; case ARMISD::VRSHRN: return "ARMISD::VRSHRN"; case ARMISD::VQSHLs: return "ARMISD::VQSHLs"; case ARMISD::VQSHLu: return "ARMISD::VQSHLu"; case ARMISD::VQSHLsu: return "ARMISD::VQSHLsu"; case ARMISD::VQSHRNs: return "ARMISD::VQSHRNs"; case ARMISD::VQSHRNu: return "ARMISD::VQSHRNu"; case ARMISD::VQSHRNsu: return "ARMISD::VQSHRNsu"; case ARMISD::VQRSHRNs: return "ARMISD::VQRSHRNs"; case ARMISD::VQRSHRNu: return "ARMISD::VQRSHRNu"; case ARMISD::VQRSHRNsu: return "ARMISD::VQRSHRNsu"; case ARMISD::VSLI: return "ARMISD::VSLI"; case ARMISD::VSRI: return "ARMISD::VSRI"; case ARMISD::VGETLANEu: return "ARMISD::VGETLANEu"; case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; case ARMISD::VDUP: return "ARMISD::VDUP"; case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; case ARMISD::VEXT: return "ARMISD::VEXT"; case ARMISD::VREV64: return "ARMISD::VREV64"; case ARMISD::VREV32: return "ARMISD::VREV32"; case ARMISD::VREV16: return "ARMISD::VREV16"; case ARMISD::VZIP: return "ARMISD::VZIP"; case ARMISD::VUZP: return "ARMISD::VUZP"; case ARMISD::VTRN: return "ARMISD::VTRN"; case ARMISD::VTBL1: return "ARMISD::VTBL1"; case ARMISD::VTBL2: return "ARMISD::VTBL2"; case ARMISD::VMULLs: return "ARMISD::VMULLs"; case ARMISD::VMULLu: return "ARMISD::VMULLu"; case ARMISD::UMAAL: return "ARMISD::UMAAL"; case ARMISD::UMLAL: return "ARMISD::UMLAL"; case ARMISD::SMLAL: return "ARMISD::SMLAL"; case ARMISD::SMLALBB: return "ARMISD::SMLALBB"; case ARMISD::SMLALBT: return "ARMISD::SMLALBT"; case ARMISD::SMLALTB: return "ARMISD::SMLALTB"; case ARMISD::SMLALTT: return "ARMISD::SMLALTT"; case ARMISD::SMULWB: return "ARMISD::SMULWB"; case ARMISD::SMULWT: return "ARMISD::SMULWT"; case ARMISD::SMLALD: return "ARMISD::SMLALD"; case ARMISD::SMLALDX: return "ARMISD::SMLALDX"; case ARMISD::SMLSLD: return "ARMISD::SMLSLD"; case ARMISD::SMLSLDX: return "ARMISD::SMLSLDX"; case ARMISD::SMMLAR: return "ARMISD::SMMLAR"; case ARMISD::SMMLSR: return "ARMISD::SMMLSR"; case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR"; case ARMISD::BFI: return "ARMISD::BFI"; case ARMISD::VORRIMM: return "ARMISD::VORRIMM"; case ARMISD::VBICIMM: return "ARMISD::VBICIMM"; case ARMISD::VBSL: return "ARMISD::VBSL"; case ARMISD::MEMCPY: return "ARMISD::MEMCPY"; case ARMISD::VLD1DUP: return "ARMISD::VLD1DUP"; case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP"; case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP"; case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP"; case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD"; case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD"; case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD"; case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD"; case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD"; case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD"; case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD"; case ARMISD::VLD1DUP_UPD: return "ARMISD::VLD1DUP_UPD"; case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD"; case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD"; case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD"; case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD"; case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD"; case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD"; case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD"; case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD"; case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD"; case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD"; } return nullptr; } EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) return getPointerTy(DL); return VT.changeVectorElementTypeToInteger(); } /// getRegClassFor - Return the register class that should be used for the /// specified value type. const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const { // Map v4i64 to QQ registers but do not make the type legal. Similarly map // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to // load / store 4 to 8 consecutive D registers. if (Subtarget->hasNEON()) { if (VT == MVT::v4i64) return &ARM::QQPRRegClass; if (VT == MVT::v8i64) return &ARM::QQQQPRRegClass; } return TargetLowering::getRegClassFor(VT); } // memcpy, and other memory intrinsics, typically tries to use LDM/STM if the // source/dest is aligned and the copy size is large enough. We therefore want // to align such objects passed to memory intrinsics. bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, unsigned &PrefAlign) const { if (!isa(CI)) return false; MinSize = 8; // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1 // cycle faster than 4-byte aligned LDM. PrefAlign = (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? 8 : 4); return true; } // Create a fast isel object. FastISel * ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return ARM::createFastISel(funcInfo, libInfo); } Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const { unsigned NumVals = N->getNumValues(); if (!NumVals) return Sched::RegPressure; for (unsigned i = 0; i != NumVals; ++i) { EVT VT = N->getValueType(i); if (VT == MVT::Glue || VT == MVT::Other) continue; if (VT.isFloatingPoint() || VT.isVector()) return Sched::ILP; } if (!N->isMachineOpcode()) return Sched::RegPressure; // Load are scheduled for latency even if there instruction itinerary // is not available. const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); if (MCID.getNumDefs() == 0) return Sched::RegPressure; if (!Itins->isEmpty() && Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2) return Sched::ILP; return Sched::RegPressure; } //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// static bool isSRL16(const SDValue &Op) { if (Op.getOpcode() != ISD::SRL) return false; if (auto Const = dyn_cast(Op.getOperand(1))) return Const->getZExtValue() == 16; return false; } static bool isSRA16(const SDValue &Op) { if (Op.getOpcode() != ISD::SRA) return false; if (auto Const = dyn_cast(Op.getOperand(1))) return Const->getZExtValue() == 16; return false; } static bool isSHL16(const SDValue &Op) { if (Op.getOpcode() != ISD::SHL) return false; if (auto Const = dyn_cast(Op.getOperand(1))) return Const->getZExtValue() == 16; return false; } // Check for a signed 16-bit value. We special case SRA because it makes it // more simple when also looking for SRAs that aren't sign extending a // smaller value. Without the check, we'd need to take extra care with // checking order for some operations. static bool isS16(const SDValue &Op, SelectionDAG &DAG) { if (isSRA16(Op)) return isSHL16(Op.getOperand(0)); return DAG.ComputeNumSignBits(Op) == 17; } /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) { switch (CC) { default: llvm_unreachable("Unknown condition code!"); case ISD::SETNE: return ARMCC::NE; case ISD::SETEQ: return ARMCC::EQ; case ISD::SETGT: return ARMCC::GT; case ISD::SETGE: return ARMCC::GE; case ISD::SETLT: return ARMCC::LT; case ISD::SETLE: return ARMCC::LE; case ISD::SETUGT: return ARMCC::HI; case ISD::SETUGE: return ARMCC::HS; case ISD::SETULT: return ARMCC::LO; case ISD::SETULE: return ARMCC::LS; } } /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC. static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) { CondCode2 = ARMCC::AL; InvalidOnQNaN = true; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: case ISD::SETOEQ: CondCode = ARMCC::EQ; InvalidOnQNaN = false; break; case ISD::SETGT: case ISD::SETOGT: CondCode = ARMCC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = ARMCC::GE; break; case ISD::SETOLT: CondCode = ARMCC::MI; break; case ISD::SETOLE: CondCode = ARMCC::LS; break; case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; InvalidOnQNaN = false; break; case ISD::SETO: CondCode = ARMCC::VC; break; case ISD::SETUO: CondCode = ARMCC::VS; break; case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; InvalidOnQNaN = false; break; case ISD::SETUGT: CondCode = ARMCC::HI; break; case ISD::SETUGE: CondCode = ARMCC::PL; break; case ISD::SETLT: case ISD::SETULT: CondCode = ARMCC::LT; break; case ISD::SETLE: case ISD::SETULE: CondCode = ARMCC::LE; break; case ISD::SETNE: case ISD::SETUNE: CondCode = ARMCC::NE; InvalidOnQNaN = false; break; } } //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// #include "ARMGenCallingConv.inc" /// getEffectiveCallingConv - Get the effective calling convention, taking into /// account presence of floating point hardware and calling convention /// limitations, such as support for variadic functions. CallingConv::ID ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC, bool isVarArg) const { switch (CC) { default: report_fatal_error("Unsupported calling convention"); case CallingConv::ARM_AAPCS: case CallingConv::ARM_APCS: case CallingConv::GHC: return CC; case CallingConv::PreserveMost: return CallingConv::PreserveMost; case CallingConv::ARM_AAPCS_VFP: case CallingConv::Swift: return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP; case CallingConv::C: if (!Subtarget->isAAPCS_ABI()) return CallingConv::ARM_APCS; else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && getTargetMachine().Options.FloatABIType == FloatABI::Hard && !isVarArg) return CallingConv::ARM_AAPCS_VFP; else return CallingConv::ARM_AAPCS; case CallingConv::Fast: case CallingConv::CXX_FAST_TLS: if (!Subtarget->isAAPCS_ABI()) { if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::Fast; return CallingConv::ARM_APCS; } else if (Subtarget->hasVFP2() && !Subtarget->isThumb1Only() && !isVarArg) return CallingConv::ARM_AAPCS_VFP; else return CallingConv::ARM_AAPCS; } } CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const { return CCAssignFnForNode(CC, false, isVarArg); } CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const { return CCAssignFnForNode(CC, true, isVarArg); } /// CCAssignFnForNode - Selects the correct CCAssignFn for the given /// CallingConvention. CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const { switch (getEffectiveCallingConv(CC, isVarArg)) { default: report_fatal_error("Unsupported calling convention"); case CallingConv::ARM_APCS: return (Return ? RetCC_ARM_APCS : CC_ARM_APCS); case CallingConv::ARM_AAPCS: return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); case CallingConv::ARM_AAPCS_VFP: return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP); case CallingConv::Fast: return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS); case CallingConv::GHC: return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC); case CallingConv::PreserveMost: return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS); } } /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. SDValue ARMTargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const { // Assign locations to each value returned by this call. SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg)); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference if (i == 0 && isThisReturn) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); continue; } SDValue Val; if (VA.needsCustom()) { // Handle f64 or half of a v2f64. SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Lo.getValue(1); InFlag = Lo.getValue(2); VA = RVLocs[++i]; // skip ahead to next loc SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Hi.getValue(1); InFlag = Hi.getValue(2); if (!Subtarget->isLittle()) std::swap (Lo, Hi); Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); if (VA.getLocVT() == MVT::v2f64) { SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, DAG.getConstant(0, dl, MVT::i32)); VA = RVLocs[++i]; // skip ahead to next loc Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Lo.getValue(1); InFlag = Lo.getValue(2); VA = RVLocs[++i]; // skip ahead to next loc Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag); Chain = Hi.getValue(1); InFlag = Hi.getValue(2); if (!Subtarget->isLittle()) std::swap (Lo, Hi); Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val, DAG.getConstant(1, dl, MVT::i32)); } } else { Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); } switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val); break; } InVals.push_back(Val); } return Chain; } /// LowerMemOpCallTo - Store the argument to the stack. SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const { unsigned LocMemOffset = VA.getLocMemOffset(); SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr, PtrOff); return DAG.getStore( Chain, dl, Arg, PtrOff, MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset)); } void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVectorImpl &MemOpChains, ISD::ArgFlagsTy Flags) const { SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); unsigned id = Subtarget->isLittle() ? 0 : 1; RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id))); if (NextVA.isRegLoc()) RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id))); else { assert(NextVA.isMemLoc()); if (!StackPtr.getNode()) StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id), dl, DAG, NextVA, Flags)); } } /// LowerCall - Lowering a call into a callseq_start <- /// ARMISD:CALL <- callseq_end chain. Also add input and output parameter /// nodes. SDValue ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &dl = CLI.DL; SmallVectorImpl &Outs = CLI.Outs; SmallVectorImpl &OutVals = CLI.OutVals; SmallVectorImpl &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &isTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool doesNotRet = CLI.DoesNotReturn; bool isVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool isThisReturn = false; bool isSibCall = false; auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); // Disable tail calls if they're not supported. if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true") isTailCall = false; if (isTailCall) { // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG); if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // We don't support GuaranteedTailCallOpt for ARM, only automatically // detected sibcalls. if (isTailCall) { ++NumTailCalls; isSibCall = true; } } // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg)); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); // For tail calls, memory operands are available in our caller's stack. if (isSibCall) NumBytes = 0; // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!isSibCall) Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); RegsToPassVector RegsToPass; SmallVector MemOpChains; // Walk the register/memloc assignments, inserting copies/loads. In the case // of tail call optimization, arguments are handled later. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; bool isByVal = Flags.isByVal(); // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::AExt: Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg); break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); break; } // f64 and v2f64 might be passed in i32 pairs and must be split into pieces if (VA.needsCustom()) { if (VA.getLocVT() == MVT::v2f64) { SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(0, dl, MVT::i32)); SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(1, dl, MVT::i32)); PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); VA = ArgLocs[++i]; // skip ahead to next loc if (VA.isRegLoc()) { PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); } else { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags)); } } else { PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i], StackPtr, MemOpChains, Flags); } } else if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i32) { assert(VA.getLocVT() == MVT::i32 && "unexpected calling convention register assignment"); assert(!Ins.empty() && Ins[0].VT == MVT::i32 && "unexpected use of 'returned'"); isThisReturn = true; } RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else if (isByVal) { assert(VA.isMemLoc()); unsigned offset = 0; // True if this byval aggregate will be split between registers // and memory. unsigned ByValArgsCount = CCInfo.getInRegsParamsCount(); unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed(); if (CurByValIdx < ByValArgsCount) { unsigned RegBegin, RegEnd; CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); unsigned int i, j; for (i = 0, j = RegBegin; j < RegEnd; i++, j++) { SDValue Const = DAG.getConstant(4*i, dl, MVT::i32); SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const); SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(), DAG.InferPtrAlignment(AddArg)); MemOpChains.push_back(Load.getValue(1)); RegsToPass.push_back(std::make_pair(j, Load)); } // If parameter size outsides register area, "offset" value // helps us to calculate stack slot for remained part properly. offset = RegEnd - RegBegin; CCInfo.nextInRegsParam(); } if (Flags.getByValSize() > 4*offset) { auto PtrVT = getPointerTy(DAG.getDataLayout()); unsigned LocMemOffset = VA.getLocMemOffset(); SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff); SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl); SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset); SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl, MVT::i32); SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl, MVT::i32); SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode}; MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops)); } } else if (!isSibCall) { assert(VA.isMemLoc()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; // Tail call byval lowering might overwrite argument registers so in case of // tail call optimization the copies to registers are lowered later. if (!isTailCall) for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } // For tail calls lower the arguments to the 'real' stack slot. if (isTailCall) { // Force all the incoming stack arguments to be loaded from the stack // before any new outgoing arguments are stored to the stack, because the // outgoing stack slots may alias the incoming argument stack slots, and // the alias isn't otherwise explicit. This is slightly more conservative // than necessary, because it means that each store effectively depends // on every argument instead of just those arguments it would clobber. // Do not flag preceding copytoreg stuff together with the following stuff. InFlag = SDValue(); for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } InFlag = SDValue(); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. bool isDirect = false; const TargetMachine &TM = getTargetMachine(); const Module *Mod = MF.getFunction().getParent(); const GlobalValue *GV = nullptr; if (GlobalAddressSDNode *G = dyn_cast(Callee)) GV = G->getGlobal(); bool isStub = !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO(); bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass()); bool isLocalARMFunc = false; ARMFunctionInfo *AFI = MF.getInfo(); auto PtrVt = getPointerTy(DAG.getDataLayout()); if (Subtarget->genLongCalls()) { assert((!isPositionIndependent() || Subtarget->isTargetWindows()) && "long-calls codegen is not position independent!"); // Handle a global address or an external symbol. If it's not one of // those, the target's already in a register, so we don't need to do // anything extra. if (isa(Callee)) { // Create a constant pool entry for the callee address unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0); // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } else if (ExternalSymbolSDNode *S=dyn_cast(Callee)) { const char *Sym = S->getSymbol(); // Create a constant pool entry for the callee address unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 0); // Get the address of the callee into a register SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } } else if (isa(Callee)) { // If we're optimizing for minimum size and the function is called three or // more times in this block, we can improve codesize by calling indirectly // as BLXr has a 16-bit encoding. auto *GV = cast(Callee)->getGlobal(); auto *BB = CLI.CS.getParent(); bool PreferIndirect = Subtarget->isThumb() && MF.getFunction().optForMinSize() && count_if(GV->users(), [&BB](const User *U) { return isa(U) && cast(U)->getParent() == BB; }) > 2; if (!PreferIndirect) { isDirect = true; bool isDef = GV->isStrongDefinitionForLinker(); // ARM call to a local ARM function is predicable. isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking); // tBX takes a register source operand. if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?"); Callee = DAG.getNode( ARMISD::WrapperPIC, dl, PtrVt, DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY)); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), Callee, MachinePointerInfo::getGOT(DAG.getMachineFunction()), /* Alignment = */ 0, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); } else if (Subtarget->isTargetCOFF()) { assert(Subtarget->isTargetWindows() && "Windows is the only supported COFF target"); unsigned TargetFlags = GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG; Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0, TargetFlags); if (GV->hasDLLImportStorageClass()) Callee = DAG.getLoad(PtrVt, dl, DAG.getEntryNode(), DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee), MachinePointerInfo::getGOT(DAG.getMachineFunction())); } else { Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0); } } } else if (ExternalSymbolSDNode *S = dyn_cast(Callee)) { isDirect = true; // tBX takes a register source operand. const char *Sym = S->getSymbol(); if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) { unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym, ARMPCLabelIndex, 4); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); Callee = DAG.getLoad( PtrVt, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel); } else { Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0); } } // FIXME: handle tail calls differently. unsigned CallOpc; if (Subtarget->isThumb()) { if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else CallOpc = ARMISD::CALL; } else { if (!isDirect && !Subtarget->hasV5TOps()) CallOpc = ARMISD::CALL_NOLINK; else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() && // Emit regular call when code size is the priority !MF.getFunction().optForMinSize()) // "mov lr, pc; b _foo" to avoid confusing the RSP CallOpc = ARMISD::CALL_NOLINK; else CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL; } std::vector Ops; Ops.push_back(Chain); Ops.push_back(Callee); // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. if (!isTailCall) { const uint32_t *Mask; const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo(); if (isThisReturn) { // For 'this' returns, use the R0-preserving mask if applicable Mask = ARI->getThisReturnPreservedMask(MF, CallConv); if (!Mask) { // Set isThisReturn to false if the calling convention is not one that // allows 'returned' to be modeled in this way, so LowerCallResult does // not try to pass 'this' straight through isThisReturn = false; Mask = ARI->getCallPreservedMask(MF, CallConv); } } else Mask = ARI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); } if (InFlag.getNode()) Ops.push_back(InFlag); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); if (isTailCall) { MF.getFrameInfo().setHasTailCall(); return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops); } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops); InFlag = Chain.getValue(1); Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), DAG.getIntPtrConstant(0, dl, true), InFlag, dl); if (!Ins.empty()) InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG, InVals, isThisReturn, isThisReturn ? OutVals[0] : SDValue()); } /// HandleByVal - Every parameter *after* a byval parameter is passed /// on the stack. Remember the next parameter register to allocate, /// and then confiscate the rest of the parameter registers to insure /// this. void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size, unsigned Align) const { // Byval (as with any stack) slots are always at least 4 byte aligned. Align = std::max(Align, 4U); unsigned Reg = State->AllocateReg(GPRArgRegs); if (!Reg) return; unsigned AlignInRegs = Align / 4; unsigned Waste = (ARM::R4 - Reg) % AlignInRegs; for (unsigned i = 0; i < Waste; ++i) Reg = State->AllocateReg(GPRArgRegs); if (!Reg) return; unsigned Excess = 4 * (ARM::R4 - Reg); // Special case when NSAA != SP and parameter size greater than size of // all remained GPR regs. In that case we can't split parameter, we must // send it to stack. We also must set NCRN to R4, so waste all // remained registers. const unsigned NSAAOffset = State->getNextStackOffset(); if (NSAAOffset != 0 && Size > Excess) { while (State->AllocateReg(GPRArgRegs)) ; return; } // First register for byval parameter is the first register that wasn't // allocated before this method call, so it would be "reg". // If parameter is small enough to be saved in range [reg, r4), then // the end (first after last) register would be reg + param-size-in-regs, // else parameter would be splitted between registers and stack, // end register would be r4 in this case. unsigned ByValRegBegin = Reg; unsigned ByValRegEnd = std::min(Reg + Size / 4, ARM::R4); State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd); // Note, first register is allocated in the beginning of function already, // allocate remained amount of registers we need. for (unsigned i = Reg + 1; i != ByValRegEnd; ++i) State->AllocateReg(GPRArgRegs); // A byval parameter that is split between registers and memory needs its // size truncated here. // In the case where the entire structure fits in registers, we set the // size in memory to zero. Size = std::max(Size - Excess, 0); } /// MatchingStackOffset - Return true if the given stack call argument is /// already available in the same position (relatively) of the caller's /// incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, const TargetInstrInfo *TII) { unsigned Bytes = Arg.getValueSizeInBits() / 8; int FI = std::numeric_limits::max(); if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast(Arg.getOperand(1))->getReg(); if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) return false; if (!Flags.isByVal()) { if (!TII->isLoadFromStackSlot(*Def, FI)) return false; } else { return false; } } else if (LoadSDNode *Ld = dyn_cast(Arg)) { if (Flags.isByVal()) // ByVal argument is passed in as a pointer but it's now being // dereferenced. e.g. // define @foo(%struct.X* %A) { // tail call @bar(%struct.X* byval %A) // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast(Ptr); if (!FINode) return false; FI = FINode->getIndex(); } else return false; assert(FI != std::numeric_limits::max()); if (!MFI.isFixedObjectIndex(FI)) return false; return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI); } /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. bool ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG& DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const Function &CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF.getCallingConv(); assert(Subtarget->supportsTailCall()); // Tail calls to function pointers cannot be optimized for Thumb1 if the args // to the call take up r0-r3. The reason is that there are no legal registers // left to hold the pointer to the function to be called. if (Subtarget->isThumb1Only() && Outs.size() >= 4 && !isa(Callee.getNode())) return false; // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. // Exception-handling functions need a special set of instructions to indicate // a return to the hardware. Tail-calling another function would probably // break this. if (CallerF.hasFnAttribute("interrupt")) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // Externally-defined functions with weak linkage should not be // tail-called on ARM when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls // to undefined weak functions to be replaced with a NOP or jump to the // next instruction. The behaviour of branch instructions in this // situation (as used for tail calls) is implementation-defined, so we // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); const Triple &TT = getTargetMachine().getTargetTriple(); if (GV->hasExternalWeakLinkage() && (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } // Check that the call results are passed in the same way. LLVMContext &C = *DAG.getContext(); if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, CCAssignFnForReturn(CalleeCC, isVarArg), CCAssignFnForReturn(CallerCC, isVarArg))) return false; // The callee has to preserve all registers the caller needs to preserve. const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); if (CalleeCC != CallerCC) { const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) return false; } // If Caller's vararg or byval argument has been split between registers and // stack, do not perform tail call, since part of the argument is in caller's // local frame. const ARMFunctionInfo *AFI_Caller = MF.getInfo(); if (AFI_Caller->getArgRegsSaveSize()) return false; // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector ArgLocs; CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); if (CCInfo.getNextStackOffset()) { // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo &MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; EVT RegVT = VA.getLocVT(); SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (VA.needsCustom()) { // f64 and vector types are split into multiple registers or // register/stack-slot combinations. The types will not match // the registers; give up on memory f64 refs until we figure // out what to do about this. if (!VA.isRegLoc()) return false; if (!ArgLocs[++i].isRegLoc()) return false; if (RegVT == MVT::v2f64) { if (!ArgLocs[++i].isRegLoc()) return false; if (!ArgLocs[++i].isRegLoc()) return false; } } else if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII)) return false; } } } const MachineRegisterInfo &MRI = MF.getRegInfo(); if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) return false; } return true; } bool ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const { SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); } static SDValue LowerInterruptReturn(SmallVectorImpl &RetOps, const SDLoc &DL, SelectionDAG &DAG) { const MachineFunction &MF = DAG.getMachineFunction(); const Function &F = MF.getFunction(); StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString(); // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset // version of the "preferred return address". These offsets affect the return // instruction if this is a return from PL1 without hypervisor extensions. // IRQ/FIQ: +4 "subs pc, lr, #4" // SWI: 0 "subs pc, lr, #0" // ABORT: +4 "subs pc, lr, #4" // UNDEF: +4/+2 "subs pc, lr, #0" // UNDEF varies depending on where the exception came from ARM or Thumb // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0. int64_t LROffset; if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" || IntKind == "ABORT") LROffset = 4; else if (IntKind == "SWI" || IntKind == "UNDEF") LROffset = 0; else report_fatal_error("Unsupported interrupt attribute. If present, value " "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF"); RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, DL, MVT::i32, false)); return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps); } SDValue ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const { // CCValAssign - represent the assignment of the return value to a location. SmallVector RVLocs; // CCState - Info about the registers and stack slots. CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); // Analyze outgoing return values. CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg)); SDValue Flag; SmallVector RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) bool isLittleEndian = Subtarget->isLittle(); MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); AFI->setReturnRegsCount(RVLocs.size()); // Copy the result values into the output registers. for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Arg = OutVals[realRVLocIdx]; bool ReturnF16 = false; if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) { // Half-precision return values can be returned like this: // // t11 f16 = fadd ... // t12: i16 = bitcast t11 // t13: i32 = zero_extend t12 // t14: f32 = bitcast t13 <~~~~~~~ Arg // // to avoid code generation for bitcasts, we simply set Arg to the node // that produces the f16 value, t11 in this case. // if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) { SDValue ZE = Arg.getOperand(0); if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) { SDValue BC = ZE.getOperand(0); if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) { Arg = BC.getOperand(0); ReturnF16 = true; } } } } switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: if (!ReturnF16) Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg); break; } if (VA.needsCustom()) { if (VA.getLocVT() == MVT::v2f64) { // Extract the first half and return it in two registers. SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(0, dl, MVT::i32)); SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Half); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc // Extract the 2nd half and fall through to handle it as an f64 value. Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg, DAG.getConstant(1, dl, MVT::i32)); } // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is // available. SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Arg); Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(isLittleEndian ? 0 : 1), Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); VA = RVLocs[++i]; // skip ahead to next loc Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(isLittleEndian ? 1 : 0), Flag); } else Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag); // Guarantee that all emitted copies are // stuck together, avoiding something bad. Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), ReturnF16 ? MVT::f16 : VA.getLocVT())); } const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); if (I) { for (; *I; ++I) { if (ARM::GPRRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::i32)); else if (ARM::DPRRegClass.contains(*I)) RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64))); else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); } } // Update chain and glue. RetOps[0] = Chain; if (Flag.getNode()) RetOps.push_back(Flag); // CPUs which aren't M-class use a special sequence to return from // exceptions (roughly, any instruction setting pc and cpsr simultaneously, // though we use "subs pc, lr, #N"). // // M-class CPUs actually use a normal return sequence with a special // (hardware-provided) value in LR, so the normal code path works. if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") && !Subtarget->isMClass()) { if (Subtarget->isThumb1Only()) report_fatal_error("interrupt attribute is not supported in Thumb1"); return LowerInterruptReturn(RetOps, dl, DAG); } return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps); } bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() == ARMISD::VMOVRRD) { SDNode *VMov = Copy; // f64 returned in a pair of GPRs. SmallPtrSet Copies; for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != ISD::CopyToReg) return false; Copies.insert(*UI); } if (Copies.size() > 2) return false; for (SDNode::use_iterator UI = VMov->use_begin(), UE = VMov->use_end(); UI != UE; ++UI) { SDValue UseChain = UI->getOperand(0); if (Copies.count(UseChain.getNode())) // Second CopyToReg Copy = *UI; else { // We are at the top of this chain. // If the copy has a glue operand, we conservatively assume it // isn't safe to perform a tail call. if (UI->getOperand(UI->getNumOperands()-1).getValueType() == MVT::Glue) return false; // First CopyToReg TCChain = UseChain; } } } else if (Copy->getOpcode() == ISD::BITCAST) { // f32 returned in a single GPR. if (!Copy->hasOneUse()) return false; Copy = *Copy->use_begin(); if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0)) return false; // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else { return false; } bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != ARMISD::RET_FLAG && UI->getOpcode() != ARMISD::INTRET_FLAG) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { if (!Subtarget->supportsTailCall()) return false; auto Attr = CI->getParent()->getParent()->getFnAttribute("disable-tail-calls"); if (!CI->isTailCall() || Attr.getValueAsString() == "true") return false; return true; } // Trying to write a 64 bit value so need to split into two 32 bit values first, // and pass the lower and high parts through. static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); SDValue WriteValue = Op->getOperand(2); // This function is only supposed to be called for i64 type argument. assert(WriteValue.getValueType() == MVT::i64 && "LowerWRITE_REGISTER called for non-i64 type argument."); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, DAG.getConstant(0, DL, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, WriteValue, DAG.getConstant(1, DL, MVT::i32)); SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi }; return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops); } // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as // their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is // one of the above mentioned nodes. It has to be wrapped because otherwise // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only // be used to form addressing mode. These wrapped nodes will be selected // into MOVi. SDValue ARMTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = Op.getValueType(); // FIXME there is no actual debug info here SDLoc dl(Op); ConstantPoolSDNode *CP = cast(Op); SDValue Res; // When generating execute-only code Constant Pools must be promoted to the // global data section. It's a bit ugly that we can't share them across basic // blocks, but this way we guarantee that execute-only behaves correct with // position-independent addressing modes. if (Subtarget->genExecuteOnly()) { auto AFI = DAG.getMachineFunction().getInfo(); auto T = const_cast(CP->getType()); auto C = const_cast(CP->getConstVal()); auto M = const_cast(DAG.getMachineFunction(). getFunction().getParent()); auto GV = new GlobalVariable( *M, T, /*isConst=*/true, GlobalVariable::InternalLinkage, C, Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" + Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" + Twine(AFI->createPICLabelUId()) ); SDValue GA = DAG.getTargetGlobalAddress(dyn_cast(GV), dl, PtrVT); return LowerGlobalAddress(GA, DAG); } if (CP->isMachineConstantPoolEntry()) Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlignment()); else Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment()); return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res); } unsigned ARMTargetLowering::getJumpTableEncoding() const { return MachineJumpTableInfo::EK_Inline; } SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = 0; SDLoc DL(Op); EVT PtrVT = getPointerTy(DAG.getDataLayout()); const BlockAddress *BA = cast(Op)->getBlockAddress(); SDValue CPAddr; bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI(); if (!IsPositionIndependent) { CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4); } else { unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex, ARMCP::CPBlockAddress, PCAdj); CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); } CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr); SDValue Result = DAG.getLoad( PtrVT, DL, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); if (!IsPositionIndependent) return Result; SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32); return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); } /// Convert a TLS address reference into the correct sequence of loads /// and calls to compute the variable's address for Darwin, and return an /// SDValue containing the final node. /// Darwin only has one TLS scheme which must be capable of dealing with the /// fully general situation, in the worst case. This means: /// + "extern __thread" declaration. /// + Defined in a possibly unknown dynamic library. /// /// The general system is that each __thread variable has a [3 x i32] descriptor /// which contains information used by the runtime to calculate the address. The /// only part of this the compiler needs to know about is the first word, which /// contains a function pointer that must be called with the address of the /// entire descriptor in "r0". /// /// Since this descriptor may be in a different unit, in general access must /// proceed along the usual ARM rules. A common sequence to produce is: /// /// movw rT1, :lower16:_var$non_lazy_ptr /// movt rT1, :upper16:_var$non_lazy_ptr /// ldr r0, [rT1] /// ldr rT2, [r0] /// blx rT2 /// [...address now in r0...] SDValue ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "This function expects a Darwin target"); SDLoc DL(Op); // First step is to get the address of the actua global symbol. This is where // the TLS descriptor lives. SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG); // The first entry in the descriptor is a function pointer that we must call // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad( MVT::i32, DL, Chain, DescAddr, MachinePointerInfo::getGOT(DAG.getMachineFunction()), /* Alignment = */ 4, MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); Chain = FuncTLVGet.getValue(1); MachineFunction &F = DAG.getMachineFunction(); MachineFrameInfo &MFI = F.getFrameInfo(); MFI.setAdjustsStack(true); // TLS calls preserve all registers except those that absolutely must be // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be // silly). auto TRI = getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo(); auto ARI = static_cast(TRI); const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction()); // Finally, we can make the call. This is just a degenerate version of a // normal AArch64 call node: r0 takes the address of the descriptor, and // returns the address of the variable in this thread. Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue()); Chain = DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32), DAG.getRegisterMask(Mask), Chain.getValue(1)); return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1)); } SDValue ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering"); SDValue Chain = DAG.getEntryNode(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); // Load the current TEB (thread environment block) SDValue Ops[] = {Chain, DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), DAG.getConstant(15, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(13, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(2, DL, MVT::i32)}; SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, DAG.getVTList(MVT::i32, MVT::Other), Ops); SDValue TEB = CurrentTEB.getValue(0); Chain = CurrentTEB.getValue(1); // Load the ThreadLocalStoragePointer from the TEB // A pointer to the TLS array is located at offset 0x2c from the TEB. SDValue TLSArray = DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL)); TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo()); // The pointer to the thread's TLS data area is at the TLS Index scaled by 4 // offset into the TLSArray. // Load the TLS index from the C runtime SDValue TLSIndex = DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG); TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex); TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo()); SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex, DAG.getConstant(2, DL, MVT::i32)); SDValue TLS = DAG.getLoad(PtrVT, DL, Chain, DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot), MachinePointerInfo()); // Get the offset of the start of the .tls section (section base) const auto *GA = cast(Op); auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL); SDValue Offset = DAG.getLoad( PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32, DAG.getTargetConstantPool(CPV, PtrVT, 4)), MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset); } // Lower ISD::GlobalTLSAddress using the "general dynamic" model SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const { SDLoc dl(GA); EVT PtrVT = getPointerTy(DAG.getDataLayout()); unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); Argument = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), Argument, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); SDValue Chain = Argument.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); // call __tls_get_addr. ArgListTy Args; ArgListEntry Entry; Entry.Node = Argument; Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext()); Args.push_back(Entry); // FIXME: is there useful debug info available here? TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( CallingConv::C, Type::getInt32Ty(*DAG.getContext()), DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args)); std::pair CallResult = LowerCallTo(CLI); return CallResult.first; } // Lower ISD::GlobalTLSAddress using the "initial exec" or // "local exec" model. SDValue ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const { const GlobalValue *GV = GA->getGlobal(); SDLoc dl(GA); SDValue Offset; SDValue Chain = DAG.getEntryNode(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Get the Thread Pointer SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); if (model == TLSModel::InitialExec) { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); // Initial exec model. unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); Chain = Offset.getValue(1); SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } else { // local exec model assert(model == TLSModel::LocalExec); ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF); Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); Offset = DAG.getLoad( PtrVT, dl, Chain, Offset, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } // The address of the thread local variable is the add of the thread // pointer with the offset of the variable. return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset); } SDValue ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { GlobalAddressSDNode *GA = cast(Op); if (DAG.getTarget().useEmulatedTLS()) return LowerToTLSEmulatedModel(GA, DAG); if (Subtarget->isTargetDarwin()) return LowerGlobalTLSAddressDarwin(Op, DAG); if (Subtarget->isTargetWindows()) return LowerGlobalTLSAddressWindows(Op, DAG); // TODO: implement the "local dynamic" model assert(Subtarget->isTargetELF() && "Only ELF implemented here"); TLSModel::Model model = getTargetMachine().getTLSModel(GA->getGlobal()); switch (model) { case TLSModel::GeneralDynamic: case TLSModel::LocalDynamic: return LowerToTLSGeneralDynamicModel(GA, DAG); case TLSModel::InitialExec: case TLSModel::LocalExec: return LowerToTLSExecModels(GA, DAG, model); } llvm_unreachable("bogus TLS model"); } /// Return true if all users of V are within function F, looking through /// ConstantExprs. static bool allUsersAreInFunction(const Value *V, const Function *F) { SmallVector Worklist; for (auto *U : V->users()) Worklist.push_back(U); while (!Worklist.empty()) { auto *U = Worklist.pop_back_val(); if (isa(U)) { for (auto *UU : U->users()) Worklist.push_back(UU); continue; } auto *I = dyn_cast(U); if (!I || I->getParent()->getParent() != F) return false; } return true; } /// Return true if all users of V are within some (any) function, looking through /// ConstantExprs. In other words, are there any global constant users? static bool allUsersAreInFunctions(const Value *V) { SmallVector Worklist; for (auto *U : V->users()) Worklist.push_back(U); while (!Worklist.empty()) { auto *U = Worklist.pop_back_val(); if (isa(U)) { for (auto *UU : U->users()) Worklist.push_back(UU); continue; } if (!isa(U)) return false; } return true; } // Return true if T is an integer, float or an array/vector of either. static bool isSimpleType(Type *T) { if (T->isIntegerTy() || T->isFloatingPointTy()) return true; Type *SubT = nullptr; if (T->isArrayTy()) SubT = T->getArrayElementType(); else if (T->isVectorTy()) SubT = T->getVectorElementType(); else return false; return SubT->isIntegerTy() || SubT->isFloatingPointTy(); } static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl) { // If we're creating a pool entry for a constant global with unnamed address, // and the global is small enough, we can emit it inline into the constant pool // to save ourselves an indirection. // // This is a win if the constant is only used in one function (so it doesn't // need to be duplicated) or duplicating the constant wouldn't increase code // size (implying the constant is no larger than 4 bytes). const Function &F = DAG.getMachineFunction().getFunction(); // We rely on this decision to inline being idemopotent and unrelated to the // use-site. We know that if we inline a variable at one use site, we'll // inline it elsewhere too (and reuse the constant pool entry). Fast-isel // doesn't know about this optimization, so bail out if it's enabled else // we could decide to inline here (and thus never emit the GV) but require // the GV from fast-isel generated code. if (!EnableConstpoolPromotion || DAG.getMachineFunction().getTarget().Options.EnableFastISel) return SDValue(); auto *GVar = dyn_cast(GV); if (!GVar || !GVar->hasInitializer() || !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() || !GVar->hasLocalLinkage()) return SDValue(); // Ensure that we don't try and inline any type that contains pointers. If // we inline a value that contains relocations, we move the relocations from // .data to .text which is not ideal. auto *Init = GVar->getInitializer(); if (!isSimpleType(Init->getType())) return SDValue(); // The constant islands pass can only really deal with alignment requests // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote // any type wanting greater alignment requirements than 4 bytes. We also // can only promote constants that are multiples of 4 bytes in size or // are paddable to a multiple of 4. Currently we only try and pad constants // that are strings for simplicity. auto *CDAInit = dyn_cast(Init); unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType()); unsigned Align = GVar->getAlignment(); unsigned RequiredPadding = 4 - (Size % 4); bool PaddingPossible = RequiredPadding == 4 || (CDAInit && CDAInit->isString()); if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize || Size == 0) return SDValue(); unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding); MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); // We can't bloat the constant pool too much, else the ConstantIslands pass // may fail to converge. If we haven't promoted this global yet (it may have // multiple uses), and promoting it would increase the constant pool size (Sz // > 4), ensure we have space to do so up to MaxTotal. if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4) if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >= ConstpoolPromotionMaxTotal) return SDValue(); // This is only valid if all users are in a single function OR it has users // in multiple functions but it no larger than a pointer. We also check if // GVar has constant (non-ConstantExpr) users. If so, it essentially has its // address taken. if (!allUsersAreInFunction(GVar, &F) && !(Size <= 4 && allUsersAreInFunctions(GVar))) return SDValue(); // We're going to inline this global. Pad it out if needed. if (RequiredPadding != 4) { StringRef S = CDAInit->getAsString(); SmallVector V(S.size()); std::copy(S.bytes_begin(), S.bytes_end(), V.begin()); while (RequiredPadding--) V.push_back(0); Init = ConstantDataArray::get(*DAG.getContext(), V); } auto CPVal = ARMConstantPoolConstant::Create(GVar, Init); SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4); if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) { AFI->markGlobalAsPromotedToConstantPool(GVar); AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() + PaddedSize - 4); } ++NumConstpoolPromoted; return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); } bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const { if (const GlobalAlias *GA = dyn_cast(GV)) GV = GA->getBaseObject(); return (isa(GV) && cast(GV)->isConstant()) || isa(GV); } SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { switch (Subtarget->getTargetTriple().getObjectFormat()) { default: llvm_unreachable("unknown object format"); case Triple::COFF: return LowerGlobalAddressWindows(Op, DAG); case Triple::ELF: return LowerGlobalAddressELF(Op, DAG); case Triple::MachO: return LowerGlobalAddressDarwin(Op, DAG); } } SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); const TargetMachine &TM = getTargetMachine(); bool IsRO = isReadOnly(GV); // promoteToConstantPool only if not generating XO text section if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly()) if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl)) return V; if (isPositionIndependent()) { bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV); SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, UseGOT_PREL ? ARMII::MO_GOT : 0); SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); if (UseGOT_PREL) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } else if (Subtarget->isROPI() && IsRO) { // PC-relative. SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT); SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G); return Result; } else if (Subtarget->isRWPI() && !IsRO) { // SB-relative. SDValue RelAddr; if (Subtarget->useMovt(DAG.getMachineFunction())) { ++NumMovwMovt; SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL); RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G); } else { // use literal pool for address constant ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, ARMCP::SBREL); SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); RelAddr = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT); SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr); return Result; } // If we have T2 ops, we can materialize the address directly via movt/movw // pair. This is always cheaper. if (Subtarget->useMovt(DAG.getMachineFunction())) { ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. return DAG.getNode(ARMISD::Wrapper, dl, PtrVT, DAG.getTargetGlobalAddress(GV, dl, PtrVT)); } else { SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); return DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); } } SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const { assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && "ROPI/RWPI not currently supported for Darwin"); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc dl(Op); const GlobalValue *GV = cast(Op)->getGlobal(); if (Subtarget->useMovt(DAG.getMachineFunction())) ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into multiple nodes unsigned Wrapper = isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper; SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY); SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G); if (Subtarget->isGVIndirectSymbol(GV)) Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported"); assert(Subtarget->useMovt(DAG.getMachineFunction()) && "Windows on ARM expects to use movw/movt"); assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && "ROPI/RWPI not currently supported for Windows"); const GlobalValue *GV = cast(Op)->getGlobal(); const ARMII::TOF TargetFlags = (GV->hasDLLImportStorageClass() ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue Result; SDLoc DL(Op); ++NumMovwMovt; // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes. Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, TargetFlags)); if (GV->hasDLLImportStorageClass()) Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result, MachinePointerInfo::getGOT(DAG.getMachineFunction())); return Result; } SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); SDValue Val = DAG.getConstant(0, dl, MVT::i32); return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0), Op.getOperand(1), Val); } SDValue ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0), Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32)); } SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, Op.getOperand(0)); } SDValue ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const { unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); SDLoc dl(Op); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. case Intrinsic::thread_pointer: { EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); } case Intrinsic::eh_sjlj_lsda: { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDValue CPAddr; bool IsPositionIndependent = isPositionIndependent(); unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0; ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex, ARMCP::CPLSDA, PCAdj); CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); SDValue Result = DAG.getLoad( PtrVT, dl, DAG.getEntryNode(), CPAddr, MachinePointerInfo::getConstantPool(DAG.getMachineFunction())); if (IsPositionIndependent) { SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32); Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); } return Result; } case Intrinsic::arm_neon_vabs: return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(), Op.getOperand(1)); case Intrinsic::arm_neon_vmulls: case Intrinsic::arm_neon_vmullu: { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls) ? ARMISD::VMULLs : ARMISD::VMULLu; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::arm_neon_vminnm: case Intrinsic::arm_neon_vmaxnm: { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm) ? ISD::FMINNUM : ISD::FMAXNUM; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::arm_neon_vminu: case Intrinsic::arm_neon_vmaxu: { if (Op.getValueType().isFloatingPoint()) return SDValue(); unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu) ? ISD::UMIN : ISD::UMAX; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::arm_neon_vmins: case Intrinsic::arm_neon_vmaxs: { // v{min,max}s is overloaded between signed integers and floats. if (!Op.getValueType().isFloatingPoint()) { unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) ? ISD::SMIN : ISD::SMAX; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins) ? ISD::FMINNAN : ISD::FMAXNAN; return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); } case Intrinsic::arm_neon_vtbl1: return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); case Intrinsic::arm_neon_vtbl2: return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); } } static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { SDLoc dl(Op); ConstantSDNode *SSIDNode = cast(Op.getOperand(2)); auto SSID = static_cast(SSIDNode->getZExtValue()); if (SSID == SyncScope::SingleThread) return Op; if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() && "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!"); return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0), DAG.getConstant(0, dl, MVT::i32)); } ConstantSDNode *OrdN = cast(Op.getOperand(1)); AtomicOrdering Ord = static_cast(OrdN->getZExtValue()); ARM_MB::MemBOpt Domain = ARM_MB::ISH; if (Subtarget->isMClass()) { // Only a full system barrier exists in the M-class architectures. Domain = ARM_MB::SY; } else if (Subtarget->preferISHSTBarriers() && Ord == AtomicOrdering::Release) { // Swift happens to implement ISHST barriers in a way that's compatible with // Release semantics but weaker than ISH so we'd be fools not to use // it. Beware: other processors probably don't! Domain = ARM_MB::ISHST; } return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0), DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32), DAG.getConstant(Domain, dl, MVT::i32)); } static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { // ARM pre v5TE and Thumb1 does not have preload instructions. if (!(Subtarget->isThumb2() || (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps()))) // Just preserve the chain. return Op.getOperand(0); SDLoc dl(Op); unsigned isRead = ~cast(Op.getOperand(2))->getZExtValue() & 1; if (!isRead && (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension())) // ARMv7 with MP extension has PLDW. return Op.getOperand(0); unsigned isData = cast(Op.getOperand(4))->getZExtValue(); if (Subtarget->isThumb()) { // Invert the bits. isRead = ~isRead & 1; isData = ~isData & 1; } return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0), Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32), DAG.getConstant(isData, dl, MVT::i32)); } static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *FuncInfo = MF.getInfo(); // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDLoc dl(Op); EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), MachinePointerInfo(SV)); } SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, const SDLoc &dl) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); const TargetRegisterClass *RC; if (AFI->isThumb1OnlyFunction()) RC = &ARM::tGPRRegClass; else RC = &ARM::GPRRegClass; // Transform the arguments stored in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); SDValue ArgValue2; if (NextVA.isMemLoc()) { MachineFrameInfo &MFI = MF.getFrameInfo(); int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true); // Create load node to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); ArgValue2 = DAG.getLoad( MVT::i32, dl, Root, FIN, MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); } else { Reg = MF.addLiveIn(NextVA.getLocReg(), RC); ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32); } if (!Subtarget->isLittle()) std::swap (ArgValue, ArgValue2); return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2); } // The remaining GPRs hold either the beginning of variable-argument // data, or the beginning of an aggregate passed by value (usually // byval). Either way, we allocate stack slots adjacent to the data // provided by our caller, and store the unallocated registers there. // If this is a variadic function, the va_list pointer will begin with // these values; otherwise, this reassembles a (byval) structure that // was split between registers and memory. // Return: The frame index registers were stored into. int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, const Value *OrigArg, unsigned InRegsParamRecordIdx, int ArgOffset, unsigned ArgSize) const { // Currently, two use-cases possible: // Case #1. Non-var-args function, and we meet first byval parameter. // Setup first unallocated register as first byval register; // eat all remained registers // (these two actions are performed by HandleByVal method). // Then, here, we initialize stack frame with // "store-reg" instructions. // Case #2. Var-args function, that doesn't contain byval parameters. // The same: eat all remained unallocated registers, // initialize stack frame. MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); unsigned RBegin, REnd; if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) { CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd); } else { unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs); RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx]; REnd = ARM::R4; } if (REnd != RBegin) ArgOffset = -4 * (ARM::R4 - RBegin); auto PtrVT = getPointerTy(DAG.getDataLayout()); int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false); SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT); SmallVector MemOps; const TargetRegisterClass *RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) { unsigned VReg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32); SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(OrigArg, 4 * i)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT)); } if (!MemOps.empty()) Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps); return FrameIndex; } // Setup stack frame, the va_list pointer will start from. void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, unsigned ArgOffset, unsigned TotalArgRegsSaveSize, bool ForceMutable) const { MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo(); // Try to store any remaining integer argument regs // to their spots on the stack so that they may be loaded by dereferencing // the result of va_next. // If there is no regs to be stored, just point address after last // argument passed via stack. int FrameIndex = StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(), CCInfo.getNextStackOffset(), 4); AFI->setVarArgsFrameIndex(FrameIndex); } SDValue ARMTargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); ARMFunctionInfo *AFI = MF.getInfo(); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg)); SmallVector ArgValues; SDValue ArgValue; Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin(); unsigned CurArgIdx = 0; // Initially ArgRegsSaveSize is zero. // Then we increase this value each time we meet byval parameter. // We also increase this value in case of varargs function. AFI->setArgRegsSaveSize(0); // Calculate the amount of stack space that we need to allocate to store // byval and variadic arguments that are passed in registers. // We need to know this before we allocate the first byval or variadic // argument, as they will be allocated a stack slot below the CFA (Canonical // Frame Address, the stack pointer at entry to the function). unsigned ArgRegBegin = ARM::R4; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount()) break; CCValAssign &VA = ArgLocs[i]; unsigned Index = VA.getValNo(); ISD::ArgFlagsTy Flags = Ins[Index].Flags; if (!Flags.isByVal()) continue; assert(VA.isMemLoc() && "unexpected byval pointer in reg"); unsigned RBegin, REnd; CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd); ArgRegBegin = std::min(ArgRegBegin, RBegin); CCInfo.nextInRegsParam(); } CCInfo.rewindByValRegsInfo(); int lastInsIndex = -1; if (isVarArg && MFI.hasVAStart()) { unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs); if (RegIdx != array_lengthof(GPRArgRegs)) ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]); } unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin); AFI->setArgRegsSaveSize(TotalArgRegsSaveSize); auto PtrVT = getPointerTy(DAG.getDataLayout()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (Ins[VA.getValNo()].isOrigArg()) { std::advance(CurOrigArg, Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx); CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex(); } // Arguments stored in registers. if (VA.isRegLoc()) { EVT RegVT = VA.getLocVT(); if (VA.needsCustom()) { // f64 and vector types are split up into multiple registers or // combinations of registers and stack slots. if (VA.getLocVT() == MVT::v2f64) { SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); VA = ArgLocs[++i]; // skip ahead to next loc SDValue ArgValue2; if (VA.isMemLoc()) { int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FI)); } else { ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64); ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, ArgValue1, DAG.getIntPtrConstant(0, dl)); ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue, ArgValue2, DAG.getIntPtrConstant(1, dl)); } else ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl); } else { const TargetRegisterClass *RC; if (RegVT == MVT::f16) RC = &ARM::HPRRegClass; else if (RegVT == MVT::f32) RC = &ARM::SPRRegClass; else if (RegVT == MVT::f64 || RegVT == MVT::v4f16) RC = &ARM::DPRRegClass; else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16) RC = &ARM::QPRRegClass; else if (RegVT == MVT::i32) RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); } // If this is an 8 or 16-bit value, it is really passed promoted // to 32 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue); break; case CCValAssign::SExt: ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); break; case CCValAssign::ZExt: ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, DAG.getValueType(VA.getValVT())); ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); break; } InVals.push_back(ArgValue); } else { // VA.isRegLoc() // sanity check assert(VA.isMemLoc()); assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered"); int index = VA.getValNo(); // Some Ins[] entries become multiple ArgLoc[] entries. // Process them only once. if (index != lastInsIndex) { ISD::ArgFlagsTy Flags = Ins[index].Flags; // FIXME: For now, all byval parameter objects are marked mutable. // This can be changed with more analysis. // In case of tail call optimization mark all arguments mutable. // Since they could be overwritten by lowering of arguments in case of // a tail call. if (Flags.isByVal()) { assert(Ins[index].isOrigArg() && "Byval arguments cannot be implicit"); unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed(); int FrameIndex = StoreByValRegs( CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex, VA.getLocMemOffset(), Flags.getByValSize()); InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT)); CCInfo.nextInRegsParam(); } else { unsigned FIOffset = VA.getLocMemOffset(); int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8, FIOffset, true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, MachinePointerInfo::getFixedStack( DAG.getMachineFunction(), FI))); } lastInsIndex = index; } } } // varargs if (isVarArg && MFI.hasVAStart()) VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset(), TotalArgRegsSaveSize); AFI->setArgumentStackSize(CCInfo.getNextStackOffset()); return Chain; } /// isFloatingPointZero - Return true if this is +0.0. static bool isFloatingPointZero(SDValue Op) { if (ConstantFPSDNode *CFP = dyn_cast(Op)) return CFP->getValueAPF().isPosZero(); else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) { // Maybe this has already been legalized into the constant pool? if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) { SDValue WrapperOp = Op.getOperand(1).getOperand(0); if (ConstantPoolSDNode *CP = dyn_cast(WrapperOp)) if (const ConstantFP *CFP = dyn_cast(CP->getConstVal())) return CFP->getValueAPF().isPosZero(); } } else if (Op->getOpcode() == ISD::BITCAST && Op->getValueType(0) == MVT::f64) { // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64) // created by LowerConstantFP(). SDValue BitcastOp = Op->getOperand(0); if (BitcastOp->getOpcode() == ARMISD::VMOVIMM && isNullConstant(BitcastOp->getOperand(0))) return true; } return false; } /// Returns appropriate ARM CMP (cmp) and corresponding condition code for /// the given operands. SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const { if (ConstantSDNode *RHSC = dyn_cast(RHS.getNode())) { unsigned C = RHSC->getZExtValue(); if (!isLegalICmpImmediate((int32_t)C)) { // Constant does not fit, try adjusting it by one. switch (CC) { default: break; case ISD::SETLT: case ISD::SETGE: if (C != 0x80000000 && isLegalICmpImmediate(C-1)) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; RHS = DAG.getConstant(C - 1, dl, MVT::i32); } break; case ISD::SETULT: case ISD::SETUGE: if (C != 0 && isLegalICmpImmediate(C-1)) { CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; RHS = DAG.getConstant(C - 1, dl, MVT::i32); } break; case ISD::SETLE: case ISD::SETGT: if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; RHS = DAG.getConstant(C + 1, dl, MVT::i32); } break; case ISD::SETULE: case ISD::SETUGT: if (C != 0xffffffff && isLegalICmpImmediate(C+1)) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; RHS = DAG.getConstant(C + 1, dl, MVT::i32); } break; } } } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) && (ARM_AM::getShiftOpcForNode(RHS.getOpcode()) == ARM_AM::no_shift)) { // In ARM and Thumb-2, the compare instructions can shift their second // operand. CC = ISD::getSetCCSwappedOperands(CC); std::swap(LHS, RHS); } ARMCC::CondCodes CondCode = IntCCToARMCC(CC); ARMISD::NodeType CompareType; switch (CondCode) { default: CompareType = ARMISD::CMP; break; case ARMCC::EQ: case ARMCC::NE: // Uses only Z Flag CompareType = ARMISD::CMPZ; break; } ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS); } /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands. SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl, bool InvalidOnQNaN) const { assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64); SDValue Cmp; SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32); if (!isFloatingPointZero(RHS)) Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C); else Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C); return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp); } /// duplicateCmp - Glue values can have only one use, so this function /// duplicates a comparison node. SDValue ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const { unsigned Opc = Cmp.getOpcode(); SDLoc DL(Cmp); if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1)); assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation"); Cmp = Cmp.getOperand(0); Opc = Cmp.getOpcode(); if (Opc == ARMISD::CMPFP) Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), Cmp.getOperand(1), Cmp.getOperand(2)); else { assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT"); Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0), Cmp.getOperand(1)); } return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp); } // This function returns three things: the arithmetic computation itself // (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The // comparison and the condition code define the case in which the arithmetic // computation *does not* overflow. std::pair ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const { assert(Op.getValueType() == MVT::i32 && "Unsupported value type"); SDValue Value, OverflowCmp; SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDLoc dl(Op); // FIXME: We are currently always generating CMPs because we don't support // generating CMN through the backend. This is not as good as the natural // CMP case because it causes a register dependency and cannot be folded // later. switch (Op.getOpcode()) { default: llvm_unreachable("Unknown overflow instruction!"); case ISD::SADDO: ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); break; case ISD::UADDO: ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); // We use ADDC here to correspond to its use in LowerUnsignedALUO. // We do not use it in the USUBO case as Value may not be used. Value = DAG.getNode(ARMISD::ADDC, dl, DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS) .getValue(0); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value, LHS); break; case ISD::SSUBO: ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32); Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); break; case ISD::USUBO: ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32); Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, LHS, RHS); break; case ISD::UMULO: // We generate a UMUL_LOHI and then check if the high word is 0. ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); Value = DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(Op.getValueType(), Op.getValueType()), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), DAG.getConstant(0, dl, MVT::i32)); Value = Value.getValue(0); // We only want the low 32 bits for the result. break; case ISD::SMULO: // We generate a SMUL_LOHI and then check if all the bits of the high word // are the same as the sign bit of the low word. ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32); Value = DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(Op.getValueType(), Op.getValueType()), LHS, RHS); OverflowCmp = DAG.getNode(ARMISD::CMP, dl, MVT::Glue, Value.getValue(1), DAG.getNode(ISD::SRA, dl, Op.getValueType(), Value.getValue(0), DAG.getConstant(31, dl, MVT::i32))); Value = Value.getValue(0); // We only want the low 32 bits for the result. break; } // switch (...) return std::make_pair(Value, OverflowCmp); } SDValue ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const { // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) return SDValue(); SDValue Value, OverflowCmp; SDValue ARMcc; std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDLoc dl(Op); // We use 0 and 1 as false and true values. SDValue TVal = DAG.getConstant(1, dl, MVT::i32); SDValue FVal = DAG.getConstant(0, dl, MVT::i32); EVT VT = Op.getValueType(); SDValue Overflow = DAG.getNode(ARMISD::CMOV, dl, VT, TVal, FVal, ARMcc, CCR, OverflowCmp); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry, SelectionDAG &DAG) { SDLoc DL(BoolCarry); EVT CarryVT = BoolCarry.getValueType(); // This converts the boolean value carry into the carry flag by doing // ARMISD::SUBC Carry, 1 SDValue Carry = DAG.getNode(ARMISD::SUBC, DL, DAG.getVTList(CarryVT, MVT::i32), BoolCarry, DAG.getConstant(1, DL, CarryVT)); return Carry.getValue(1); } static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT, SelectionDAG &DAG) { SDLoc DL(Flags); // Now convert the carry flag into a boolean carry. We do this // using ARMISD:ADDE 0, 0, Carry return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32), Flags); } SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op, SelectionDAG &DAG) const { // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) return SDValue(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDLoc dl(Op); EVT VT = Op.getValueType(); SDVTList VTs = DAG.getVTList(VT, MVT::i32); SDValue Value; SDValue Overflow; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown overflow instruction!"); case ISD::UADDO: Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS); // Convert the carry flag into a boolean value. Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); break; case ISD::USUBO: { Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS); // Convert the carry flag into a boolean value. Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG); // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow // value. So compute 1 - C. Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32, DAG.getConstant(1, dl, MVT::i32), Overflow); break; } } return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow); } SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue Cond = Op.getOperand(0); SDValue SelectTrue = Op.getOperand(1); SDValue SelectFalse = Op.getOperand(2); SDLoc dl(Op); unsigned Opc = Cond.getOpcode(); if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO)) { if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) return SDValue(); SDValue Value, OverflowCmp; SDValue ARMcc; std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); EVT VT = Op.getValueType(); return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, CCR, OverflowCmp, DAG); } // Convert: // // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond) // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond) // if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) { const ConstantSDNode *CMOVTrue = dyn_cast(Cond.getOperand(0)); const ConstantSDNode *CMOVFalse = dyn_cast(Cond.getOperand(1)); if (CMOVTrue && CMOVFalse) { unsigned CMOVTrueVal = CMOVTrue->getZExtValue(); unsigned CMOVFalseVal = CMOVFalse->getZExtValue(); SDValue True; SDValue False; if (CMOVTrueVal == 1 && CMOVFalseVal == 0) { True = SelectTrue; False = SelectFalse; } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) { True = SelectFalse; False = SelectTrue; } if (True.getNode() && False.getNode()) { EVT VT = Op.getValueType(); SDValue ARMcc = Cond.getOperand(2); SDValue CCR = Cond.getOperand(3); SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG); assert(True.getValueType() == VT); return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG); } } } // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the // undefined bits before doing a full-word comparison with zero. Cond = DAG.getNode(ISD::AND, dl, Cond.getValueType(), Cond, DAG.getConstant(1, dl, Cond.getValueType())); return DAG.getSelectCC(dl, Cond, DAG.getConstant(0, dl, Cond.getValueType()), SelectTrue, SelectFalse, ISD::SETNE); } static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps) { // Start by selecting the GE condition code for opcodes that return true for // 'equality' if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE || CC == ISD::SETULE) CondCode = ARMCC::GE; // and GT for opcodes that return false for 'equality'. else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT || CC == ISD::SETULT) CondCode = ARMCC::GT; // Since we are constrained to GE/GT, if the opcode contains 'less', we need // to swap the compare operands. if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT || CC == ISD::SETULT) swpCmpOps = true; // Both GT and GE are ordered comparisons, and return false for 'unordered'. // If we have an unordered opcode, we need to swap the operands to the VSEL // instruction (effectively negating the condition). // // This also has the effect of swapping which one of 'less' or 'greater' // returns true, so we also swap the compare operands. It also switches // whether we return true for 'equality', so we compensate by picking the // opposite condition code to our original choice. if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE || CC == ISD::SETUGT) { swpCmpOps = !swpCmpOps; swpVselOps = !swpVselOps; CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT; } // 'ordered' is 'anything but unordered', so use the VS condition code and // swap the VSEL operands. if (CC == ISD::SETO) { CondCode = ARMCC::VS; swpVselOps = true; } // 'unordered or not equal' is 'anything but equal', so use the EQ condition // code and swap the VSEL operands. if (CC == ISD::SETUNE) { CondCode = ARMCC::EQ; swpVselOps = true; } } SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue CCR, SDValue Cmp, SelectionDAG &DAG) const { if (Subtarget->isFPOnlySP() && VT == MVT::f64) { FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), FalseVal); TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), TrueVal); SDValue TrueLow = TrueVal.getValue(0); SDValue TrueHigh = TrueVal.getValue(1); SDValue FalseLow = FalseVal.getValue(0); SDValue FalseHigh = FalseVal.getValue(1); SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow, ARMcc, CCR, Cmp); SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh, ARMcc, CCR, duplicateCmp(Cmp, DAG)); return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High); } else { return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp); } } static bool isGTorGE(ISD::CondCode CC) { return CC == ISD::SETGT || CC == ISD::SETGE; } static bool isLTorLE(ISD::CondCode CC) { return CC == ISD::SETLT || CC == ISD::SETLE; } // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating. // All of these conditions (and their <= and >= counterparts) will do: // x < k ? k : x // x > k ? x : k // k < x ? x : k // k > x ? k : x static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K) { return (isGTorGE(CC) && ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) || (isLTorLE(CC) && ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))); } // Similar to isLowerSaturate(), but checks for upper-saturating conditions. static bool isUpperSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K) { return (isGTorGE(CC) && ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal))) || (isLTorLE(CC) && ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))); } // Check if two chained conditionals could be converted into SSAT or USAT. // // SSAT can replace a set of two conditional selectors that bound a number to an // interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples: // // x < -k ? -k : (x > k ? k : x) // x < -k ? -k : (x < k ? x : k) // x > -k ? (x > k ? k : x) : -k // x < k ? (x < -k ? -k : x) : k // etc. // // USAT works similarily to SSAT but bounds on the interval [0, k] where k + 1 is // a power of 2. // // It returns true if the conversion can be done, false otherwise. // Additionally, the variable is returned in parameter V, the constant in K and // usat is set to true if the conditional represents an unsigned saturation static bool isSaturatingConditional(const SDValue &Op, SDValue &V, uint64_t &K, bool &usat) { SDValue LHS1 = Op.getOperand(0); SDValue RHS1 = Op.getOperand(1); SDValue TrueVal1 = Op.getOperand(2); SDValue FalseVal1 = Op.getOperand(3); ISD::CondCode CC1 = cast(Op.getOperand(4))->get(); const SDValue Op2 = isa(TrueVal1) ? FalseVal1 : TrueVal1; if (Op2.getOpcode() != ISD::SELECT_CC) return false; SDValue LHS2 = Op2.getOperand(0); SDValue RHS2 = Op2.getOperand(1); SDValue TrueVal2 = Op2.getOperand(2); SDValue FalseVal2 = Op2.getOperand(3); ISD::CondCode CC2 = cast(Op2.getOperand(4))->get(); // Find out which are the constants and which are the variables // in each conditional SDValue *K1 = isa(LHS1) ? &LHS1 : isa(RHS1) ? &RHS1 : nullptr; SDValue *K2 = isa(LHS2) ? &LHS2 : isa(RHS2) ? &RHS2 : nullptr; SDValue K2Tmp = isa(TrueVal2) ? TrueVal2 : FalseVal2; SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1; SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2; SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2; // We must detect cases where the original operations worked with 16- or // 8-bit values. In such case, V2Tmp != V2 because the comparison operations // must work with sign-extended values but the select operations return // the original non-extended value. SDValue V2TmpReg = V2Tmp; if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG) V2TmpReg = V2Tmp->getOperand(0); // Check that the registers and the constants have the correct values // in both conditionals if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp || V2TmpReg != V2) return false; // Figure out which conditional is saturating the lower/upper bound. const SDValue *LowerCheckOp = isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) ? &Op : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2 : nullptr; const SDValue *UpperCheckOp = isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1) ? &Op : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2) ? &Op2 : nullptr; if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp) return false; // Check that the constant in the lower-bound check is // the opposite of the constant in the upper-bound check // in 1's complement. int64_t Val1 = cast(*K1)->getSExtValue(); int64_t Val2 = cast(*K2)->getSExtValue(); int64_t PosVal = std::max(Val1, Val2); int64_t NegVal = std::min(Val1, Val2); if (((Val1 > Val2 && UpperCheckOp == &Op) || (Val1 < Val2 && UpperCheckOp == &Op2)) && isPowerOf2_64(PosVal + 1)) { // Handle the difference between USAT (unsigned) and SSAT (signed) saturation if (Val1 == ~Val2) usat = false; else if (NegVal == 0) usat = true; else return false; V = V2; K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive return true; } return false; } // Check if a condition of the type x < k ? k : x can be converted into a // bit operation instead of conditional moves. // Currently this is allowed given: // - The conditions and values match up // - k is 0 or -1 (all ones) // This function will not check the last condition, thats up to the caller // It returns true if the transformation can be made, and in such case // returns x in V, and k in SatK. static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK) { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue TrueVal = Op.getOperand(2); SDValue FalseVal = Op.getOperand(3); SDValue *K = isa(LHS) ? &LHS : isa(RHS) ? &RHS : nullptr; // No constant operation in comparison, early out if (!K) return false; SDValue KTmp = isa(TrueVal) ? TrueVal : FalseVal; V = (KTmp == TrueVal) ? FalseVal : TrueVal; SDValue VTmp = (K && *K == LHS) ? RHS : LHS; // If the constant on left and right side, or variable on left and right, // does not match, early out if (*K != KTmp || V != VTmp) return false; if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) { SatK = *K; return true; } return false; } SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc dl(Op); // Try to convert two saturating conditional selects into a single SSAT SDValue SatValue; uint64_t SatConstant; bool SatUSat; if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) && isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) { if (SatUSat) return DAG.getNode(ARMISD::USAT, dl, VT, SatValue, DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); else return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue, DAG.getConstant(countTrailingOnes(SatConstant), dl, VT)); } // Try to convert expressions of the form x < k ? k : x (and similar forms) // into more efficient bit operations, which is possible when k is 0 or -1 // On ARM and Thumb-2 which have flexible operand 2 this will result in // single instructions. On Thumb the shift and the bit operation will be two // instructions. // Only allow this transformation on full-width (32-bit) operations SDValue LowerSatConstant; if (VT == MVT::i32 && isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) { SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue, DAG.getConstant(31, dl, VT)); if (isNullConstant(LowerSatConstant)) { SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV, DAG.getAllOnesConstant(dl, VT)); return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV); } else if (isAllOnesConstant(LowerSatConstant)) return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV); } SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(4))->get(); SDValue TrueVal = Op.getOperand(2); SDValue FalseVal = Op.getOperand(3); if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, dl); // If softenSetCCOperands only returned one value, we should compare it to // zero. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } if (LHS.getValueType() == MVT::i32) { // Try to generate VSEL on ARMv8. // The VSEL instruction can't use all the usual ARM condition // codes: it only has two bits to select the condition code, so it's // constrained to use only GE, GT, VS and EQ. // // To implement all the various ISD::SETXXX opcodes, we sometimes need to // swap the operands of the previous compare instruction (effectively // inverting the compare condition, swapping 'less' and 'greater') and // sometimes need to swap the operands to the VSEL (which inverts the // condition in the sense of firing whenever the previous condition didn't) if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 || TrueVal.getValueType() == MVT::f64)) { ARMCC::CondCodes CondCode = IntCCToARMCC(CC); if (CondCode == ARMCC::LT || CondCode == ARMCC::LE || CondCode == ARMCC::VC || CondCode == ARMCC::NE) { CC = ISD::getSetCCInverse(CC, true); std::swap(TrueVal, FalseVal); } } SDValue ARMcc; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); } ARMCC::CondCodes CondCode, CondCode2; bool InvalidOnQNaN; FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); // Normalize the fp compare. If RHS is zero we keep it there so we match // CMPFPw0 instead of CMPFP. if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) && (TrueVal.getValueType() == MVT::f16 || TrueVal.getValueType() == MVT::f32 || TrueVal.getValueType() == MVT::f64)) { bool swpCmpOps = false; bool swpVselOps = false; checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps); if (CondCode == ARMCC::GT || CondCode == ARMCC::GE || CondCode == ARMCC::VS || CondCode == ARMCC::EQ) { if (swpCmpOps) std::swap(LHS, RHS); if (swpVselOps) std::swap(TrueVal, FalseVal); } } SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG); if (CondCode2 != ARMCC::AL) { SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32); // FIXME: Needs another CMP because flag can have but one use. SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG); } return Result; } /// canChangeToInt - Given the fp compare operand, return true if it is suitable /// to morph to an integer compare sequence. static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget) { SDNode *N = Op.getNode(); if (!N->hasOneUse()) // Otherwise it requires moving the value from fp to integer registers. return false; if (!N->getNumValues()) return false; EVT VT = Op.getValueType(); if (VT != MVT::f32 && !Subtarget->isFPBrccSlow()) // f32 case is generally profitable. f64 case only makes sense when vcmpe + // vmrs are very slow, e.g. cortex-a8. return false; if (isFloatingPointZero(Op)) { SeenZero = true; return true; } return ISD::isNormalLoad(N); } static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) { if (isFloatingPointZero(Op)) return DAG.getConstant(0, SDLoc(Op), MVT::i32); if (LoadSDNode *Ld = dyn_cast(Op)) return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); llvm_unreachable("Unknown VFP cmp argument!"); } static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2) { SDLoc dl(Op); if (isFloatingPointZero(Op)) { RetVal1 = DAG.getConstant(0, dl, MVT::i32); RetVal2 = DAG.getConstant(0, dl, MVT::i32); return; } if (LoadSDNode *Ld = dyn_cast(Op)) { SDValue Ptr = Ld->getBasePtr(); RetVal1 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->getAlignment(), Ld->getMemOperand()->getFlags()); EVT PtrType = Ptr.getValueType(); unsigned NewAlign = MinAlign(Ld->getAlignment(), 4); SDValue NewPtr = DAG.getNode(ISD::ADD, dl, PtrType, Ptr, DAG.getConstant(4, dl, PtrType)); RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr, Ld->getPointerInfo().getWithOffset(4), NewAlign, Ld->getMemOperand()->getFlags()); return; } llvm_unreachable("Unknown VFP cmp argument!"); } /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some /// f32 and even f64 comparisons to integer ones. SDValue ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc dl(Op); bool LHSSeenZero = false; bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget); bool RHSSeenZero = false; bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget); if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) { // If unsafe fp math optimization is enabled and there are no other uses of // the CMP operands, and the condition code is EQ or NE, we can optimize it // to an integer comparison. if (CC == ISD::SETOEQ) CC = ISD::SETEQ; else if (CC == ISD::SETUNE) CC = ISD::SETNE; SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32); SDValue ARMcc; if (LHS.getValueType() == MVT::f32) { LHS = DAG.getNode(ISD::AND, dl, MVT::i32, bitcastf32Toi32(LHS, DAG), Mask); RHS = DAG.getNode(ISD::AND, dl, MVT::i32, bitcastf32Toi32(RHS, DAG), Mask); SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, Cmp); } SDValue LHS1, LHS2; SDValue RHS1, RHS2; expandf64Toi32(LHS, DAG, LHS1, LHS2); expandf64Toi32(RHS, DAG, RHS1, RHS2); LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask); RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask); ARMCC::CondCodes CondCode = IntCCToARMCC(CC); ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest }; return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops); } return SDValue(); } SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Cond = Op.getOperand(1); SDValue Dest = Op.getOperand(2); SDLoc dl(Op); // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. unsigned Opc = Cond.getOpcode(); bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && !Subtarget->isThumb1Only(); if (Cond.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || OptimizeMul)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0))) return SDValue(); // The actual operation with overflow check. SDValue Value, OverflowCmp; SDValue ARMcc; std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc); // Reverse the condition code. ARMCC::CondCodes CondCode = (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); CondCode = ARMCC::getOppositeCondition(CondCode); ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, OverflowCmp); } return SDValue(); } SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc dl(Op); if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) { DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC, dl); // If softenSetCCOperands only returned one value, we should compare it to // zero. if (!RHS.getNode()) { RHS = DAG.getConstant(0, dl, LHS.getValueType()); CC = ISD::SETNE; } } // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. unsigned Opc = LHS.getOpcode(); bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) && !Subtarget->isThumb1Only(); if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || OptimizeMul) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) return SDValue(); // The actual operation with overflow check. SDValue Value, OverflowCmp; SDValue ARMcc; std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc); if ((CC == ISD::SETNE) != isOneConstant(RHS)) { // Reverse the condition code. ARMCC::CondCodes CondCode = (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); CondCode = ARMCC::getOppositeCondition(CondCode); ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32); } SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, OverflowCmp); } if (LHS.getValueType() == MVT::i32) { SDValue ARMcc; SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR, Cmp); } if (getTargetMachine().Options.UnsafeFPMath && (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE || CC == ISD::SETUNE)) { if (SDValue Result = OptimizeVFPBrcond(Op, DAG)) return Result; } ARMCC::CondCodes CondCode, CondCode2; bool InvalidOnQNaN; FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN); SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32); SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue); SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp }; SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); if (CondCode2 != ARMCC::AL) { ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32); SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) }; Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops); } return Res; } SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); SDValue Table = Op.getOperand(1); SDValue Index = Op.getOperand(2); SDLoc dl(Op); EVT PTy = getPointerTy(DAG.getDataLayout()); JumpTableSDNode *JT = cast(Table); SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI); Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy)); SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index); if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) { // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table // which does another jump to the destination. This also makes it easier // to translate it to TBB / TBH later (Thumb2 only). // FIXME: This might not work if the function is extremely large. return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain, Addr, Op.getOperand(2), JTI); } if (isPositionIndependent() || Subtarget->isROPI()) { Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr, MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); Chain = Addr.getValue(1); Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } else { Addr = DAG.getLoad(PTy, dl, Chain, Addr, MachinePointerInfo::getJumpTable(DAG.getMachineFunction())); Chain = Addr.getValue(1); return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI); } } static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); SDLoc dl(Op); if (Op.getValueType().getVectorElementType() == MVT::i32) { if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32) return Op; return DAG.UnrollVectorOp(Op.getNode()); } assert(Op.getOperand(0).getValueType() == MVT::v4f32 && "Invalid type for custom lowering!"); if (VT != MVT::v4i16) return DAG.UnrollVectorOp(Op.getNode()); Op = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Op); } SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorFP_TO_INT(Op, DAG); if (Subtarget->isFPOnlySP() && Op.getOperand(0).getValueType() == MVT::f64) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } return Op; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); SDLoc dl(Op); if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) { if (VT.getVectorElementType() == MVT::f32) return Op; return DAG.UnrollVectorOp(Op.getNode()); } assert(Op.getOperand(0).getValueType() == MVT::v4i16 && "Invalid type for custom lowering!"); if (VT != MVT::v4f32) return DAG.UnrollVectorOp(Op.getNode()); unsigned CastOpc; unsigned Opc; switch (Op.getOpcode()) { default: llvm_unreachable("Invalid opcode!"); case ISD::SINT_TO_FP: CastOpc = ISD::SIGN_EXTEND; Opc = ISD::SINT_TO_FP; break; case ISD::UINT_TO_FP: CastOpc = ISD::ZERO_EXTEND; Opc = ISD::UINT_TO_FP; break; } Op = DAG.getNode(CastOpc, dl, MVT::v4i32, Op.getOperand(0)); return DAG.getNode(Opc, dl, VT, Op); } SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); if (VT.isVector()) return LowerVectorINT_TO_FP(Op, DAG); if (Subtarget->isFPOnlySP() && Op.getValueType() == MVT::f64) { RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0), /*isSigned*/ false, SDLoc(Op)).first; } return Op; } SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { // Implement fcopysign with a fabs and a conditional fneg. SDValue Tmp0 = Op.getOperand(0); SDValue Tmp1 = Op.getOperand(1); SDLoc dl(Op); EVT VT = Op.getValueType(); EVT SrcVT = Tmp1.getValueType(); bool InGPR = Tmp0.getOpcode() == ISD::BITCAST || Tmp0.getOpcode() == ARMISD::VMOVDRR; bool UseNEON = !InGPR && Subtarget->hasNEON(); if (UseNEON) { // Use VBSL to copy the sign bit. unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80); SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32, DAG.getTargetConstant(EncodedVal, dl, MVT::i32)); EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64; if (VT == MVT::f64) Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Mask), DAG.getConstant(32, dl, MVT::i32)); else /*if (VT == MVT::f32)*/ Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0); if (SrcVT == MVT::f32) { Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1); if (VT == MVT::f64) Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT, DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1), DAG.getConstant(32, dl, MVT::i32)); } else if (VT == MVT::f32) Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64, DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1), DAG.getConstant(32, dl, MVT::i32)); Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0); Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1); SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff), dl, MVT::i32); AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes); SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask, DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes)); SDValue Res = DAG.getNode(ISD::OR, dl, OpVT, DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask), DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot)); if (VT == MVT::f32) { Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res); Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, DAG.getConstant(0, dl, MVT::i32)); } else { Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res); } return Res; } // Bitcast operand 1 to i32. if (SrcVT == MVT::f64) Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Tmp1).getValue(1); Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1); // Or in the signbit with integer operations. SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32); SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32); Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1); if (VT == MVT::f32) { Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32, DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2); return DAG.getNode(ISD::BITCAST, dl, MVT::f32, DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1)); } // f64: Or the high part with signbit and then combine two parts. Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Tmp0); SDValue Lo = Tmp0.getValue(0); SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2); Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1); return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi); } SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{ MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setReturnAddressIsTaken(true); if (verifyReturnAddressArgumentIsConstant(Op, DAG)) return SDValue(); EVT VT = Op.getValueType(); SDLoc dl(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(4, dl, MVT::i32); return DAG.getLoad(VT, dl, DAG.getEntryNode(), DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset), MachinePointerInfo()); } // Return LR, which contains the return address. Mark it an implicit live-in. unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32)); return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); } SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { const ARMBaseRegisterInfo &ARI = *static_cast(RegInfo); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); MFI.setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc dl(Op); // FIXME probably not meaningful unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); unsigned FrameReg = ARI.getFrameRegister(MF); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, MachinePointerInfo()); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const { unsigned Reg = StringSwitch(RegName) .Case("sp", ARM::SP) .Default(0); if (Reg) return Reg; report_fatal_error(Twine("Invalid register name \"" + StringRef(RegName) + "\".")); } // Result is 64 bit value so split into two 32 bit values and return as a // pair of values. static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) { SDLoc DL(N); // This function is only supposed to be called for i64 type destination. assert(N->getValueType(0) == MVT::i64 && "ExpandREAD_REGISTER called for non-i64 type result."); SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL, DAG.getVTList(MVT::i32, MVT::i32, MVT::Other), N->getOperand(0), N->getOperand(1)); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0), Read.getValue(1))); Results.push_back(Read.getOperand(0)); } /// \p BC is a bitcast that is about to be turned into a VMOVDRR. /// When \p DstVT, the destination type of \p BC, is on the vector /// register bank and the source of bitcast, \p Op, operates on the same bank, /// it might be possible to combine them, such that everything stays on the /// vector register bank. /// \p return The node that would replace \p BT, if the combine /// is possible. static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG) { SDValue Op = BC->getOperand(0); EVT DstVT = BC->getValueType(0); // The only vector instruction that can produce a scalar (remember, // since the bitcast was about to be turned into VMOVDRR, the source // type is i64) from a vector is EXTRACT_VECTOR_ELT. // Moreover, we can do this combine only if there is one use. // Finally, if the destination type is not a vector, there is not // much point on forcing everything on the vector bank. if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !Op.hasOneUse()) return SDValue(); // If the index is not constant, we will introduce an additional // multiply that will stick. // Give up in that case. ConstantSDNode *Index = dyn_cast(Op.getOperand(1)); if (!Index) return SDValue(); unsigned DstNumElt = DstVT.getVectorNumElements(); // Compute the new index. const APInt &APIntIndex = Index->getAPIntValue(); APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt); NewIndex *= APIntIndex; // Check if the new constant index fits into i32. if (NewIndex.getBitWidth() > 32) return SDValue(); // vMTy bitcast(i64 extractelt vNi64 src, i32 index) -> // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M) SDLoc dl(Op); SDValue ExtractSrc = Op.getOperand(0); EVT VecVT = EVT::getVectorVT( *DAG.getContext(), DstVT.getScalarType(), ExtractSrc.getValueType().getVectorNumElements() * DstNumElt); SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast, DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32)); } /// ExpandBITCAST - If the target supports VFP, this function is called to /// expand a bit convert where either the source or destination type is i64 to /// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64 /// operand type is illegal (e.g., v2f32 for a target that doesn't support /// vectors), since the legalizer won't know what to do with that. static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(N); SDValue Op = N->getOperand(0); // This function is only supposed to be called for i64 types, either as the // source or destination of the bit convert. EVT SrcVT = Op.getValueType(); EVT DstVT = N->getValueType(0); const bool HasFullFP16 = Subtarget->hasFullFP16(); if (SrcVT == MVT::f32 && DstVT == MVT::i32) { // FullFP16: half values are passed in S-registers, and we don't // need any of the bitcast and moves: // // t2: f32,ch = CopyFromReg t0, Register:f32 %0 // t5: i32 = bitcast t2 // t18: f16 = ARMISD::VMOVhr t5 if (Op.getOpcode() != ISD::CopyFromReg || Op.getValueType() != MVT::f32) return SDValue(); auto Move = N->use_begin(); if (Move->getOpcode() != ARMISD::VMOVhr) return SDValue(); SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) }; SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops); DAG.ReplaceAllUsesWith(*Move, &Copy); return Copy; } if (SrcVT == MVT::i16 && DstVT == MVT::f16) { if (!HasFullFP16) return SDValue(); // SoftFP: read half-precision arguments: // // t2: i32,ch = ... // t7: i16 = truncate t2 <~~~~ Op // t8: f16 = bitcast t7 <~~~~ N // if (Op.getOperand(0).getValueType() == MVT::i32) return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op), MVT::f16, Op.getOperand(0)); return SDValue(); } // Half-precision return values if (SrcVT == MVT::f16 && DstVT == MVT::i16) { if (!HasFullFP16) return SDValue(); // // t11: f16 = fadd t8, t10 // t12: i16 = bitcast t11 <~~~ SDNode N // t13: i32 = zero_extend t12 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13 // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1 // // transform this into: // // t20: i32 = ARMISD::VMOVrh t11 // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20 // auto ZeroExtend = N->use_begin(); if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND || ZeroExtend->getValueType(0) != MVT::i32) return SDValue(); auto Copy = ZeroExtend->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg && Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) { SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op); DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt); return Cvt; } return SDValue(); } if (!(SrcVT == MVT::i64 || DstVT == MVT::i64)) return SDValue(); // Turn i64->f64 into VMOVDRR. if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) { // Do not force values to GPRs (this is what VMOVDRR does for the inputs) // if we can combine the bitcast with its source. if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG)) return Val; SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op, DAG.getConstant(1, dl, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, DstVT, DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi)); } // Turn f64->i64 into VMOVRRD. if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) { SDValue Cvt; if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() && SrcVT.getVectorNumElements() > 1) Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op)); else Cvt = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), Op); // Merge the pieces into a single i64 value. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1)); } return SDValue(); } /// getZeroVector - Returns a vector of specified type with all zero elements. /// Zero vectors are used to represent vector negation and in those cases /// will be implemented with the NEON VNEG instruction. However, VNEG does /// not support i64 elements, so sometimes the zero vectors will need to be /// explicitly constructed. Regardless, use a canonical VMOV to create the /// zero vector. static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { assert(VT.isVector() && "Expected a vector type"); // The canonical modified immediate encoding of a zero vector is....0! SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32); EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } /// LowerShiftRightParts - Lower SRA_PARTS, which returns two /// i32 values and take a 2 x i32 value to shift plus a shift amount. SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); SDValue ARMcc; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, DAG.getConstant(VTBits, dl, MVT::i32)); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), ISD::SETGE, ARMcc, DAG, dl); SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CCR, CmpLo); SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); SDValue HiBigShift = Opc == ISD::SRA ? DAG.getNode(Opc, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, dl, VT)) : DAG.getConstant(0, dl, VT); SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), ISD::SETGE, ARMcc, DAG, dl); SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CCR, CmpHi); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i32 values and take a 2 x i32 value to shift plus a shift amount. SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); SDValue ARMcc; SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, DAG.getConstant(VTBits, dl, MVT::i32), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt, DAG.getConstant(VTBits, dl, MVT::i32)); SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), ISD::SETGE, ARMcc, DAG, dl); SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CCR, CmpHi); SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32), ISD::SETGE, ARMcc, DAG, dl); SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const { // The rounding mode is in bits 23:22 of the FPSCR. // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3) // so that the shift + and get folded into a bitfield extract. SDLoc dl(Op); SDValue Ops[] = { DAG.getEntryNode(), DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) }; SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops); SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, DAG.getConstant(1U << 22, dl, MVT::i32)); SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds, DAG.getConstant(22, dl, MVT::i32)); return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE, DAG.getConstant(3, dl, MVT::i32)); } static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { SDLoc dl(N); EVT VT = N->getValueType(0); if (VT.isVector()) { assert(ST->hasNEON()); // Compute the least significant set bit: LSB = X & -X SDValue X = N->getOperand(0); SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X); SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX); EVT ElemTy = VT.getVectorElementType(); if (ElemTy == MVT::i8) { // Compute with: cttz(x) = ctpop(lsb - 1) SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(1, dl, ElemTy)); SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); return DAG.getNode(ISD::CTPOP, dl, VT, Bits); } if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) && (N->getOpcode() == ISD::CTTZ_ZERO_UNDEF)) { // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0 unsigned NumBits = ElemTy.getSizeInBits(); SDValue WidthMinus1 = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(NumBits - 1, dl, ElemTy)); SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB); return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ); } // Compute with: cttz(x) = ctpop(lsb - 1) // Since we can only compute the number of bits in a byte with vcnt.8, we // have to gather the result with pairwise addition (vpaddl) for i16, i32, // and i64. // Compute LSB - 1. SDValue Bits; if (ElemTy == MVT::i64) { // Load constant 0xffff'ffff'ffff'ffff to register. SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(0x1eff, dl, MVT::i32)); Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF); } else { SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT, DAG.getTargetConstant(1, dl, ElemTy)); Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One); } // Count #bits with vcnt.8. EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; SDValue BitsVT8 = DAG.getNode(ISD::BITCAST, dl, VT8Bit, Bits); SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, BitsVT8); // Gather the #bits with vpaddl (pairwise add.) EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; SDValue Cnt16 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT16Bit, DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), Cnt8); if (ElemTy == MVT::i16) return Cnt16; EVT VT32Bit = VT.is64BitVector() ? MVT::v2i32 : MVT::v4i32; SDValue Cnt32 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT32Bit, DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), Cnt16); if (ElemTy == MVT::i32) return Cnt32; assert(ElemTy == MVT::i64); SDValue Cnt64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), Cnt32); return Cnt64; } if (!ST->hasV6T2Ops()) return SDValue(); SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0)); return DAG.getNode(ISD::CTLZ, dl, VT, rbit); } /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count /// for each 16-bit element from operand, repeated. The basic idea is to /// leverage vcnt to get the 8-bit counts, gather and add the results. /// /// Trace for v4i16: /// input = [v0 v1 v2 v3 ] (vi 16-bit element) /// cast: N0 = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element) /// vcnt: N1 = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit element wi) /// vrev: N2 = [b1 b0 b3 b2 b5 b4 b7 b6] /// [b0 b1 b2 b3 b4 b5 b6 b7] /// +[b1 b0 b3 b2 b5 b4 b7 b6] /// N3=N1+N2 = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of 16-bit v0, /// vuzp: = [k0 k1 k2 k3 k0 k1 k2 k3] each ki is 8-bits) static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDLoc DL(N); EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8; SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0)); SDValue N1 = DAG.getNode(ISD::CTPOP, DL, VT8Bit, N0); SDValue N2 = DAG.getNode(ARMISD::VREV16, DL, VT8Bit, N1); SDValue N3 = DAG.getNode(ISD::ADD, DL, VT8Bit, N1, N2); return DAG.getNode(ARMISD::VUZP, DL, VT8Bit, N3, N3); } /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the /// bit-count for each 16-bit element from the operand. We need slightly /// different sequencing for v4i16 and v8i16 to stay within NEON's available /// 64/128-bit registers. /// /// Trace for v4i16: /// input = [v0 v1 v2 v3 ] (vi 16-bit element) /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi) /// v8i16:Extended = [k0 k1 k2 k3 k0 k1 k2 k3 ] /// v4i16:Extracted = [k0 k1 k2 k3 ] static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDLoc DL(N); SDValue BitCounts = getCTPOP16BitCounts(N, DAG); if (VT.is64BitVector()) { SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Extended, DAG.getIntPtrConstant(0, DL)); } else { SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, BitCounts, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, Extracted); } } /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the /// bit-count for each 32-bit element from the operand. The idea here is /// to split the vector into 16-bit elements, leverage the 16-bit count /// routine, and then combine the results. /// /// Trace for v2i32 (v4i32 similar with Extracted/Extended exchanged): /// input = [v0 v1 ] (vi: 32-bit elements) /// Bitcast = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1]) /// Counts16 = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi) /// vrev: N0 = [k1 k0 k3 k2 ] /// [k0 k1 k2 k3 ] /// N1 =+[k1 k0 k3 k2 ] /// [k0 k2 k1 k3 ] /// N2 =+[k1 k3 k0 k2 ] /// [k0 k2 k1 k3 ] /// Extended =+[k1 k3 k0 k2 ] /// [k0 k2 ] /// Extracted=+[k1 k3 ] /// static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDLoc DL(N); EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16; SDValue Bitcast = DAG.getNode(ISD::BITCAST, DL, VT16Bit, N->getOperand(0)); SDValue Counts16 = lowerCTPOP16BitElements(Bitcast.getNode(), DAG); SDValue N0 = DAG.getNode(ARMISD::VREV32, DL, VT16Bit, Counts16); SDValue N1 = DAG.getNode(ISD::ADD, DL, VT16Bit, Counts16, N0); SDValue N2 = DAG.getNode(ARMISD::VUZP, DL, VT16Bit, N1, N1); if (VT.is64BitVector()) { SDValue Extended = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, N2); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Extended, DAG.getIntPtrConstant(0, DL)); } else { SDValue Extracted = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, N2, DAG.getIntPtrConstant(0, DL)); return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Extracted); } } static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); assert(ST->hasNEON() && "Custom ctpop lowering requires NEON."); assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) && "Unexpected type for custom ctpop lowering"); if (VT.getVectorElementType() == MVT::i32) return lowerCTPOP32BitElements(N, DAG); else return lowerCTPOP16BitElements(N, DAG); } static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc dl(N); if (!VT.isVector()) return SDValue(); // Lower vector shifts on NEON to use VSHL. assert(ST->hasNEON() && "unexpected vector shift"); // Left shifts translate directly to the vshiftu intrinsic. if (N->getOpcode() == ISD::SHL) return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, MVT::i32), N->getOperand(0), N->getOperand(1)); assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode"); // NEON uses the same intrinsics for both left and right shifts. For // right shifts, the shift amounts are negative, so negate the vector of // shift amounts. EVT ShiftVT = N->getOperand(1).getValueType(); SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1)); Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ? Intrinsic::arm_neon_vshifts : Intrinsic::arm_neon_vshiftu); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(vshiftInt, dl, MVT::i32), N->getOperand(0), NegatedCount); } static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); SDLoc dl(N); // We can get here for a node like i32 = ISD::SHL i32, i64 if (VT != MVT::i64) return SDValue(); assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && "Unknown shift to lower!"); // We only lower SRA, SRL of 1 here, all others use generic lowering. if (!isOneConstant(N->getOperand(1))) return SDValue(); // If we are in thumb mode, we don't have RRX. if (ST->isThumb1Only()) return SDValue(); // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr. SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0), DAG.getConstant(1, dl, MVT::i32)); // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and // captures the result into a carry flag. unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG; Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi); // The low part is an ARMISD::RRX operand, which shifts the carry in. Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1)); // Merge the pieces into a single i64 value. return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); } static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { SDValue TmpOp0, TmpOp1; bool Invert = false; bool Swap = false; unsigned Opc = 0; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); SDValue CC = Op.getOperand(2); EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger(); EVT VT = Op.getValueType(); ISD::CondCode SetCCOpcode = cast(CC)->get(); SDLoc dl(Op); if (Op0.getValueType().getVectorElementType() == MVT::i64 && (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) { // Special-case integer 64-bit equality comparisons. They aren't legal, // but they can be lowered with a few vector instructions. unsigned CmpElements = CmpVT.getVectorNumElements() * 2; EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements); SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0); SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1); SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1, DAG.getCondCode(ISD::SETEQ)); SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp); SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed); Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged); if (SetCCOpcode == ISD::SETNE) Merged = DAG.getNOT(dl, Merged, CmpVT); Merged = DAG.getSExtOrTrunc(Merged, dl, VT); return Merged; } if (CmpVT.getVectorElementType() == MVT::i64) // 64-bit comparisons are not legal in general. return SDValue(); if (Op1.getValueType().isFloatingPoint()) { switch (SetCCOpcode) { default: llvm_unreachable("Illegal FP comparison"); case ISD::SETUNE: case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; case ISD::SETOEQ: case ISD::SETEQ: Opc = ARMISD::VCEQ; break; case ISD::SETOLT: case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGT: case ISD::SETGT: Opc = ARMISD::VCGT; break; case ISD::SETOLE: case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETOGE: case ISD::SETGE: Opc = ARMISD::VCGE; break; case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break; case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break; case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH; case ISD::SETONE: // Expand this to (OLT | OGT). TmpOp0 = Op0; TmpOp1 = Op1; Opc = ISD::OR; Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1); break; case ISD::SETUO: Invert = true; LLVM_FALLTHROUGH; case ISD::SETO: // Expand this to (OLT | OGE). TmpOp0 = Op0; TmpOp1 = Op1; Opc = ISD::OR; Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0); Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1); break; } } else { // Integer comparisons. switch (SetCCOpcode) { default: llvm_unreachable("Illegal integer comparison"); case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; case ISD::SETEQ: Opc = ARMISD::VCEQ; break; case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGT: Opc = ARMISD::VCGT; break; case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGE: Opc = ARMISD::VCGE; break; case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: Opc = ARMISD::VCGTU; break; case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGE: Opc = ARMISD::VCGEU; break; } // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero). if (Opc == ARMISD::VCEQ) { SDValue AndOp; if (ISD::isBuildVectorAllZeros(Op1.getNode())) AndOp = Op0; else if (ISD::isBuildVectorAllZeros(Op0.getNode())) AndOp = Op1; // Ignore bitconvert. if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST) AndOp = AndOp.getOperand(0); if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) { Opc = ARMISD::VTST; Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0)); Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1)); Invert = !Invert; } } } if (Swap) std::swap(Op0, Op1); // If one of the operands is a constant vector zero, attempt to fold the // comparison to a specialized compare-against-zero form. SDValue SingleOp; if (ISD::isBuildVectorAllZeros(Op1.getNode())) SingleOp = Op0; else if (ISD::isBuildVectorAllZeros(Op0.getNode())) { if (Opc == ARMISD::VCGE) Opc = ARMISD::VCLEZ; else if (Opc == ARMISD::VCGT) Opc = ARMISD::VCLTZ; SingleOp = Op1; } SDValue Result; if (SingleOp.getNode()) { switch (Opc) { case ARMISD::VCEQ: Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break; case ARMISD::VCGE: Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break; case ARMISD::VCLEZ: Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break; case ARMISD::VCGT: Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break; case ARMISD::VCLTZ: Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break; default: Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); } } else { Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1); } Result = DAG.getSExtOrTrunc(Result, dl, VT); if (Invert) Result = DAG.getNOT(dl, Result, VT); return Result; } static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue Carry = Op.getOperand(2); SDValue Cond = Op.getOperand(3); SDLoc DL(Op); assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we // have to invert the carry first. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), Carry); // This converts the boolean value carry into the carry flag. Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, Carry); SDValue FVal = DAG.getConstant(0, DL, MVT::i32); SDValue TVal = DAG.getConstant(1, DL, MVT::i32); SDValue ARMcc = DAG.getConstant( IntCCToARMCC(cast(Cond)->get()), DL, MVT::i32); SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32); SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR, Cmp.getValue(1), SDValue()); return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc, CCR, Chain.getValue(1)); } /// isNEONModifiedImm - Check if the specified splat value corresponds to a /// valid vector constant for a NEON instruction with a "modified immediate" /// operand (e.g., VMOV). If so, return the encoded value. static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, bool is128Bits, NEONModImmType type) { unsigned OpCmode, Imm; // SplatBitSize is set to the smallest size that splats the vector, so a // zero vector will always have SplatBitSize == 8. However, NEON modified // immediate instructions others than VMOV do not support the 8-bit encoding // of a zero vector, and the default encoding of zero is supposed to be the // 32-bit version. if (SplatBits == 0) SplatBitSize = 32; switch (SplatBitSize) { case 8: if (type != VMOVModImm) return SDValue(); // Any 1-byte value is OK. Op=0, Cmode=1110. assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big"); OpCmode = 0xe; Imm = SplatBits; VT = is128Bits ? MVT::v16i8 : MVT::v8i8; break; case 16: // NEON's 16-bit VMOV supports splat values where only one byte is nonzero. VT = is128Bits ? MVT::v8i16 : MVT::v4i16; if ((SplatBits & ~0xff) == 0) { // Value = 0x00nn: Op=x, Cmode=100x. OpCmode = 0x8; Imm = SplatBits; break; } if ((SplatBits & ~0xff00) == 0) { // Value = 0xnn00: Op=x, Cmode=101x. OpCmode = 0xa; Imm = SplatBits >> 8; break; } return SDValue(); case 32: // NEON's 32-bit VMOV supports splat values where: // * only one byte is nonzero, or // * the least significant byte is 0xff and the second byte is nonzero, or // * the least significant 2 bytes are 0xff and the third is nonzero. VT = is128Bits ? MVT::v4i32 : MVT::v2i32; if ((SplatBits & ~0xff) == 0) { // Value = 0x000000nn: Op=x, Cmode=000x. OpCmode = 0; Imm = SplatBits; break; } if ((SplatBits & ~0xff00) == 0) { // Value = 0x0000nn00: Op=x, Cmode=001x. OpCmode = 0x2; Imm = SplatBits >> 8; break; } if ((SplatBits & ~0xff0000) == 0) { // Value = 0x00nn0000: Op=x, Cmode=010x. OpCmode = 0x4; Imm = SplatBits >> 16; break; } if ((SplatBits & ~0xff000000) == 0) { // Value = 0xnn000000: Op=x, Cmode=011x. OpCmode = 0x6; Imm = SplatBits >> 24; break; } // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC if (type == OtherModImm) return SDValue(); if ((SplatBits & ~0xffff) == 0 && ((SplatBits | SplatUndef) & 0xff) == 0xff) { // Value = 0x0000nnff: Op=x, Cmode=1100. OpCmode = 0xc; Imm = SplatBits >> 8; break; } if ((SplatBits & ~0xffffff) == 0 && ((SplatBits | SplatUndef) & 0xffff) == 0xffff) { // Value = 0x00nnffff: Op=x, Cmode=1101. OpCmode = 0xd; Imm = SplatBits >> 16; break; } // Note: there are a few 32-bit splat values (specifically: 00ffff00, // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not // VMOV.I32. A (very) minor optimization would be to replicate the value // and fall through here to test for a valid 64-bit splat. But, then the // caller would also need to check and handle the change in size. return SDValue(); case 64: { if (type != VMOVModImm) return SDValue(); // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff. uint64_t BitMask = 0xff; uint64_t Val = 0; unsigned ImmMask = 1; Imm = 0; for (int ByteNum = 0; ByteNum < 8; ++ByteNum) { if (((SplatBits | SplatUndef) & BitMask) == BitMask) { Val |= BitMask; Imm |= ImmMask; } else if ((SplatBits & BitMask) != 0) { return SDValue(); } BitMask <<= 8; ImmMask <<= 1; } if (DAG.getDataLayout().isBigEndian()) // swap higher and lower 32 bit word Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4); // Op=1, Cmode=1110. OpCmode = 0x1e; VT = is128Bits ? MVT::v2i64 : MVT::v1i64; break; } default: llvm_unreachable("unexpected size for isNEONModifiedImm"); } unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm); return DAG.getTargetConstant(EncodedVal, dl, MVT::i32); } SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const { EVT VT = Op.getValueType(); bool IsDouble = (VT == MVT::f64); ConstantFPSDNode *CFP = cast(Op); const APFloat &FPVal = CFP->getValueAPF(); // Prevent floating-point constants from using literal loads // when execute-only is enabled. if (ST->genExecuteOnly()) { // If we can represent the constant as an immediate, don't lower it if (isFPImmLegal(FPVal, VT)) return Op; // Otherwise, construct as integer, and move to float register APInt INTVal = FPVal.bitcastToAPInt(); SDLoc DL(CFP); switch (VT.getSimpleVT().SimpleTy) { default: llvm_unreachable("Unknown floating point type!"); break; case MVT::f64: { SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32); SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32); if (!ST->isLittle()) std::swap(Lo, Hi); return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi); } case MVT::f32: return DAG.getNode(ARMISD::VMOVSR, DL, VT, DAG.getConstant(INTVal, DL, MVT::i32)); } } if (!ST->hasVFP3()) return SDValue(); // Use the default (constant pool) lowering for double constants when we have // an SP-only FPU if (IsDouble && Subtarget->isFPOnlySP()) return SDValue(); // Try splatting with a VMOV.f32... int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal); if (ImmVal != -1) { if (IsDouble || !ST->useNEONForSinglePrecisionFP()) { // We have code in place to select a valid ConstantFP already, no need to // do any mangling. return Op; } // It's a float and we are trying to use NEON operations where // possible. Lower it to a splat followed by an extract. SDLoc DL(Op); SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32); SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, NewVal); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant, DAG.getConstant(0, DL, MVT::i32)); } // The rest of our options are NEON only, make sure that's allowed before // proceeding.. if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP())) return SDValue(); EVT VMovVT; uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue(); // It wouldn't really be worth bothering for doubles except for one very // important value, which does happen to match: 0.0. So make sure we don't do // anything stupid. if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32)) return SDValue(); // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too). SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, false, VMOVModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT, NewVal); if (IsDouble) return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); // It's a float: cast and extract a vector element. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, VecConstant); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, DAG.getConstant(0, DL, MVT::i32)); } // Finally, try a VMVN.i32 NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT, false, VMVNModImm); if (NewVal != SDValue()) { SDLoc DL(Op); SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal); if (IsDouble) return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant); // It's a float: cast and extract a vector element. SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, VecConstant); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant, DAG.getConstant(0, DL, MVT::i32)); } return SDValue(); } // check if an VEXT instruction can handle the shuffle mask when the // vector sources of the shuffle are the same. static bool isSingletonVEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); // Assume that the first shuffle index is not UNDEF. Fail if it is. if (M[0] < 0) return false; Imm = M[0]; // If this is a VEXT shuffle, the immediate value is the index of the first // element. The other shuffle indices must be the successive elements after // the first one. unsigned ExpectedElt = Imm; for (unsigned i = 1; i < NumElts; ++i) { // Increment the expected index. If it wraps around, just follow it // back to index zero and keep going. ++ExpectedElt; if (ExpectedElt == NumElts) ExpectedElt = 0; if (M[i] < 0) continue; // ignore UNDEF indices if (ExpectedElt != static_cast(M[i])) return false; } return true; } static bool isVEXTMask(ArrayRef M, EVT VT, bool &ReverseVEXT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); ReverseVEXT = false; // Assume that the first shuffle index is not UNDEF. Fail if it is. if (M[0] < 0) return false; Imm = M[0]; // If this is a VEXT shuffle, the immediate value is the index of the first // element. The other shuffle indices must be the successive elements after // the first one. unsigned ExpectedElt = Imm; for (unsigned i = 1; i < NumElts; ++i) { // Increment the expected index. If it wraps around, it may still be // a VEXT but the source vectors must be swapped. ExpectedElt += 1; if (ExpectedElt == NumElts * 2) { ExpectedElt = 0; ReverseVEXT = true; } if (M[i] < 0) continue; // ignore UNDEF indices if (ExpectedElt != static_cast(M[i])) return false; } // Adjust the index value if the source operands will be swapped. if (ReverseVEXT) Imm -= NumElts; return true; } /// isVREVMask - Check if a vector shuffle corresponds to a VREV /// instruction with the specified blocksize. (The order of the elements /// within each block of the vector is reversed.) static bool isVREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { assert((BlockSize==16 || BlockSize==32 || BlockSize==64) && "Only possible block sizes for VREV are: 16, 32, 64"); unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); unsigned BlockElts = M[0] + 1; // If the first shuffle index is UNDEF, be optimistic. if (M[0] < 0) BlockElts = BlockSize / EltSz; if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) return false; for (unsigned i = 0; i < NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned) M[i] != (i - i%BlockElts) + (BlockElts - 1 - i%BlockElts)) return false; } return true; } static bool isVTBLMask(ArrayRef M, EVT VT) { // We can handle <8 x i8> vector shuffles. If the index in the mask is out of // range, then 0 is placed into the resulting vector. So pretty much any mask // of 8 elements can work here. return VT == MVT::v8i8 && M.size() == 8; } static unsigned SelectPairHalf(unsigned Elements, ArrayRef Mask, unsigned Index) { if (Mask.size() == Elements * 2) return Index / Elements; return Mask[Index] == 0 ? 0 : 1; } // Checks whether the shuffle mask represents a vector transpose (VTRN) by // checking that pairs of elements in the shuffle mask represent the same index // in each vector, incrementing the expected index by 2 at each step. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6] // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g} // v2={e,f,g,h} // WhichResult gives the offset for each element in the mask based on which // of the two results it belongs to. // // The transpose can be represented either as: // result1 = shufflevector v1, v2, result1_shuffle_mask // result2 = shufflevector v1, v2, result2_shuffle_mask // where v1/v2 and the shuffle masks have the same number of elements // (here WhichResult (see below) indicates which result is being checked) // // or as: // results = shufflevector v1, v2, shuffle_mask // where both results are returned in one vector and the shuffle mask has twice // as many elements as v1/v2 (here WhichResult will always be 0 if true) here we // want to check the low half and high half of the shuffle mask as if it were // the other case static bool isVTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; // If the mask is twice as long as the input vector then we need to check the // upper and lower parts of the mask with a matching value for WhichResult // FIXME: A mask with only even values will be rejected in case the first // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only // M[0] is used to determine WhichResult for (unsigned i = 0; i < M.size(); i += NumElts) { WhichResult = SelectPairHalf(NumElts, M, i); for (unsigned j = 0; j < NumElts; j += 2) { if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult)) return false; } } if (M.size() == NumElts*2) WhichResult = 0; return true; } /// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. static bool isVTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { WhichResult = SelectPairHalf(NumElts, M, i); for (unsigned j = 0; j < NumElts; j += 2) { if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) || (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult)) return false; } } if (M.size() == NumElts*2) WhichResult = 0; return true; } // Checks whether the shuffle mask represents a vector unzip (VUZP) by checking // that the mask elements are either all even and in steps of size 2 or all odd // and in steps of size 2. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6] // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g} // v2={e,f,g,h} // Requires similar checks to that of isVTRNMask with // respect the how results are returned. static bool isVUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { WhichResult = SelectPairHalf(NumElts, M, i); for (unsigned j = 0; j < NumElts; ++j) { if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult) return false; } } if (M.size() == NumElts*2) WhichResult = 0; // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; return true; } /// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, static bool isVUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; unsigned Half = NumElts / 2; for (unsigned i = 0; i < M.size(); i += NumElts) { WhichResult = SelectPairHalf(NumElts, M, i); for (unsigned j = 0; j < NumElts; j += Half) { unsigned Idx = WhichResult; for (unsigned k = 0; k < Half; ++k) { int MIdx = M[i + j + k]; if (MIdx >= 0 && (unsigned) MIdx != Idx) return false; Idx += 2; } } } if (M.size() == NumElts*2) WhichResult = 0; // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; return true; } // Checks whether the shuffle mask represents a vector zip (VZIP) by checking // that pairs of elements of the shufflemask represent the same index in each // vector incrementing sequentially through the vectors. // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5] // v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f} // v2={e,f,g,h} // Requires similar checks to that of isVTRNMask with respect the how results // are returned. static bool isVZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { WhichResult = SelectPairHalf(NumElts, M, i); unsigned Idx = WhichResult * NumElts / 2; for (unsigned j = 0; j < NumElts; j += 2) { if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts)) return false; Idx += 1; } } if (M.size() == NumElts*2) WhichResult = 0; // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; return true; } /// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. static bool isVZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult){ unsigned EltSz = VT.getScalarSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); if (M.size() != NumElts && M.size() != NumElts*2) return false; for (unsigned i = 0; i < M.size(); i += NumElts) { WhichResult = SelectPairHalf(NumElts, M, i); unsigned Idx = WhichResult * NumElts / 2; for (unsigned j = 0; j < NumElts; j += 2) { if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) || (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx)) return false; Idx += 1; } } if (M.size() == NumElts*2) WhichResult = 0; // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. if (VT.is64BitVector() && EltSz == 32) return false; return true; } /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't. static unsigned isNEONTwoResultShuffleMask(ArrayRef ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF) { isV_UNDEF = false; if (isVTRNMask(ShuffleMask, VT, WhichResult)) return ARMISD::VTRN; if (isVUZPMask(ShuffleMask, VT, WhichResult)) return ARMISD::VUZP; if (isVZIPMask(ShuffleMask, VT, WhichResult)) return ARMISD::VZIP; isV_UNDEF = true; if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) return ARMISD::VTRN; if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) return ARMISD::VUZP; if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) return ARMISD::VZIP; return 0; } /// \return true if this is a reverse operation on an vector. static bool isReverseMask(ArrayRef M, EVT VT) { unsigned NumElts = VT.getVectorNumElements(); // Make sure the mask has the right size. if (NumElts != M.size()) return false; // Look for <15, ..., 3, -1, 1, 0>. for (unsigned i = 0; i != NumElts; ++i) if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i)) return false; return true; } // If N is an integer constant that can be moved into a register in one // instruction, return an SDValue of such a constant (will become a MOV // instruction). Otherwise return null. static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl) { uint64_t Val; if (!isa(N)) return SDValue(); Val = cast(N)->getZExtValue(); if (ST->isThumb1Only()) { if (Val <= 255 || ~Val <= 255) return DAG.getConstant(Val, dl, MVT::i32); } else { if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1) return DAG.getConstant(Val, dl, MVT::i32); } return SDValue(); } // If this is a case we can't handle, return null and let the default // expansion code take care of it. SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const { BuildVectorSDNode *BVN = cast(Op.getNode()); SDLoc dl(Op); EVT VT = Op.getValueType(); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatUndef.isAllOnesValue()) return DAG.getUNDEF(VT); if (SplatBitSize <= 64) { // Check if an immediate VMOV works. EVT VmovVT; SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), VMOVModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } // Try an immediate VMVN. uint64_t NegatedImm = (~SplatBits).getZExtValue(); Val = isNEONModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), VMVNModImm); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } // Use vmov.f32 to materialize other v2f32 and v4f32 splats. if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) { int ImmVal = ARM_AM::getFP32Imm(SplatBits); if (ImmVal != -1) { SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32); return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); } } } } // Scan through the operands to see if only one value is used. // // As an optimisation, even if more than one value is used it may be more // profitable to splat with one value then change some lanes. // // Heuristically we decide to do this if the vector has a "dominant" value, // defined as splatted to more than half of the lanes. unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; bool hasDominantValue = false; bool isConstant = true; // Map of the number of times a particular SDValue appears in the // element list. DenseMap ValueCounts; SDValue Value; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.isUndef()) continue; if (i > 0) isOnlyLowElement = false; if (!isa(V) && !isa(V)) isConstant = false; ValueCounts.insert(std::make_pair(V, 0)); unsigned &Count = ValueCounts[V]; // Is this value dominant? (takes up more than half of the lanes) if (++Count > (NumElts / 2)) { hasDominantValue = true; Value = V; } } if (ValueCounts.size() != 1) usesOnlyOneValue = false; if (!Value.getNode() && !ValueCounts.empty()) Value = ValueCounts.begin()->first; if (ValueCounts.empty()) return DAG.getUNDEF(VT); // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR. // Keep going if we are hitting this case. if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode())) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); unsigned EltSize = VT.getScalarSizeInBits(); // Use VDUP for non-constant splats. For f32 constant splats, reduce to // i32 and try again. if (hasDominantValue && EltSize <= 32) { if (!isConstant) { SDValue N; // If we are VDUPing a value that comes directly from a vector, that will // cause an unnecessary move to and from a GPR, where instead we could // just use VDUPLANE. We can only do this if the lane being extracted // is at a constant index, as the VDUP from lane instructions only have // constant-index forms. ConstantSDNode *constIndex; if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT && (constIndex = dyn_cast(Value->getOperand(1)))) { // We need to create a new undef vector to use for the VDUPLANE if the // size of the vector from which we get the value is different than the // size of the vector that we need to create. We will insert the element // such that the register coalescer will remove unnecessary copies. if (VT != Value->getOperand(0).getValueType()) { unsigned index = constIndex->getAPIntValue().getLimitedValue() % VT.getVectorNumElements(); N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT), Value, DAG.getConstant(index, dl, MVT::i32)), DAG.getConstant(index, dl, MVT::i32)); } else N = DAG.getNode(ARMISD::VDUPLANE, dl, VT, Value->getOperand(0), Value->getOperand(1)); } else N = DAG.getNode(ARMISD::VDUP, dl, VT, Value); if (!usesOnlyOneValue) { // The dominant value was splatted as 'N', but we now have to insert // all differing elements. for (unsigned I = 0; I < NumElts; ++I) { if (Op.getOperand(I) == Value) continue; SmallVector Ops; Ops.push_back(N); Ops.push_back(Op.getOperand(I)); Ops.push_back(DAG.getConstant(I, dl, MVT::i32)); N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops); } } return N; } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG, ST); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); } if (usesOnlyOneValue) { SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl); if (isConstant && Val.getNode()) return DAG.getNode(ARMISD::VDUP, dl, VT, Val); } } // If all elements are constants and the case above didn't get hit, fall back // to the default expansion, which will generate a load from the constant // pool. if (isConstant) return SDValue(); // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { SDValue shuffle = ReconstructShuffle(Op, DAG); if (shuffle != SDValue()) return shuffle; } if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) { // If we haven't found an efficient lowering, try splitting a 128-bit vector // into two 64-bit vectors; we might discover a better way to lower it. SmallVector Ops(Op->op_begin(), Op->op_begin() + NumElts); EVT ExtVT = VT.getVectorElementType(); EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2); SDValue Lower = DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2)); if (Lower.getOpcode() == ISD::BUILD_VECTOR) Lower = LowerBUILD_VECTOR(Lower, DAG, ST); SDValue Upper = DAG.getBuildVector( HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2)); if (Upper.getOpcode() == ISD::BUILD_VECTOR) Upper = LowerBUILD_VECTOR(Upper, DAG, ST); if (Lower && Upper) return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper); } // Vectors with 32- or 64-bit elements can be built by directly assigning // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands // will be legalized. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. EVT EltVT = EVT::getFloatingPointVT(EltSize); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i))); SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, Val); } // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. For a vector with one or two non-undef values, that's // scalar_to_vector for the elements followed by a shuffle (provided the // shuffle is valid for the target) and materialization element by element // on the stack followed by a load for everything else. if (!isConstant && !usesOnlyOneValue) { SDValue Vec = DAG.getUNDEF(VT); for (unsigned i = 0 ; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.isUndef()) continue; SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; } return SDValue(); } // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); struct ShuffleSourceInfo { SDValue Vec; unsigned MinElt = std::numeric_limits::max(); unsigned MaxElt = 0; // We may insert some combination of BITCASTs and VEXT nodes to force Vec to // be compatible with the shuffle we intend to construct. As a result // ShuffleVec will be some sliding window into the original Vec. SDValue ShuffleVec; // Code should guarantee that element i in Vec starts at element "WindowBase // + i * WindowScale in ShuffleVec". int WindowBase = 0; int WindowScale = 1; ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {} bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } }; // First gather all vectors used as an immediate source for this BUILD_VECTOR // node. SmallVector Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.isUndef()) continue; else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { // A shuffle can only come from building a vector from various // elements of other vectors. return SDValue(); } else if (!isa(V.getOperand(1))) { // Furthermore, shuffles require a constant mask, whereas extractelts // accept variable indices. return SDValue(); } // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); auto Source = llvm::find(Sources, SourceVec); if (Source == Sources.end()) Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); // Update the minimum and maximum lane number seen. unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); Source->MinElt = std::min(Source->MinElt, EltNo); Source->MaxElt = std::max(Source->MaxElt, EltNo); } // Currently only do something sane when at most two source vectors // are involved. if (Sources.size() > 2) return SDValue(); // Find out the smallest element size among result and two sources, and use // it as element size to build the shuffle_vector. EVT SmallestEltTy = VT.getVectorElementType(); for (auto &Source : Sources) { EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); if (SrcEltTy.bitsLT(SmallestEltTy)) SmallestEltTy = SrcEltTy; } unsigned ResMultiplier = VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits(); NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); // If the source vector is too wide or too narrow, we may nevertheless be able // to construct a compatible shuffle either by concatenating it with UNDEF or // extracting a suitable range of elements. for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); if (SrcVT.getSizeInBits() == VT.getSizeInBits()) continue; // This stage of the search produces a source with the same element type as // the original, but with a total width matching the BUILD_VECTOR output. EVT EltVT = SrcVT.getVectorElementType(); unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits()) return SDValue(); // We can pad out the smaller vector for free, so if it's part of a // shuffle... Src.ShuffleVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; } if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits()) return SDValue(); if (Src.MaxElt - Src.MinElt >= NumSrcElts) { // Span too large for a VEXT to cope return SDValue(); } if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i32)); Src.WindowBase = -NumSrcElts; } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i32)); } else { // An actual VEXT is needed SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, dl, MVT::i32)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, dl, MVT::i32)); Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1, VEXTSrc2, DAG.getConstant(Src.MinElt, dl, MVT::i32)); Src.WindowBase = -Src.MinElt; } } // Another possible incompatibility occurs from the vector element types. We // can fix this by bitcasting the source vectors to the same type we intend // for the shuffle. for (auto &Src : Sources) { EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); Src.WindowBase *= Src.WindowScale; } // Final sanity check before we try to actually produce a shuffle. LLVM_DEBUG(for (auto Src : Sources) assert(Src.ShuffleVec.getValueType() == ShuffleVT);); // The stars all align, our next step is to produce the mask for the shuffle. SmallVector Mask(ShuffleVT.getVectorNumElements(), -1); int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); if (Entry.isUndef()) continue; auto Src = llvm::find(Sources, Entry.getOperand(0)); int EltNo = cast(Entry.getOperand(1))->getSExtValue(); // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); int BitsDefined = std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; // This source is expected to fill ResMultiplier lanes of the final shuffle, // starting at the appropriate offset. int *LaneMask = &Mask[i * ResMultiplier]; int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; ExtractBase += NumElts * (Src - Sources.begin()); for (int j = 0; j < LanesDefined; ++j) LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... if (!isShuffleMaskLegal(Mask, ShuffleVT)) return SDValue(); // We can't handle more than two sources. This should have already // been checked before this point. assert(Sources.size() <= 2 && "Too many sources!"); SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; for (unsigned i = 0; i < Sources.size(); ++i) ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], ShuffleOps[1], Mask); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (M[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = M[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return true; } bool ReverseVEXT, isV_UNDEF; unsigned Imm, WhichResult; unsigned EltSize = VT.getScalarSizeInBits(); return (EltSize >= 32 || ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16) || isVEXTMask(M, VT, ReverseVEXT, Imm) || isVTBLMask(M, VT) || isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF) || ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT))); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit /// the specified operations to build the shuffle. static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1); enum { OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> OP_VREV, OP_VDUP0, OP_VDUP1, OP_VDUP2, OP_VDUP3, OP_VEXT1, OP_VEXT2, OP_VEXT3, OP_VUZPL, // VUZP, left result OP_VUZPR, // VUZP, right result OP_VZIPL, // VZIP, left result OP_VZIPR, // VZIP, right result OP_VTRNL, // VTRN, left result OP_VTRNR // VTRN, right result }; if (OpNum == OP_COPY) { if (LHSID == (1*9+2)*9+3) return LHS; assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!"); return RHS; } SDValue OpLHS, OpRHS; OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); EVT VT = OpLHS.getValueType(); switch (OpNum) { default: llvm_unreachable("Unknown shuffle opcode!"); case OP_VREV: // VREV divides the vector in half and swaps within the half. if (VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::f32) return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS); // vrev <4 x i16> -> VREV32 if (VT.getVectorElementType() == MVT::i16) return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS); // vrev <4 x i8> -> VREV16 assert(VT.getVectorElementType() == MVT::i8); return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS); case OP_VDUP0: case OP_VDUP1: case OP_VDUP2: case OP_VDUP3: return DAG.getNode(ARMISD::VDUPLANE, dl, VT, OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32)); case OP_VEXT1: case OP_VEXT2: case OP_VEXT3: return DAG.getNode(ARMISD::VEXT, dl, VT, OpLHS, OpRHS, DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32)); case OP_VUZPL: case OP_VUZPR: return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS).getValue(OpNum-OP_VUZPL); case OP_VZIPL: case OP_VZIPR: return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS).getValue(OpNum-OP_VZIPL); case OP_VTRNL: case OP_VTRNR: return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS).getValue(OpNum-OP_VTRNL); } } static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef ShuffleMask, SelectionDAG &DAG) { // Check to see if we can use the VTBL instruction. SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc DL(Op); SmallVector VTBLMask; for (ArrayRef::iterator I = ShuffleMask.begin(), E = ShuffleMask.end(); I != E; ++I) VTBLMask.push_back(DAG.getConstant(*I, DL, MVT::i32)); if (V2.getNode()->isUndef()) return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1, DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2, DAG.getBuildVector(MVT::v8i8, DL, VTBLMask)); } static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); SDValue OpLHS = Op.getOperand(0); EVT VT = OpLHS.getValueType(); assert((VT == MVT::v8i16 || VT == MVT::v16i8) && "Expect an v8i16/v16i8 type"); OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, OpLHS); // For a v16i8 type: After the VREV, we have got <8, ...15, 8, ..., 0>. Now, // extract the first 8 bytes into the top double word and the last 8 bytes // into the bottom double word. The v8i16 case is similar. unsigned ExtractNum = (VT == MVT::v16i8) ? 8 : 4; return DAG.getNode(ARMISD::VEXT, DL, VT, OpLHS, OpLHS, DAG.getConstant(ExtractNum, DL, MVT::i32)); } static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); // Convert shuffles that are directly supported on NEON to target-specific // DAG nodes, instead of keeping them as shuffles and matching them again // during code selection. This is more efficient and avoids the possibility // of inconsistencies between legalization and selection. // FIXME: floating-point vectors should be canonicalized to integer vectors // of the same time so that they get CSEd properly. ArrayRef ShuffleMask = SVN->getMask(); unsigned EltSize = VT.getScalarSizeInBits(); if (EltSize <= 32) { if (SVN->isSplat()) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. if (Lane == -1) Lane = 0; // Test if V1 is a SCALAR_TO_VECTOR. if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) { return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); } // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR // (and probably will turn into a SCALAR_TO_VECTOR once legalization // reaches it). if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR && !isa(V1.getOperand(0))) { bool IsScalarToVector = true; for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i) if (!V1.getOperand(i).isUndef()) { IsScalarToVector = false; break; } if (IsScalarToVector) return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0)); } return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i32)); } bool ReverseVEXT; unsigned Imm; if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { if (ReverseVEXT) std::swap(V1, V2); return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, DAG.getConstant(Imm, dl, MVT::i32)); } if (isVREVMask(ShuffleMask, VT, 64)) return DAG.getNode(ARMISD::VREV64, dl, VT, V1); if (isVREVMask(ShuffleMask, VT, 32)) return DAG.getNode(ARMISD::VREV32, dl, VT, V1); if (isVREVMask(ShuffleMask, VT, 16)) return DAG.getNode(ARMISD::VREV16, dl, VT, V1); if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) { return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1, DAG.getConstant(Imm, dl, MVT::i32)); } // Check for Neon shuffles that modify both input vectors in place. // If both results are used, i.e., if there are two shuffles with the same // source operands and with masks corresponding to both results of one of // these operations, DAG memoization will ensure that a single node is // used for both shuffles. unsigned WhichResult; bool isV_UNDEF; if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( ShuffleMask, VT, WhichResult, isV_UNDEF)) { if (isV_UNDEF) V2 = V1; return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2) .getValue(WhichResult); } // Also check for these shuffles through CONCAT_VECTORS: we canonicalize // shuffles that produce a result larger than their operands with: // shuffle(concat(v1, undef), concat(v2, undef)) // -> // shuffle(concat(v1, v2), undef) // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine). // // This is useful in the general case, but there are special cases where // native shuffles produce larger results: the two-result ops. // // Look through the concat when lowering them: // shuffle(concat(v1, v2), undef) // -> // concat(VZIP(v1, v2):0, :1) // if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) { SDValue SubV1 = V1->getOperand(0); SDValue SubV2 = V1->getOperand(1); EVT SubVT = SubV1.getValueType(); // We expect these to have been canonicalized to -1. assert(llvm::all_of(ShuffleMask, [&](int i) { return i < (int)VT.getVectorNumElements(); }) && "Unexpected shuffle index into UNDEF operand!"); if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask( ShuffleMask, SubVT, WhichResult, isV_UNDEF)) { if (isV_UNDEF) SubV2 = SubV1; assert((WhichResult == 0) && "In-place shuffle of concat can only have one result!"); SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT), SubV1, SubV2); return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0), Res.getValue(1)); } } } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. unsigned NumElts = VT.getVectorNumElements(); if (NumElts == 4) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (ShuffleMask[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = ShuffleMask[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs. if (EltSize >= 32) { // Do the expansion with floating-point types, since that is what the VFP // registers are defined to use, and since i64 is not legal. EVT EltVT = EVT::getFloatingPointVT(EltSize); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts); V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1); V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2); SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) { if (ShuffleMask[i] < 0) Ops.push_back(DAG.getUNDEF(EltVT)); else Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ShuffleMask[i] < (int)NumElts ? V1 : V2, DAG.getConstant(ShuffleMask[i] & (NumElts-1), dl, MVT::i32))); } SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, Val); } if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT)) return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG); if (VT == MVT::v8i8) if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG)) return NewOp; return SDValue(); } static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { // INSERT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(2); if (!isa(Lane)) return SDValue(); return Op; } static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) { // EXTRACT_VECTOR_ELT is legal only for immediate indexes. SDValue Lane = Op.getOperand(1); if (!isa(Lane)) return SDValue(); SDValue Vec = Op.getOperand(0); if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) { SDLoc dl(Op); return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane); } return Op; } static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) { // The only time a CONCAT_VECTORS operation can have legal types is when // two 64-bit vectors are concatenated to a 128-bit vector. assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 && "unexpected CONCAT_VECTORS"); SDLoc dl(Op); SDValue Val = DAG.getUNDEF(MVT::v2f64); SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); if (!Op0.isUndef()) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0), DAG.getIntPtrConstant(0, dl)); if (!Op1.isUndef()) Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val, DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1), DAG.getIntPtrConstant(1, dl)); return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val); } /// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each /// element has been zero/sign-extended, depending on the isSigned parameter, /// from an integer type half its size. static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned) { // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32. EVT VT = N->getValueType(0); if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) { SDNode *BVN = N->getOperand(0).getNode(); if (BVN->getValueType(0) != MVT::v4i32 || BVN->getOpcode() != ISD::BUILD_VECTOR) return false; unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; unsigned HiElt = 1 - LoElt; ConstantSDNode *Lo0 = dyn_cast(BVN->getOperand(LoElt)); ConstantSDNode *Hi0 = dyn_cast(BVN->getOperand(HiElt)); ConstantSDNode *Lo1 = dyn_cast(BVN->getOperand(LoElt+2)); ConstantSDNode *Hi1 = dyn_cast(BVN->getOperand(HiElt+2)); if (!Lo0 || !Hi0 || !Lo1 || !Hi1) return false; if (isSigned) { if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 && Hi1->getSExtValue() == Lo1->getSExtValue() >> 32) return true; } else { if (Hi0->isNullValue() && Hi1->isNullValue()) return true; } return false; } if (N->getOpcode() != ISD::BUILD_VECTOR) return false; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDNode *Elt = N->getOperand(i).getNode(); if (ConstantSDNode *C = dyn_cast(Elt)) { unsigned EltSize = VT.getScalarSizeInBits(); unsigned HalfSize = EltSize / 2; if (isSigned) { if (!isIntN(HalfSize, C->getSExtValue())) return false; } else { if (!isUIntN(HalfSize, C->getZExtValue())) return false; } continue; } return false; } return true; } /// isSignExtended - Check if a node is a vector value that is sign-extended /// or a constant BUILD_VECTOR with sign-extended elements. static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N)) return true; if (isExtendedBUILD_VECTOR(N, DAG, true)) return true; return false; } /// isZeroExtended - Check if a node is a vector value that is zero-extended /// or a constant BUILD_VECTOR with zero-extended elements. static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N)) return true; if (isExtendedBUILD_VECTOR(N, DAG, false)) return true; return false; } static EVT getExtensionTo64Bits(const EVT &OrigVT) { if (OrigVT.getSizeInBits() >= 64) return OrigVT; assert(OrigVT.isSimple() && "Expecting a simple value type"); MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; switch (OrigSimpleTy) { default: llvm_unreachable("Unexpected Vector Type"); case MVT::v2i8: case MVT::v2i16: return MVT::v2i32; case MVT::v4i8: return MVT::v4i16; } } /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. /// We insert the required extension here to get the vector to fill a D register. static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode) { // The vector originally had a size of OrigTy. It was then extended to ExtTy. // We expect the ExtTy to be 128-bits total. If the OrigTy is less than // 64-bits we need to insert a new extension so that it will be 64-bits. assert(ExtTy.is128BitVector() && "Unexpected extension size"); if (OrigTy.getSizeInBits() >= 64) return N; // Must extend size to at least 64 bits to be used as an operand for VMULL. EVT NewVT = getExtensionTo64Bits(OrigTy); return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); } /// SkipLoadExtensionForVMULL - return a load of the original vector size that /// does not do any sign/zero extension. If the original vector is less /// than 64 bits, an appropriate extension will be added after the load to /// reach a total size of 64 bits. We have to add the extension separately /// because ARM does not have a sign/zero extending load for vectors. static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); // The load already has the right type. if (ExtendedTy == LD->getMemoryVT()) return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->getAlignment(), LD->getMemOperand()->getFlags()); // We need to create a zextload/sextload. We cannot just create a load // followed by a zext/zext node because LowerMUL is also run during normal // operation legalization where we can't create illegal types. return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->getMemoryVT(), LD->getAlignment(), LD->getMemOperand()->getFlags()); } /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, /// extending load, or BUILD_VECTOR with extended elements, return the /// unextended value. The unextended vector should be 64 bits so that it can /// be used as an operand to a VMULL instruction. If the original vector size /// before extension is less than 64 bits we add a an extension to resize /// the vector to 64 bits. static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) return AddRequiredExtensionForVMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), N->getOpcode()); if (LoadSDNode *LD = dyn_cast(N)) { assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) && "Expected extending load"); SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1)); unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; SDValue extLoad = DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad); return newLoad; } // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will // have been legalized as a BITCAST from v4i32. if (N->getOpcode() == ISD::BITCAST) { SDNode *BVN = N->getOperand(0).getNode(); assert(BVN->getOpcode() == ISD::BUILD_VECTOR && BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR"); unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0; return DAG.getBuildVector( MVT::v2i32, SDLoc(N), {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)}); } // Construct a new BUILD_VECTOR with elements truncated to half the size. assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); EVT VT = N->getValueType(0); unsigned EltSize = VT.getScalarSizeInBits() / 2; unsigned NumElts = VT.getVectorNumElements(); MVT TruncVT = MVT::getIntegerVT(EltSize); SmallVector Ops; SDLoc dl(N); for (unsigned i = 0; i != NumElts; ++i) { ConstantSDNode *C = cast(N->getOperand(i)); const APInt &CInt = C->getAPIntValue(); // Element types smaller than 32 bits are not legal, so use i32 elements. // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32)); } return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops); } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isSignExtended(N0, DAG) && isSignExtended(N1, DAG); } return false; } static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); } return false; } static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. EVT VT = Op.getValueType(); assert(VT.is128BitVector() && VT.isInteger() && "unexpected type for custom-lowering ISD::MUL"); SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); unsigned NewOpc = 0; bool isMLA = false; bool isN0SExt = isSignExtended(N0, DAG); bool isN1SExt = isSignExtended(N1, DAG); if (isN0SExt && isN1SExt) NewOpc = ARMISD::VMULLs; else { bool isN0ZExt = isZeroExtended(N0, DAG); bool isN1ZExt = isZeroExtended(N1, DAG); if (isN0ZExt && isN1ZExt) NewOpc = ARMISD::VMULLu; else if (isN1SExt || isN1ZExt) { // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these // into (s/zext A * s/zext C) + (s/zext B * s/zext C) if (isN1SExt && isAddSubSExt(N0, DAG)) { NewOpc = ARMISD::VMULLs; isMLA = true; } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { NewOpc = ARMISD::VMULLu; isMLA = true; } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { std::swap(N0, N1); NewOpc = ARMISD::VMULLu; isMLA = true; } } if (!NewOpc) { if (VT == MVT::v2i64) // Fall through to expand this. It is not legal. return SDValue(); else // Other vector multiplications are legal. return Op; } } // Legalize to a VMULL instruction. SDLoc DL(Op); SDValue Op0; SDValue Op1 = SkipExtensionForVMULL(N1, DAG); if (!isMLA) { Op0 = SkipExtensionForVMULL(N0, DAG); assert(Op0.getValueType().is64BitVector() && Op1.getValueType().is64BitVector() && "unexpected types for extended operands to VMULL"); return DAG.getNode(NewOpc, DL, VT, Op0, Op1); } // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during // isel lowering to take advantage of no-stall back to back vmul + vmla. // vmull q0, d4, d6 // vmlal q0, d5, d6 // is faster than // vaddl q0, d4, d5 // vmovl q1, d6 // vmul q0, q0, q1 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG); SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG); EVT Op1VT = Op1.getValueType(); return DAG.getNode(N0->getOpcode(), DL, VT, DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG) { // TODO: Should this propagate fast-math-flags? // Convert to float // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo)); // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo)); X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X); Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y); X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X); Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y); // Get reciprocal estimate. // float4 recip = vrecpeq_f32(yf); Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), Y); // Because char has a smaller range than uchar, we can actually get away // without any newton steps. This requires that we use a weird bias // of 0xb000, however (again, this has been exhaustively tested). // float4 result = as_float4(as_int4(xf*recip) + 0xb000); X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y); X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X); Y = DAG.getConstant(0xb000, dl, MVT::v4i32); X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y); X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X); // Convert back to short. X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X); X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X); return X; } static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG) { // TODO: Should this propagate fast-math-flags? SDValue N2; // Convert to float. // float4 yf = vcvt_f32_s32(vmovl_s16(y)); // float4 xf = vcvt_f32_s32(vmovl_s16(x)); N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0); N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1); N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); // Use reciprocal estimate and one refinement step. // float4 recip = vrecpeq_f32(yf); // recip *= vrecpsq_f32(yf, recip); N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), N1); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), N1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); // Because short has a smaller range than ushort, we can actually get away // with only a single newton step. This requires that we use a weird bias // of 89, however (again, this has been exhaustively tested). // float4 result = as_float4(as_int4(xf*recip) + 0x89); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); N1 = DAG.getConstant(0x89, dl, MVT::v4i32); N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); // Convert back to integer and return. // return vmovn_s32(vcvt_s32_f32(result)); N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); return N0; } static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::SDIV"); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; if (VT == MVT::v8i8) { N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0); N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1); N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(4, dl)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(4, dl)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(0, dl)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(0, dl)); N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); N0 = LowerCONCAT_VECTORS(N0, DAG); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0); return N0; } return LowerSDIV_v4i16(N0, N1, dl, DAG); } static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) { // TODO: Should this propagate fast-math-flags? EVT VT = Op.getValueType(); assert((VT == MVT::v4i16 || VT == MVT::v8i8) && "unexpected type for custom-lowering ISD::UDIV"); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); SDValue N1 = Op.getOperand(1); SDValue N2, N3; if (VT == MVT::v8i8) { N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1); N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(4, dl)); N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(4, dl)); N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0, DAG.getIntPtrConstant(0, dl)); N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1, DAG.getIntPtrConstant(0, dl)); N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2); N0 = LowerCONCAT_VECTORS(N0, DAG); N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8, DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl, MVT::i32), N0); return N0; } // v4i16 sdiv ... Convert to float. // float4 yf = vcvt_f32_s32(vmovl_u16(y)); // float4 xf = vcvt_f32_s32(vmovl_u16(x)); N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0); N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1); N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0); SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1); // Use reciprocal estimate and two refinement steps. // float4 recip = vrecpeq_f32(yf); // recip *= vrecpsq_f32(yf, recip); // recip *= vrecpsq_f32(yf, recip); N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32), BN1); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), BN1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32, DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32), BN1, N2); N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2); // Simply multiplying by the reciprocal estimate can leave us a few ulps // too low, so we add 2 ulps (exhaustive testing shows that this is enough, // and that it will never cause us to return an answer too large). // float4 result = as_float4(as_int4(xf*recip) + 2); N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0); N1 = DAG.getConstant(2, dl, MVT::v4i32); N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1); N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0); // Convert back to integer and return. // return vmovn_u32(vcvt_s32_f32(result)); N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0); N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0); return N0; } static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) { SDNode *N = Op.getNode(); EVT VT = N->getValueType(0); SDVTList VTs = DAG.getVTList(VT, MVT::i32); SDValue Carry = Op.getOperand(2); SDLoc DL(Op); SDValue Result; if (Op.getOpcode() == ISD::ADDCARRY) { // This converts the boolean value carry into the carry flag. Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); // Do the addition proper using the carry flag we wanted. Result = DAG.getNode(ARMISD::ADDE, DL, VTs, Op.getOperand(0), Op.getOperand(1), Carry); // Now convert the carry flag into a boolean value. Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); } else { // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we // have to invert the carry first. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), Carry); // This converts the boolean value carry into the carry flag. Carry = ConvertBooleanCarryToCarryFlag(Carry, DAG); // Do the subtraction proper using the carry flag we wanted. Result = DAG.getNode(ARMISD::SUBE, DL, VTs, Op.getOperand(0), Op.getOperand(1), Carry); // Now convert the carry flag into a boolean value. Carry = ConvertCarryFlagToBooleanCarry(Result.getValue(1), VT, DAG); // But the carry returned by ARMISD::SUBE is not a borrow as expected // by ISD::SUBCARRY, so compute 1 - C. Carry = DAG.getNode(ISD::SUB, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32), Carry); } // Return both values. return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, Carry); } SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin()); // For iOS, we want to call an alternative entry point: __sincos_stret, // return values are passed via sret. SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); auto PtrVT = getPointerTy(DAG.getDataLayout()); MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. Type *RetTy = StructType::get(ArgTy, ArgTy); auto &DL = DAG.getDataLayout(); ArgListTy Args; bool ShouldUseSRet = Subtarget->isAPCS_ABI(); SDValue SRet; if (ShouldUseSRet) { // Create stack object for sret. const uint64_t ByteSize = DL.getTypeAllocSize(RetTy); const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy); int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false); SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL)); ArgListEntry Entry; Entry.Node = SRet; Entry.Ty = RetTy->getPointerTo(); Entry.IsSExt = false; Entry.IsZExt = false; Entry.IsSRet = true; Args.push_back(Entry); RetTy = Type::getVoidTy(*DAG.getContext()); } ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.IsSExt = false; Entry.IsZExt = false; Args.push_back(Entry); RTLIB::Libcall LC = (ArgVT == MVT::f64) ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; const char *LibcallName = getLibcallName(LC); CallingConv::ID CC = getLibcallCallingConv(LC); SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL)); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) .setCallee(CC, RetTy, Callee, std::move(Args)) .setDiscardResult(ShouldUseSRet); std::pair CallResult = LowerCallTo(CLI); if (!ShouldUseSRet) return CallResult.first; SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet, MachinePointerInfo()); // Address of cos field. SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, SRet, DAG.getIntPtrConstant(ArgVT.getStoreSize(), dl)); SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add, MachinePointerInfo()); SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, LoadSin.getValue(0), LoadCos.getValue(0)); } SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const { EVT VT = Op.getValueType(); assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected type for custom lowering DIV"); SDLoc dl(Op); const auto &DL = DAG.getDataLayout(); const auto &TLI = DAG.getTargetLoweringInfo(); const char *Name = nullptr; if (Signed) Name = (VT == MVT::i32) ? "__rt_sdiv" : "__rt_sdiv64"; else Name = (VT == MVT::i32) ? "__rt_udiv" : "__rt_udiv64"; SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL)); ARMTargetLowering::ArgListTy Args; for (auto AI : {1, 0}) { ArgListEntry Arg; Arg.Node = Op.getOperand(AI); Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext()); Args.push_back(Arg); } CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(Chain) .setCallee(CallingConv::ARM_AAPCS_VFP, VT.getTypeForEVT(*DAG.getContext()), ES, std::move(Args)); return LowerCallTo(CLI).first; } SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const { assert(Op.getValueType() == MVT::i32 && "unexpected type for custom lowering DIV"); SDLoc dl(Op); SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other, DAG.getEntryNode(), Op.getOperand(1)); return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); } static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) { SDLoc DL(N); SDValue Op = N->getOperand(1); if (N->getValueType(0) == MVT::i32) return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, DAG.getConstant(0, DL, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Op, DAG.getConstant(1, DL, MVT::i32)); return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi)); } void ARMTargetLowering::ExpandDIV_Windows( SDValue Op, SelectionDAG &DAG, bool Signed, SmallVectorImpl &Results) const { const auto &DL = DAG.getDataLayout(); const auto &TLI = DAG.getTargetLoweringInfo(); assert(Op.getValueType() == MVT::i64 && "unexpected type for custom lowering DIV"); SDLoc dl(Op); SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode()); SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK); SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result); SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result, DAG.getConstant(32, dl, TLI.getPointerTy(DL))); Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper); Results.push_back(Lower); Results.push_back(Upper); } static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) { if (isStrongerThanMonotonic(cast(Op)->getOrdering())) // Acquire/Release load/store is not legal for targets without a dmb or // equivalent available. return SDValue(); // Monotonic load/store is legal for all targets. return Op; } static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { SDLoc DL(N); // Under Power Management extensions, the cycle-count is: // mrc p15, #0, , c9, c13, #0 SDValue Ops[] = { N->getOperand(0), // Chain DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32), DAG.getConstant(15, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32), DAG.getConstant(9, DL, MVT::i32), DAG.getConstant(13, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i32) }; SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, DAG.getVTList(MVT::i32, MVT::Other), Ops); Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, DAG.getConstant(0, DL, MVT::i32))); Results.push_back(Cycles32.getValue(1)); } static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) { SDLoc dl(V.getNode()); SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i32); SDValue VHi = DAG.getAnyExtOrTrunc( DAG.getNode(ISD::SRL, dl, MVT::i64, V, DAG.getConstant(32, dl, MVT::i32)), dl, MVT::i32); bool isBigEndian = DAG.getDataLayout().isBigEndian(); if (isBigEndian) std::swap (VLo, VHi); SDValue RegClass = DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32); SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32); SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32); const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 }; return SDValue( DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0); } static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl & Results, SelectionDAG &DAG) { assert(N->getValueType(0) == MVT::i64 && "AtomicCmpSwap on types less than 64 should be legal"); SDValue Ops[] = {N->getOperand(1), createGPRPairNode(DAG, N->getOperand(2)), createGPRPairNode(DAG, N->getOperand(3)), N->getOperand(0)}; SDNode *CmpSwap = DAG.getMachineNode( ARM::CMP_SWAP_64, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other), Ops); MachineFunction &MF = DAG.getMachineFunction(); MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); MemOp[0] = cast(N)->getMemOperand(); cast(CmpSwap)->setMemRefs(MemOp, MemOp + 1); bool isBigEndian = DAG.getDataLayout().isBigEndian(); Results.push_back( DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0, SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); Results.push_back( DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1, SDLoc(N), MVT::i32, SDValue(CmpSwap, 0))); Results.push_back(SDValue(CmpSwap, 2)); } static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget, SelectionDAG &DAG) { const auto &TLI = DAG.getTargetLoweringInfo(); assert(Subtarget.getTargetTriple().isOSMSVCRT() && "Custom lowering is MSVCRT specific!"); SDLoc dl(Op); SDValue Val = Op.getOperand(0); MVT Ty = Val->getSimpleValueType(0); SDValue Exponent = DAG.getNode(ISD::SINT_TO_FP, dl, Ty, Op.getOperand(1)); SDValue Callee = DAG.getExternalSymbol(Ty == MVT::f32 ? "powf" : "pow", TLI.getPointerTy(DAG.getDataLayout())); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Val; Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext()); Entry.IsZExt = true; Args.push_back(Entry); Entry.Node = Exponent; Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext()); Entry.IsZExt = true; Args.push_back(Entry); Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext()); // In the in-chain to the call is the entry node If we are emitting a // tailcall, the chain will be mutated if the node has a non-entry input // chain. SDValue InChain = DAG.getEntryNode(); SDValue TCChain = InChain; const Function &F = DAG.getMachineFunction().getFunction(); bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) && F.getReturnType() == LCRTy; if (IsTC) InChain = TCChain; TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(InChain) .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args)) .setTailCall(IsTC); std::pair CI = TLI.LowerCallTo(CLI); // Return the chain (the DAG root) if it is a tail call return !CI.second.getNode() ? DAG.getRoot() : CI.first; } SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump()); switch (Op.getOpcode()) { default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::BR_JT: return LowerBR_JT(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget); case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget); case ISD::SHL: case ISD::SRL: case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget); case ISD::SREM: return LowerREM(Op.getNode(), DAG); case ISD::UREM: return LowerREM(Op.getNode(), DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget); case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget); case ISD::SETCC: return LowerVSETCC(Op, DAG); case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); case ISD::SDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ true); return LowerSDIV(Op, DAG); case ISD::UDIV: if (Subtarget->isTargetWindows() && !Op.getValueType().isVector()) return LowerDIV_Windows(Op, DAG, /* Signed */ false); return LowerUDIV(Op, DAG); case ISD::ADDCARRY: case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG); case ISD::SADDO: case ISD::SSUBO: return LowerSignedALUO(Op, DAG); case ISD::UADDO: case ISD::USUBO: return LowerUnsignedALUO(Op, DAG); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); case ISD::SDIVREM: case ISD::UDIVREM: return LowerDivRem(Op, DAG); case ISD::DYNAMIC_STACKALLOC: if (Subtarget->isTargetWindows()) return LowerDYNAMIC_STACKALLOC(Op, DAG); llvm_unreachable("Don't know how to custom lower this!"); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG); case ARMISD::WIN__DBZCHK: return SDValue(); } } static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) { unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); unsigned Opc = 0; if (IntNo == Intrinsic::arm_smlald) Opc = ARMISD::SMLALD; else if (IntNo == Intrinsic::arm_smlaldx) Opc = ARMISD::SMLALDX; else if (IntNo == Intrinsic::arm_smlsld) Opc = ARMISD::SMLSLD; else if (IntNo == Intrinsic::arm_smlsldx) Opc = ARMISD::SMLSLDX; else return; SDLoc dl(N); SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), DAG.getConstant(0, dl, MVT::i32)); SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3), DAG.getConstant(1, dl, MVT::i32)); SDValue LongMul = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::i32), N->getOperand(1), N->getOperand(2), Lo, Hi); Results.push_back(LongMul.getValue(0)); Results.push_back(LongMul.getValue(1)); } /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. void ARMTargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { SDValue Res; switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this!"); case ISD::READ_REGISTER: ExpandREAD_REGISTER(N, Results, DAG); break; case ISD::BITCAST: Res = ExpandBITCAST(N, DAG, Subtarget); break; case ISD::SRL: case ISD::SRA: Res = Expand64BitShift(N, DAG, Subtarget); break; case ISD::SREM: case ISD::UREM: Res = LowerREM(N, DAG); break; case ISD::SDIVREM: case ISD::UDIVREM: Res = LowerDivRem(SDValue(N, 0), DAG); assert(Res.getNumOperands() == 2 && "DivRem needs two values"); Results.push_back(Res.getValue(0)); Results.push_back(Res.getValue(1)); return; case ISD::READCYCLECOUNTER: ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget); return; case ISD::UDIV: case ISD::SDIV: assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows"); return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV, Results); case ISD::ATOMIC_CMP_SWAP: ReplaceCMP_SWAP_64Results(N, Results, DAG); return; case ISD::INTRINSIC_WO_CHAIN: return ReplaceLongIntrinsic(N, Results, DAG); } if (Res.getNode()) Results.push_back(Res); } //===----------------------------------------------------------------------===// // ARM Scheduler Hooks //===----------------------------------------------------------------------===// /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and /// registers the function context. void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const { assert(!Subtarget->isROPI() && !Subtarget->isRWPI() && "ROPI/RWPI not currently supported with SjLj"); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); MachineConstantPool *MCP = MF->getConstantPool(); ARMFunctionInfo *AFI = MF->getInfo(); const Function &F = MF->getFunction(); bool isThumb = Subtarget->isThumb(); bool isThumb2 = Subtarget->isThumb2(); unsigned PCLabelId = AFI->createPICLabelUId(); unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj); unsigned CPI = MCP->getConstantPoolIndex(CPV, 4); const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; // Grab constant pool and fixed stack memory operands. MachineMemOperand *CPMMO = MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF), MachineMemOperand::MOLoad, 4, 4); MachineMemOperand *FIMMOSt = MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, 4); // Load the address of the dispatch MBB into the jump buffer. if (isThumb2) { // Incoming value: jbuf // ldr.n r5, LCPI1_1 // orr r5, r5, #1 // add r5, pc // str r5, [$jbuf, #+4] ; &jbuf[1] unsigned NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); // Set the low bit because of thumb mode. unsigned NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(0x01) .add(predOps(ARMCC::AL)) .add(condCodeOp()); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3) .addReg(NewVReg2, RegState::Kill) .addImm(PCLabelId); BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12)) .addReg(NewVReg3, RegState::Kill) .addFrameIndex(FI) .addImm(36) // &jbuf[1] :: pc .addMemOperand(FIMMOSt) .add(predOps(ARMCC::AL)); } else if (isThumb) { // Incoming value: jbuf // ldr.n r1, LCPI1_4 // add r1, pc // mov r2, #1 // orrs r1, r2 // add r2, $jbuf, #+4 ; &jbuf[1] // str r1, [r2] unsigned NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1) .addConstantPoolIndex(CPI) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); unsigned NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId); // Set the low bit because of thumb mode. unsigned NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3) .addReg(ARM::CPSR, RegState::Define) .addImm(1) .add(predOps(ARMCC::AL)); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3, RegState::Kill) .add(predOps(ARMCC::AL)); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5) .addFrameIndex(FI) .addImm(36); // &jbuf[1] :: pc BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi)) .addReg(NewVReg4, RegState::Kill) .addReg(NewVReg5, RegState::Kill) .addImm(0) .addMemOperand(FIMMOSt) .add(predOps(ARMCC::AL)); } else { // Incoming value: jbuf // ldr r1, LCPI1_1 // add r1, pc, r1 // str r1, [$jbuf, #+4] ; &jbuf[1] unsigned NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1) .addConstantPoolIndex(CPI) .addImm(0) .addMemOperand(CPMMO) .add(predOps(ARMCC::AL)); unsigned NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2) .addReg(NewVReg1, RegState::Kill) .addImm(PCLabelId) .add(predOps(ARMCC::AL)); BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12)) .addReg(NewVReg2, RegState::Kill) .addFrameIndex(FI) .addImm(36) // &jbuf[1] :: pc .addMemOperand(FIMMOSt) .add(predOps(ARMCC::AL)); } } void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); MachineRegisterInfo *MRI = &MF->getRegInfo(); MachineFrameInfo &MFI = MF->getFrameInfo(); int FI = MFI.getFunctionContextIndex(); const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass : &ARM::GPRnopcRegClass; // Get a mapping of the call site numbers to all of the landing pads they're // associated with. DenseMap> CallSiteNumToLPad; unsigned MaxCSNum = 0; for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E; ++BB) { if (!BB->isEHPad()) continue; // FIXME: We should assert that the EH_LABEL is the first MI in the landing // pad. for (MachineBasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) { if (!II->isEHLabel()) continue; MCSymbol *Sym = II->getOperand(0).getMCSymbol(); if (!MF->hasCallSiteLandingPad(Sym)) continue; SmallVectorImpl &CallSiteIdxs = MF->getCallSiteLandingPad(Sym); for (SmallVectorImpl::iterator CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end(); CSI != CSE; ++CSI) { CallSiteNumToLPad[*CSI].push_back(&*BB); MaxCSNum = std::max(MaxCSNum, *CSI); } break; } } // Get an ordered list of the machine basic blocks for the jump table. std::vector LPadList; SmallPtrSet InvokeBBs; LPadList.reserve(CallSiteNumToLPad.size()); for (unsigned I = 1; I <= MaxCSNum; ++I) { SmallVectorImpl &MBBList = CallSiteNumToLPad[I]; for (SmallVectorImpl::iterator II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) { LPadList.push_back(*II); InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end()); } } assert(!LPadList.empty() && "No landing pad destinations for the dispatch jump table!"); // Create the jump table and associated information. MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline); unsigned MJTI = JTI->createJumpTableIndex(LPadList); // Create the MBBs for the dispatch code. // Shove the dispatch's address into the return slot in the function context. MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); DispatchBB->setIsEHPad(); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); unsigned trap_opcode; if (Subtarget->isThumb()) trap_opcode = ARM::tTRAP; else trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP; BuildMI(TrapBB, dl, TII->get(trap_opcode)); DispatchBB->addSuccessor(TrapBB); MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); DispatchBB->addSuccessor(DispContBB); // Insert and MBBs. MF->insert(MF->end(), DispatchBB); MF->insert(MF->end(), DispContBB); MF->insert(MF->end(), TrapBB); // Insert code into the entry block that creates and registers the function // context. SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI); MachineMemOperand *FIMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); MachineInstrBuilder MIB; MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup)); const ARMBaseInstrInfo *AII = static_cast(TII); const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); // Add a register mask with no preserved registers. This results in all // registers being marked as clobbered. This can't work if the dispatch block // is in a Thumb1 function and is linked with ARM code which uses the FP // registers, as there is no way to preserve the FP registers in Thumb1 mode. MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF)); bool IsPositionIndependent = isPositionIndependent(); unsigned NumLPads = LPadList.size(); if (Subtarget->isThumb2()) { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) .addMemOperand(FIMMOLd) .add(predOps(ARMCC::AL)); if (NumLPads < 256) { BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri)) .addReg(NewVReg1) .addImm(LPadList.size()) .add(predOps(ARMCC::AL)); } else { unsigned VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1) .addImm(NumLPads & 0xFFFF) .add(predOps(ARMCC::AL)); unsigned VReg2 = VReg1; if ((NumLPads & 0xFFFF0000) != 0) { VReg2 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2) .addReg(VReg1) .addImm(NumLPads >> 16) .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr)) .addReg(NewVReg1) .addReg(VReg2) .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc)) .addMBB(TrapBB) .addImm(ARMCC::HI) .addReg(ARM::CPSR); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg1) .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) .add(predOps(ARMCC::AL)) .add(condCodeOp()); BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT)) .addReg(NewVReg4, RegState::Kill) .addReg(NewVReg1) .addJumpTableIndex(MJTI); } else if (Subtarget->isThumb()) { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1) .addFrameIndex(FI) .addImm(1) .addMemOperand(FIMMOLd) .add(predOps(ARMCC::AL)); if (NumLPads < 256) { BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8)) .addReg(NewVReg1) .addImm(NumLPads) .add(predOps(ARMCC::AL)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci)) .addReg(VReg1, RegState::Define) .addConstantPoolIndex(Idx) .add(predOps(ARMCC::AL)); BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr)) .addReg(NewVReg1) .addReg(VReg1) .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::tBcc)) .addMBB(TrapBB) .addImm(ARMCC::HI) .addReg(ARM::CPSR); unsigned NewVReg2 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg1) .addImm(2) .add(predOps(ARMCC::AL)); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg2, RegState::Kill) .addReg(NewVReg3) .add(predOps(ARMCC::AL)); MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5) .addReg(NewVReg4, RegState::Kill) .addImm(0) .addMemOperand(JTMMOLd) .add(predOps(ARMCC::AL)); unsigned NewVReg6 = NewVReg5; if (IsPositionIndependent) { NewVReg6 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6) .addReg(ARM::CPSR, RegState::Define) .addReg(NewVReg5, RegState::Kill) .addReg(NewVReg3) .add(predOps(ARMCC::AL)); } BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr)) .addReg(NewVReg6, RegState::Kill) .addJumpTableIndex(MJTI); } else { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1) .addFrameIndex(FI) .addImm(4) .addMemOperand(FIMMOLd) .add(predOps(ARMCC::AL)); if (NumLPads < 256) { BuildMI(DispatchBB, dl, TII->get(ARM::CMPri)) .addReg(NewVReg1) .addImm(NumLPads) .add(predOps(ARMCC::AL)); } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) { unsigned VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1) .addImm(NumLPads & 0xFFFF) .add(predOps(ARMCC::AL)); unsigned VReg2 = VReg1; if ((NumLPads & 0xFFFF0000) != 0) { VReg2 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2) .addReg(VReg1) .addImm(NumLPads >> 16) .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) .addReg(NewVReg1) .addReg(VReg2) .add(predOps(ARMCC::AL)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); const Constant *C = ConstantInt::get(Int32Ty, NumLPads); // MachineConstantPool wants an explicit alignment. unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); unsigned VReg1 = MRI->createVirtualRegister(TRC); BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp)) .addReg(VReg1, RegState::Define) .addConstantPoolIndex(Idx) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr)) .addReg(NewVReg1) .addReg(VReg1, RegState::Kill) .add(predOps(ARMCC::AL)); } BuildMI(DispatchBB, dl, TII->get(ARM::Bcc)) .addMBB(TrapBB) .addImm(ARMCC::HI) .addReg(ARM::CPSR); unsigned NewVReg3 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3) .addReg(NewVReg1) .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2)) .add(predOps(ARMCC::AL)) .add(condCodeOp()); unsigned NewVReg4 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4) .addJumpTableIndex(MJTI) .add(predOps(ARMCC::AL)); MachineMemOperand *JTMMOLd = MF->getMachineMemOperand( MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4); unsigned NewVReg5 = MRI->createVirtualRegister(TRC); BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5) .addReg(NewVReg3, RegState::Kill) .addReg(NewVReg4) .addImm(0) .addMemOperand(JTMMOLd) .add(predOps(ARMCC::AL)); if (IsPositionIndependent) { BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd)) .addReg(NewVReg5, RegState::Kill) .addReg(NewVReg4) .addJumpTableIndex(MJTI); } else { BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr)) .addReg(NewVReg5, RegState::Kill) .addJumpTableIndex(MJTI); } } // Add the jump table entries as successors to the MBB. SmallPtrSet SeenMBBs; for (std::vector::iterator I = LPadList.begin(), E = LPadList.end(); I != E; ++I) { MachineBasicBlock *CurMBB = *I; if (SeenMBBs.insert(CurMBB).second) DispContBB->addSuccessor(CurMBB); } // N.B. the order the invoke BBs are processed in doesn't matter here. const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF); SmallVector MBBLPads; for (MachineBasicBlock *BB : InvokeBBs) { // Remove the landing pad successor from the invoke block and replace it // with the new dispatch block. SmallVector Successors(BB->succ_begin(), BB->succ_end()); while (!Successors.empty()) { MachineBasicBlock *SMBB = Successors.pop_back_val(); if (SMBB->isEHPad()) { BB->removeSuccessor(SMBB); MBBLPads.push_back(SMBB); } } BB->addSuccessor(DispatchBB, BranchProbability::getZero()); BB->normalizeSuccProbs(); // Find the invoke call and mark all of the callee-saved registers as // 'implicit defined' so that they're spilled. This prevents code from // moving instructions to before the EH block, where they will never be // executed. for (MachineBasicBlock::reverse_iterator II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) { if (!II->isCall()) continue; DenseMap DefRegs; for (MachineInstr::mop_iterator OI = II->operands_begin(), OE = II->operands_end(); OI != OE; ++OI) { if (!OI->isReg()) continue; DefRegs[OI->getReg()] = true; } MachineInstrBuilder MIB(*MF, &*II); for (unsigned i = 0; SavedRegs[i] != 0; ++i) { unsigned Reg = SavedRegs[i]; if (Subtarget->isThumb2() && !ARM::tGPRRegClass.contains(Reg) && !ARM::hGPRRegClass.contains(Reg)) continue; if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg)) continue; if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg)) continue; if (!DefRegs[Reg]) MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); } break; } } // Mark all former landing pads as non-landing pads. The dispatch is the only // landing pad now. for (SmallVectorImpl::iterator I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I) (*I)->setIsEHPad(false); // The instruction is gone now. MI.eraseFromParent(); } static MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) { for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) if (*I != Succ) return *I; llvm_unreachable("Expecting a BB with two successors!"); } /// Return the load opcode for a given load size. If load size >= 8, /// neon opcode will be returned. static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) { if (LdSize >= 8) return LdSize == 16 ? ARM::VLD1q32wb_fixed : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0; if (IsThumb1) return LdSize == 4 ? ARM::tLDRi : LdSize == 2 ? ARM::tLDRHi : LdSize == 1 ? ARM::tLDRBi : 0; if (IsThumb2) return LdSize == 4 ? ARM::t2LDR_POST : LdSize == 2 ? ARM::t2LDRH_POST : LdSize == 1 ? ARM::t2LDRB_POST : 0; return LdSize == 4 ? ARM::LDR_POST_IMM : LdSize == 2 ? ARM::LDRH_POST : LdSize == 1 ? ARM::LDRB_POST_IMM : 0; } /// Return the store opcode for a given store size. If store size >= 8, /// neon opcode will be returned. static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) { if (StSize >= 8) return StSize == 16 ? ARM::VST1q32wb_fixed : StSize == 8 ? ARM::VST1d32wb_fixed : 0; if (IsThumb1) return StSize == 4 ? ARM::tSTRi : StSize == 2 ? ARM::tSTRHi : StSize == 1 ? ARM::tSTRBi : 0; if (IsThumb2) return StSize == 4 ? ARM::t2STR_POST : StSize == 2 ? ARM::t2STRH_POST : StSize == 1 ? ARM::t2STRB_POST : 0; return StSize == 4 ? ARM::STR_POST_IMM : StSize == 2 ? ARM::STRH_POST : StSize == 1 ? ARM::STRB_POST_IMM : 0; } /// Emit a post-increment load operation with given size. The instructions /// will be added to BB at Pos. static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2) { unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2); assert(LdOpc != 0 && "Should have a load opcode"); if (LdSize >= 8) { BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) .addReg(AddrOut, RegState::Define) .addReg(AddrIn) .addImm(0) .add(predOps(ARMCC::AL)); } else if (IsThumb1) { // load + update AddrIn BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) .addReg(AddrIn) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) .add(t1CondCodeOp()) .addReg(AddrIn) .addImm(LdSize) .add(predOps(ARMCC::AL)); } else if (IsThumb2) { BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) .addReg(AddrOut, RegState::Define) .addReg(AddrIn) .addImm(LdSize) .add(predOps(ARMCC::AL)); } else { // arm BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data) .addReg(AddrOut, RegState::Define) .addReg(AddrIn) .addReg(0) .addImm(LdSize) .add(predOps(ARMCC::AL)); } } /// Emit a post-increment store operation with given size. The instructions /// will be added to BB at Pos. static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2) { unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2); assert(StOpc != 0 && "Should have a store opcode"); if (StSize >= 8) { BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) .addReg(AddrIn) .addImm(0) .addReg(Data) .add(predOps(ARMCC::AL)); } else if (IsThumb1) { // store + update AddrIn BuildMI(*BB, Pos, dl, TII->get(StOpc)) .addReg(Data) .addReg(AddrIn) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut) .add(t1CondCodeOp()) .addReg(AddrIn) .addImm(StSize) .add(predOps(ARMCC::AL)); } else if (IsThumb2) { BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) .addReg(Data) .addReg(AddrIn) .addImm(StSize) .add(predOps(ARMCC::AL)); } else { // arm BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut) .addReg(Data) .addReg(AddrIn) .addReg(0) .addImm(StSize) .add(predOps(ARMCC::AL)); } } MachineBasicBlock * ARMTargetLowering::EmitStructByval(MachineInstr &MI, MachineBasicBlock *BB) const { // This pseudo instruction has 3 operands: dst, src, size // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold(). // Otherwise, we will generate unrolled scalar copies. const TargetInstrInfo *TII = Subtarget->getInstrInfo(); const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); unsigned dest = MI.getOperand(0).getReg(); unsigned src = MI.getOperand(1).getReg(); unsigned SizeVal = MI.getOperand(2).getImm(); unsigned Align = MI.getOperand(3).getImm(); DebugLoc dl = MI.getDebugLoc(); MachineFunction *MF = BB->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); unsigned UnitSize = 0; const TargetRegisterClass *TRC = nullptr; const TargetRegisterClass *VecTRC = nullptr; bool IsThumb1 = Subtarget->isThumb1Only(); bool IsThumb2 = Subtarget->isThumb2(); bool IsThumb = Subtarget->isThumb(); if (Align & 1) { UnitSize = 1; } else if (Align & 2) { UnitSize = 2; } else { // Check whether we can use NEON instructions. if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) && Subtarget->hasNEON()) { if ((Align % 16 == 0) && SizeVal >= 16) UnitSize = 16; else if ((Align % 8 == 0) && SizeVal >= 8) UnitSize = 8; } // Can't use NEON instructions. if (UnitSize == 0) UnitSize = 4; } // Select the correct opcode and register class for unit size load/store bool IsNeon = UnitSize >= 8; TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass; if (IsNeon) VecTRC = UnitSize == 16 ? &ARM::DPairRegClass : UnitSize == 8 ? &ARM::DPRRegClass : nullptr; unsigned BytesLeft = SizeVal % UnitSize; unsigned LoopSize = SizeVal - BytesLeft; if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) { // Use LDR and STR to copy. // [scratch, srcOut] = LDR_POST(srcIn, UnitSize) // [destOut] = STR_POST(scratch, destIn, UnitSize) unsigned srcIn = src; unsigned destIn = dest; for (unsigned i = 0; i < LoopSize; i+=UnitSize) { unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut, IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } // Handle the leftover bytes with LDRB and STRB. // [scratch, srcOut] = LDRB_POST(srcIn, 1) // [destOut] = STRB_POST(scratch, destIn, 1) for (unsigned i = 0; i < BytesLeft; i++) { unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); unsigned scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut, IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } MI.eraseFromParent(); // The instruction is gone now. return BB; } // Expand the pseudo op to a loop. // thisMBB: // ... // movw varEnd, # --> with thumb2 // movt varEnd, # // ldrcp varEnd, idx --> without thumb2 // fallthrough --> loopMBB // loopMBB: // PHI varPhi, varEnd, varLoop // PHI srcPhi, src, srcLoop // PHI destPhi, dst, destLoop // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) // [destLoop] = STR_POST(scratch, destPhi, UnitSize) // subs varLoop, varPhi, #UnitSize // bne loopMBB // fallthrough --> exitMBB // exitMBB: // epilogue to handle left-over bytes // [scratch, srcOut] = LDRB_POST(srcLoop, 1) // [destOut] = STRB_POST(scratch, destLoop, 1) MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, loopMBB); MF->insert(It, exitMBB); // Transfer the remainder of BB and its successor edges to exitMBB. exitMBB->splice(exitMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); exitMBB->transferSuccessorsAndUpdatePHIs(BB); // Load an immediate to varEnd. unsigned varEnd = MRI.createVirtualRegister(TRC); if (Subtarget->useMovt(*MF)) { unsigned Vtmp = varEnd; if ((LoopSize & 0xFFFF0000) != 0) Vtmp = MRI.createVirtualRegister(TRC); BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp) .addImm(LoopSize & 0xFFFF) .add(predOps(ARMCC::AL)); if ((LoopSize & 0xFFFF0000) != 0) BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd) .addReg(Vtmp) .addImm(LoopSize >> 16) .add(predOps(ARMCC::AL)); } else { MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext()); const Constant *C = ConstantInt::get(Int32Ty, LoopSize); // MachineConstantPool wants an explicit alignment. unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty); if (Align == 0) Align = MF->getDataLayout().getTypeAllocSize(C->getType()); unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align); if (IsThumb) BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)) .addReg(varEnd, RegState::Define) .addConstantPoolIndex(Idx) .add(predOps(ARMCC::AL)); else BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)) .addReg(varEnd, RegState::Define) .addConstantPoolIndex(Idx) .addImm(0) .add(predOps(ARMCC::AL)); } BB->addSuccessor(loopMBB); // Generate the loop body: // varPhi = PHI(varLoop, varEnd) // srcPhi = PHI(srcLoop, src) // destPhi = PHI(destLoop, dst) MachineBasicBlock *entryBB = BB; BB = loopMBB; unsigned varLoop = MRI.createVirtualRegister(TRC); unsigned varPhi = MRI.createVirtualRegister(TRC); unsigned srcLoop = MRI.createVirtualRegister(TRC); unsigned srcPhi = MRI.createVirtualRegister(TRC); unsigned destLoop = MRI.createVirtualRegister(TRC); unsigned destPhi = MRI.createVirtualRegister(TRC); BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi) .addReg(varLoop).addMBB(loopMBB) .addReg(varEnd).addMBB(entryBB); BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi) .addReg(srcLoop).addMBB(loopMBB) .addReg(src).addMBB(entryBB); BuildMI(BB, dl, TII->get(ARM::PHI), destPhi) .addReg(destLoop).addMBB(loopMBB) .addReg(dest).addMBB(entryBB); // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize) // [destLoop] = STR_POST(scratch, destPhi, UnitSiz) unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC); emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop, IsThumb1, IsThumb2); emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop, IsThumb1, IsThumb2); // Decrement loop variable by UnitSize. if (IsThumb1) { BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop) .add(t1CondCodeOp()) .addReg(varPhi) .addImm(UnitSize) .add(predOps(ARMCC::AL)); } else { MachineInstrBuilder MIB = BuildMI(*BB, BB->end(), dl, TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop); MIB.addReg(varPhi) .addImm(UnitSize) .add(predOps(ARMCC::AL)) .add(condCodeOp()); MIB->getOperand(5).setReg(ARM::CPSR); MIB->getOperand(5).setIsDef(true); } BuildMI(*BB, BB->end(), dl, TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc)) .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR); // loopMBB can loop back to loopMBB or fall through to exitMBB. BB->addSuccessor(loopMBB); BB->addSuccessor(exitMBB); // Add epilogue to handle BytesLeft. BB = exitMBB; auto StartOfExit = exitMBB->begin(); // [scratch, srcOut] = LDRB_POST(srcLoop, 1) // [destOut] = STRB_POST(scratch, destLoop, 1) unsigned srcIn = srcLoop; unsigned destIn = destLoop; for (unsigned i = 0; i < BytesLeft; i++) { unsigned srcOut = MRI.createVirtualRegister(TRC); unsigned destOut = MRI.createVirtualRegister(TRC); unsigned scratch = MRI.createVirtualRegister(TRC); emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut, IsThumb1, IsThumb2); emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut, IsThumb1, IsThumb2); srcIn = srcOut; destIn = destOut; } MI.eraseFromParent(); // The instruction is gone now. return BB; } MachineBasicBlock * ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI, MachineBasicBlock *MBB) const { const TargetMachine &TM = getTargetMachine(); const TargetInstrInfo &TII = *Subtarget->getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); assert(Subtarget->isTargetWindows() && "__chkstk is only supported on Windows"); assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode"); // __chkstk takes the number of words to allocate on the stack in R4, and // returns the stack adjustment in number of bytes in R4. This will not // clober any other registers (other than the obvious lr). // // Although, technically, IP should be considered a register which may be // clobbered, the call itself will not touch it. Windows on ARM is a pure // thumb-2 environment, so there is no interworking required. As a result, we // do not expect a veneer to be emitted by the linker, clobbering IP. // // Each module receives its own copy of __chkstk, so no import thunk is // required, again, ensuring that IP is not clobbered. // // Finally, although some linkers may theoretically provide a trampoline for // out of range calls (which is quite common due to a 32M range limitation of // branches for Thumb), we can generate the long-call version via // -mcmodel=large, alleviating the need for the trampoline which may clobber // IP. switch (TM.getCodeModel()) { case CodeModel::Small: case CodeModel::Medium: case CodeModel::Kernel: BuildMI(*MBB, MI, DL, TII.get(ARM::tBL)) .add(predOps(ARMCC::AL)) .addExternalSymbol("__chkstk") .addReg(ARM::R4, RegState::Implicit | RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Define) .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead) .addReg(ARM::CPSR, RegState::Implicit | RegState::Define | RegState::Dead); break; case CodeModel::Large: { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); unsigned Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass); BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg) .addExternalSymbol("__chkstk"); BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr)) .add(predOps(ARMCC::AL)) .addReg(Reg, RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Kill) .addReg(ARM::R4, RegState::Implicit | RegState::Define) .addReg(ARM::R12, RegState::Implicit | RegState::Define | RegState::Dead) .addReg(ARM::CPSR, RegState::Implicit | RegState::Define | RegState::Dead); break; } } BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP) .addReg(ARM::SP, RegState::Kill) .addReg(ARM::R4, RegState::Kill) .setMIFlags(MachineInstr::FrameSetup) .add(predOps(ARMCC::AL)) .add(condCodeOp()); MI.eraseFromParent(); return MBB; } MachineBasicBlock * ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI, MachineBasicBlock *MBB) const { DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MBB->getParent(); const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock(); MF->insert(++MBB->getIterator(), ContBB); ContBB->splice(ContBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); ContBB->transferSuccessorsAndUpdatePHIs(MBB); MBB->addSuccessor(ContBB); MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0)); MF->push_back(TrapBB); MBB->addSuccessor(TrapBB); BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8)) .addReg(MI.getOperand(0).getReg()) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc)) .addMBB(TrapBB) .addImm(ARMCC::EQ) .addReg(ARM::CPSR); MI.eraseFromParent(); return ContBB; } MachineBasicBlock * ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { const TargetInstrInfo *TII = Subtarget->getInstrInfo(); DebugLoc dl = MI.getDebugLoc(); bool isThumb2 = Subtarget->isThumb2(); switch (MI.getOpcode()) { default: { MI.print(errs()); llvm_unreachable("Unexpected instr type to insert"); } // Thumb1 post-indexed loads are really just single-register LDMs. case ARM::tLDR_postidx: { MachineOperand Def(MI.getOperand(1)); BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD)) .add(Def) // Rn_wb .add(MI.getOperand(2)) // Rn .add(MI.getOperand(3)) // PredImm .add(MI.getOperand(4)) // PredReg .add(MI.getOperand(0)); // Rt MI.eraseFromParent(); return BB; } // The Thumb2 pre-indexed stores have the same MI operands, they just // define them differently in the .td files from the isel patterns, so // they need pseudos. case ARM::t2STR_preidx: MI.setDesc(TII->get(ARM::t2STR_PRE)); return BB; case ARM::t2STRB_preidx: MI.setDesc(TII->get(ARM::t2STRB_PRE)); return BB; case ARM::t2STRH_preidx: MI.setDesc(TII->get(ARM::t2STRH_PRE)); return BB; case ARM::STRi_preidx: case ARM::STRBi_preidx: { unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM : ARM::STRB_PRE_IMM; // Decode the offset. unsigned Offset = MI.getOperand(4).getImm(); bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub; Offset = ARM_AM::getAM2Offset(Offset); if (isSub) Offset = -Offset; MachineMemOperand *MMO = *MI.memoperands_begin(); BuildMI(*BB, MI, dl, TII->get(NewOpc)) .add(MI.getOperand(0)) // Rn_wb .add(MI.getOperand(1)) // Rt .add(MI.getOperand(2)) // Rn .addImm(Offset) // offset (skip GPR==zero_reg) .add(MI.getOperand(5)) // pred .add(MI.getOperand(6)) .addMemOperand(MMO); MI.eraseFromParent(); return BB; } case ARM::STRr_preidx: case ARM::STRBr_preidx: case ARM::STRH_preidx: { unsigned NewOpc; switch (MI.getOpcode()) { default: llvm_unreachable("unexpected opcode!"); case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break; case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break; case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break; } MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc)); for (unsigned i = 0; i < MI.getNumOperands(); ++i) MIB.add(MI.getOperand(i)); MI.eraseFromParent(); return BB; } case ARM::tMOVCCr_pseudo: { // To "insert" a SELECT_CC instruction, we actually have to insert the // diamond control-flow pattern. The incoming instruction knows the // destination vreg to set, the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator It = ++BB->getIterator(); // thisMBB: // ... // TrueVal = ... // cmpTY ccX, r1, r2 // bCC copy1MBB // fallthrough --> copy0MBB MachineBasicBlock *thisMBB = BB; MachineFunction *F = BB->getParent(); MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); F->insert(It, copy0MBB); F->insert(It, sinkMBB); // Transfer the remainder of BB and its successor edges to sinkMBB. sinkMBB->splice(sinkMBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); sinkMBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(copy0MBB); BB->addSuccessor(sinkMBB); BuildMI(BB, dl, TII->get(ARM::tBcc)) .addMBB(sinkMBB) .addImm(MI.getOperand(3).getImm()) .addReg(MI.getOperand(4).getReg()); // copy0MBB: // %FalseValue = ... // # fallthrough to sinkMBB BB = copy0MBB; // Update machine-CFG edges BB->addSuccessor(sinkMBB); // sinkMBB: // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] // ... BB = sinkMBB; BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg()) .addReg(MI.getOperand(1).getReg()) .addMBB(copy0MBB) .addReg(MI.getOperand(2).getReg()) .addMBB(thisMBB); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } case ARM::BCCi64: case ARM::BCCZi64: { // If there is an unconditional branch to the other successor, remove it. BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end()); // Compare both parts that make up the double comparison separately for // equality. bool RHSisZero = MI.getOpcode() == ARM::BCCZi64; unsigned LHS1 = MI.getOperand(1).getReg(); unsigned LHS2 = MI.getOperand(2).getReg(); if (RHSisZero) { BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(LHS1) .addImm(0) .add(predOps(ARMCC::AL)); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(LHS2).addImm(0) .addImm(ARMCC::EQ).addReg(ARM::CPSR); } else { unsigned RHS1 = MI.getOperand(3).getReg(); unsigned RHS2 = MI.getOperand(4).getReg(); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) .addReg(LHS1) .addReg(RHS1) .add(predOps(ARMCC::AL)); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr)) .addReg(LHS2).addReg(RHS2) .addImm(ARMCC::EQ).addReg(ARM::CPSR); } MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB(); MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB); if (MI.getOperand(0).getImm() == ARMCC::NE) std::swap(destMBB, exitMBB); BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)) .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR); if (isThumb2) BuildMI(BB, dl, TII->get(ARM::t2B)) .addMBB(exitMBB) .add(predOps(ARMCC::AL)); else BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB); MI.eraseFromParent(); // The pseudo instruction is gone now. return BB; } case ARM::Int_eh_sjlj_setjmp: case ARM::Int_eh_sjlj_setjmp_nofp: case ARM::tInt_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp: case ARM::t2Int_eh_sjlj_setjmp_nofp: return BB; case ARM::Int_eh_sjlj_setup_dispatch: EmitSjLjDispatchBlock(MI, BB); return BB; case ARM::ABS: case ARM::t2ABS: { // To insert an ABS instruction, we have to insert the // diamond control-flow pattern. The incoming instruction knows the // source vreg to test against 0, the destination vreg to set, // the condition code register to branch on, the // true/false values to select between, and a branch opcode to use. // It transforms // V1 = ABS V0 // into // V2 = MOVS V0 // BCC (branch to SinkBB if V0 >= 0) // RSBBB: V3 = RSBri V2, 0 (compute ABS if V2 < 0) // SinkBB: V1 = PHI(V2, V3) const BasicBlock *LLVM_BB = BB->getBasicBlock(); MachineFunction::iterator BBI = ++BB->getIterator(); MachineFunction *Fn = BB->getParent(); MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *SinkBB = Fn->CreateMachineBasicBlock(LLVM_BB); Fn->insert(BBI, RSBBB); Fn->insert(BBI, SinkBB); unsigned int ABSSrcReg = MI.getOperand(1).getReg(); unsigned int ABSDstReg = MI.getOperand(0).getReg(); bool ABSSrcKIll = MI.getOperand(1).isKill(); bool isThumb2 = Subtarget->isThumb2(); MachineRegisterInfo &MRI = Fn->getRegInfo(); // In Thumb mode S must not be specified if source register is the SP or // PC and if destination register is the SP, so restrict register class unsigned NewRsbDstReg = MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass); // Transfer the remainder of BB and its successor edges to sinkMBB. SinkBB->splice(SinkBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)), BB->end()); SinkBB->transferSuccessorsAndUpdatePHIs(BB); BB->addSuccessor(RSBBB); BB->addSuccessor(SinkBB); // fall through to SinkMBB RSBBB->addSuccessor(SinkBB); // insert a cmp at the end of BB BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri)) .addReg(ABSSrcReg) .addImm(0) .add(predOps(ARMCC::AL)); // insert a bcc with opposite CC to ARMCC::MI at the end of BB BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB) .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR); // insert rsbri in RSBBB // Note: BCC and rsbri will be converted into predicated rsbmi // by if-conversion pass BuildMI(*RSBBB, RSBBB->begin(), dl, TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg) .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0) .addImm(0) .add(predOps(ARMCC::AL)) .add(condCodeOp()); // insert PHI in SinkBB, // reuse ABSDstReg to not change uses of ABS instruction BuildMI(*SinkBB, SinkBB->begin(), dl, TII->get(ARM::PHI), ABSDstReg) .addReg(NewRsbDstReg).addMBB(RSBBB) .addReg(ABSSrcReg).addMBB(BB); // remove ABS instruction MI.eraseFromParent(); // return last added BB return SinkBB; } case ARM::COPY_STRUCT_BYVAL_I32: ++NumLoopByVals; return EmitStructByval(MI, BB); case ARM::WIN__CHKSTK: return EmitLowered__chkstk(MI, BB); case ARM::WIN__DBZCHK: return EmitLowered__dbzchk(MI, BB); } } /// Attaches vregs to MEMCPY that it will use as scratch registers /// when it is expanded into LDM/STM. This is done as a post-isel lowering /// instead of as a custom inserter because we need the use list from the SDNode. static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node) { bool isThumb1 = Subtarget->isThumb1Only(); DebugLoc DL = MI.getDebugLoc(); MachineFunction *MF = MI.getParent()->getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); MachineInstrBuilder MIB(*MF, MI); // If the new dst/src is unused mark it as dead. if (!Node->hasAnyUseOfValue(0)) { MI.getOperand(0).setIsDead(true); } if (!Node->hasAnyUseOfValue(1)) { MI.getOperand(1).setIsDead(true); } // The MEMCPY both defines and kills the scratch registers. for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) { unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass); MIB.addReg(TmpReg, RegState::Define|RegState::Dead); } } void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { if (MI.getOpcode() == ARM::MEMCPY) { attachMEMCPYScratchRegs(Subtarget, MI, Node); return; } const MCInstrDesc *MCID = &MI.getDesc(); // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB, // RSC. Coming out of isel, they have an implicit CPSR def, but the optional // operand is still set to noreg. If needed, set the optional operand's // register to CPSR, and remove the redundant implicit def. // // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR). // Rename pseudo opcodes. unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode()); unsigned ccOutIdx; if (NewOpc) { const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo(); MCID = &TII->get(NewOpc); assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize() && "converted opcode should be the same except for cc_out" " (and, on Thumb1, pred)"); MI.setDesc(*MCID); // Add the optional cc_out operand MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true)); // On Thumb1, move all input operands to the end, then add the predicate if (Subtarget->isThumb1Only()) { for (unsigned c = MCID->getNumOperands() - 4; c--;) { MI.addOperand(MI.getOperand(1)); MI.RemoveOperand(1); } // Restore the ties for (unsigned i = MI.getNumOperands(); i--;) { const MachineOperand& op = MI.getOperand(i); if (op.isReg() && op.isUse()) { int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO); if (DefIdx != -1) MI.tieOperands(DefIdx, i); } } MI.addOperand(MachineOperand::CreateImm(ARMCC::AL)); MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false)); ccOutIdx = 1; } else ccOutIdx = MCID->getNumOperands() - 1; } else ccOutIdx = MCID->getNumOperands() - 1; // Any ARM instruction that sets the 's' bit should specify an optional // "cc_out" operand in the last operand position. if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) { assert(!NewOpc && "Optional cc_out operand required"); return; } // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it // since we already have an optional CPSR def. bool definesCPSR = false; bool deadCPSR = false; for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI.getOperand(i); if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) { definesCPSR = true; if (MO.isDead()) deadCPSR = true; MI.RemoveOperand(i); break; } } if (!definesCPSR) { assert(!NewOpc && "Optional cc_out operand required"); return; } assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag"); if (deadCPSR) { assert(!MI.getOperand(ccOutIdx).getReg() && "expect uninitialized optional cc_out operand"); // Thumb1 instructions must have the S bit even if the CPSR is dead. if (!Subtarget->isThumb1Only()) return; } // If this instruction was defined with an optional CPSR def and its dag node // had a live implicit CPSR def, then activate the optional CPSR def. MachineOperand &MO = MI.getOperand(ccOutIdx); MO.setReg(ARM::CPSR); MO.setIsDef(true); } //===----------------------------------------------------------------------===// // ARM Optimization Hooks //===----------------------------------------------------------------------===// // Helper function that checks if N is a null or all ones constant. static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) { return AllOnes ? isAllOnesConstant(N) : isNullConstant(N); } // Return true if N is conditionally 0 or all ones. // Detects these expressions where cc is an i1 value: // // (select cc 0, y) [AllOnes=0] // (select cc y, 0) [AllOnes=0] // (zext cc) [AllOnes=0] // (sext cc) [AllOnes=0/1] // (select cc -1, y) [AllOnes=1] // (select cc y, -1) [AllOnes=1] // // Invert is set when N is the null/all ones constant when CC is false. // OtherOp is set to the alternative value of N. static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG) { switch (N->getOpcode()) { default: return false; case ISD::SELECT: { CC = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue N2 = N->getOperand(2); if (isZeroOrAllOnes(N1, AllOnes)) { Invert = false; OtherOp = N2; return true; } if (isZeroOrAllOnes(N2, AllOnes)) { Invert = true; OtherOp = N1; return true; } return false; } case ISD::ZERO_EXTEND: // (zext cc) can never be the all ones value. if (AllOnes) return false; LLVM_FALLTHROUGH; case ISD::SIGN_EXTEND: { SDLoc dl(N); EVT VT = N->getValueType(0); CC = N->getOperand(0); if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC) return false; Invert = !AllOnes; if (AllOnes) // When looking for an AllOnes constant, N is an sext, and the 'other' // value is 0. OtherOp = DAG.getConstant(0, dl, VT); else if (N->getOpcode() == ISD::ZERO_EXTEND) // When looking for a 0 constant, N can be zext or sext. OtherOp = DAG.getConstant(1, dl, VT); else OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl, VT); return true; } } } // Combine a constant select operand into its use: // // (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) // (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) // (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1] // (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) // (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) // // The transform is rejected if the select doesn't have a constant operand that // is null, or all ones when AllOnes is set. // // Also recognize sext/zext from i1: // // (add (zext cc), x) -> (select cc (add x, 1), x) // (add (sext cc), x) -> (select cc (add x, -1), x) // // These transformations eventually create predicated instructions. // // @param N The node to transform. // @param Slct The N operand that is a select. // @param OtherOp The other N operand (x above). // @param DCI Context. // @param AllOnes Require the select constant to be all ones instead of null. // @returns The new node, or SDValue() on failure. static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes = false) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); SDValue NonConstantVal; SDValue CCOp; bool SwapSelectOps; if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps, NonConstantVal, DAG)) return SDValue(); // Slct is now know to be the desired identity constant when CC is true. SDValue TrueVal = OtherOp; SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT, OtherOp, NonConstantVal); // Unless SwapSelectOps says CC should be false. if (SwapSelectOps) std::swap(TrueVal, FalseVal); return DAG.getNode(ISD::SELECT, SDLoc(N), VT, CCOp, TrueVal, FalseVal); } // Attempt combineSelectAndUse on each operand of a commutative operator N. static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); if (N0.getNode()->hasOneUse()) if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes)) return Result; if (N1.getNode()->hasOneUse()) if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes)) return Result; return SDValue(); } static bool IsVUZPShuffleNode(SDNode *N) { // VUZP shuffle node. if (N->getOpcode() == ARMISD::VUZP) return true; // "VUZP" on i32 is an alias for VTRN. if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32) return true; return false; } static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Look for ADD(VUZP.0, VUZP.1). if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() || N0 == N1) return SDValue(); // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD. if (!N->getValueType(0).is64BitVector()) return SDValue(); // Generate vpadd. SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(N); SDNode *Unzip = N0.getNode(); EVT VT = N->getValueType(0); SmallVector Ops; Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl, TLI.getPointerTy(DAG.getDataLayout()))); Ops.push_back(Unzip->getOperand(0)); Ops.push_back(Unzip->getOperand(1)); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); } static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Check for two extended operands. if (!(N0.getOpcode() == ISD::SIGN_EXTEND && N1.getOpcode() == ISD::SIGN_EXTEND) && !(N0.getOpcode() == ISD::ZERO_EXTEND && N1.getOpcode() == ISD::ZERO_EXTEND)) return SDValue(); SDValue N00 = N0.getOperand(0); SDValue N10 = N1.getOperand(0); // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)) if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() || N00 == N10) return SDValue(); // We only recognize Q register paddl here; this can't be reached until // after type legalization. if (!N00.getValueType().is64BitVector() || !N0.getValueType().is128BitVector()) return SDValue(); // Generate vpaddl. SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(N); EVT VT = N->getValueType(0); SmallVector Ops; // Form vpaddl.sN or vpaddl.uN depending on the kind of extension. unsigned Opcode; if (N0.getOpcode() == ISD::SIGN_EXTEND) Opcode = Intrinsic::arm_neon_vpaddls; else Opcode = Intrinsic::arm_neon_vpaddlu; Ops.push_back(DAG.getConstant(Opcode, dl, TLI.getPointerTy(DAG.getDataLayout()))); EVT ElemTy = N00.getValueType().getVectorElementType(); unsigned NumElts = VT.getVectorNumElements(); EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2); SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT, N00.getOperand(0), N00.getOperand(1)); Ops.push_back(Concat); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops); } // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is // much easier to match. static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Only perform optimization if after legalize, and if NEON is available. We // also expected both operands to be BUILD_VECTORs. if (DCI.isBeforeLegalize() || !Subtarget->hasNEON() || N0.getOpcode() != ISD::BUILD_VECTOR || N1.getOpcode() != ISD::BUILD_VECTOR) return SDValue(); // Check output type since VPADDL operand elements can only be 8, 16, or 32. EVT VT = N->getValueType(0); if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64) return SDValue(); // Check that the vector operands are of the right form. // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR // operands, where N is the size of the formed vector. // Each EXTRACT_VECTOR should have the same input vector and odd or even // index such that we have a pair wise add pattern. // Grab the vector that all EXTRACT_VECTOR nodes should be referencing. if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); SDValue Vec = N0->getOperand(0)->getOperand(0); SDNode *V = Vec.getNode(); unsigned nextIndex = 0; // For each operands to the ADD which are BUILD_VECTORs, // check to see if each of their operands are an EXTRACT_VECTOR with // the same vector and appropriate index. for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) { if (N0->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT && N1->getOperand(i)->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue ExtVec0 = N0->getOperand(i); SDValue ExtVec1 = N1->getOperand(i); // First operand is the vector, verify its the same. if (V != ExtVec0->getOperand(0).getNode() || V != ExtVec1->getOperand(0).getNode()) return SDValue(); // Second is the constant, verify its correct. ConstantSDNode *C0 = dyn_cast(ExtVec0->getOperand(1)); ConstantSDNode *C1 = dyn_cast(ExtVec1->getOperand(1)); // For the constant, we want to see all the even or all the odd. if (!C0 || !C1 || C0->getZExtValue() != nextIndex || C1->getZExtValue() != nextIndex+1) return SDValue(); // Increment index. nextIndex+=2; } else return SDValue(); } // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure // we're using the entire input vector, otherwise there's a size/legality // mismatch somewhere. if (nextIndex != Vec.getValueType().getVectorNumElements() || Vec.getValueType().getVectorElementType() == VT.getVectorElementType()) return SDValue(); // Create VPADDL node. SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc dl(N); // Build operand list. SmallVector Ops; Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl, TLI.getPointerTy(DAG.getDataLayout()))); // Input is the vector. Ops.push_back(Vec); // Get widened type and narrowed type. MVT widenType; unsigned numElem = VT.getVectorNumElements(); EVT inputLaneType = Vec.getValueType().getVectorElementType(); switch (inputLaneType.getSimpleVT().SimpleTy) { case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break; case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break; case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break; default: llvm_unreachable("Invalid vector element type for padd optimization."); } SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops); unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE; return DAG.getNode(ExtOp, dl, VT, tmp); } static SDValue findMUL_LOHI(SDValue V) { if (V->getOpcode() == ISD::UMUL_LOHI || V->getOpcode() == ISD::SMUL_LOHI) return V; return SDValue(); } static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { if (Subtarget->isThumb()) { if (!Subtarget->hasDSP()) return SDValue(); } else if (!Subtarget->hasV5TEOps()) return SDValue(); // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and // accumulates the product into a 64-bit value. The 16-bit values will // be sign extended somehow or SRA'd into 32-bit values // (addc (adde (mul 16bit, 16bit), lo), hi) SDValue Mul = AddcNode->getOperand(0); SDValue Lo = AddcNode->getOperand(1); if (Mul.getOpcode() != ISD::MUL) { Lo = AddcNode->getOperand(0); Mul = AddcNode->getOperand(1); if (Mul.getOpcode() != ISD::MUL) return SDValue(); } SDValue SRA = AddeNode->getOperand(0); SDValue Hi = AddeNode->getOperand(1); if (SRA.getOpcode() != ISD::SRA) { SRA = AddeNode->getOperand(1); Hi = AddeNode->getOperand(0); if (SRA.getOpcode() != ISD::SRA) return SDValue(); } if (auto Const = dyn_cast(SRA.getOperand(1))) { if (Const->getZExtValue() != 31) return SDValue(); } else return SDValue(); if (SRA.getOperand(0) != Mul) return SDValue(); SelectionDAG &DAG = DCI.DAG; SDLoc dl(AddcNode); unsigned Opcode = 0; SDValue Op0; SDValue Op1; if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) { Opcode = ARMISD::SMLALBB; Op0 = Mul.getOperand(0); Op1 = Mul.getOperand(1); } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) { Opcode = ARMISD::SMLALBT; Op0 = Mul.getOperand(0); Op1 = Mul.getOperand(1).getOperand(0); } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) { Opcode = ARMISD::SMLALTB; Op0 = Mul.getOperand(0).getOperand(0); Op1 = Mul.getOperand(1); } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) { Opcode = ARMISD::SMLALTT; Op0 = Mul->getOperand(0).getOperand(0); Op1 = Mul->getOperand(1).getOperand(0); } if (!Op0 || !Op1) return SDValue(); SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32), Op0, Op1, Lo, Hi); // Replace the ADDs' nodes uses by the MLA node's values. SDValue HiMLALResult(SMLAL.getNode(), 1); SDValue LoMLALResult(SMLAL.getNode(), 0); DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult); DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult); // Return original node to notify the driver to stop replacing. SDValue resNode(AddcNode, 0); return resNode; } static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Look for multiply add opportunities. // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where // each add nodes consumes a value from ISD::UMUL_LOHI and there is // a glue link from the first add to the second add. // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by // a S/UMLAL instruction. // UMUL_LOHI // / :lo \ :hi // V \ [no multiline comment] // loAdd -> ADDC | // \ :carry / // V V // ADDE <- hiAdd // // In the special case where only the higher part of a signed result is used // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts // a constant with the exact value of 0x80000000, we recognize we are dealing // with a "rounded multiply and add" (or subtract) and transform it into // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively. assert((AddeSubeNode->getOpcode() == ARMISD::ADDE || AddeSubeNode->getOpcode() == ARMISD::SUBE) && "Expect an ADDE or SUBE"); assert(AddeSubeNode->getNumOperands() == 3 && AddeSubeNode->getOperand(2).getValueType() == MVT::i32 && "ADDE node has the wrong inputs"); // Check that we are chained to the right ADDC or SUBC node. SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode(); if ((AddeSubeNode->getOpcode() == ARMISD::ADDE && AddcSubcNode->getOpcode() != ARMISD::ADDC) || (AddeSubeNode->getOpcode() == ARMISD::SUBE && AddcSubcNode->getOpcode() != ARMISD::SUBC)) return SDValue(); SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0); SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1); // Check if the two operands are from the same mul_lohi node. if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode()) return SDValue(); assert(AddcSubcNode->getNumValues() == 2 && AddcSubcNode->getValueType(0) == MVT::i32 && "Expect ADDC with two result values. First: i32"); // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it // maybe a SMLAL which multiplies two 16-bit values. if (AddeSubeNode->getOpcode() == ARMISD::ADDE && AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI && AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI && AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI && AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI) return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget); // Check for the triangle shape. SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0); SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1); // Make sure that the ADDE/SUBE operands are not coming from the same node. if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode()) return SDValue(); // Find the MUL_LOHI node walking up ADDE/SUBE's operands. bool IsLeftOperandMUL = false; SDValue MULOp = findMUL_LOHI(AddeSubeOp0); if (MULOp == SDValue()) MULOp = findMUL_LOHI(AddeSubeOp1); else IsLeftOperandMUL = true; if (MULOp == SDValue()) return SDValue(); // Figure out the right opcode. unsigned Opc = MULOp->getOpcode(); unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL; // Figure out the high and low input values to the MLAL node. SDValue *HiAddSub = nullptr; SDValue *LoMul = nullptr; SDValue *LowAddSub = nullptr; // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI. if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1))) return SDValue(); if (IsLeftOperandMUL) HiAddSub = &AddeSubeOp1; else HiAddSub = &AddeSubeOp0; // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node // whose low result is fed to the ADDC/SUBC we are checking. if (AddcSubcOp0 == MULOp.getValue(0)) { LoMul = &AddcSubcOp0; LowAddSub = &AddcSubcOp1; } if (AddcSubcOp1 == MULOp.getValue(0)) { LoMul = &AddcSubcOp1; LowAddSub = &AddcSubcOp0; } if (!LoMul) return SDValue(); // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC // the replacement below will create a cycle. if (AddcSubcNode == HiAddSub->getNode() || AddcSubcNode->isPredecessorOf(HiAddSub->getNode())) return SDValue(); // Create the merged node. SelectionDAG &DAG = DCI.DAG; // Start building operand list. SmallVector Ops; Ops.push_back(LoMul->getOperand(0)); Ops.push_back(LoMul->getOperand(1)); // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be // the case, we must be doing signed multiplication and only use the higher // part of the result of the MLAL, furthermore the LowAddSub must be a constant // addition or subtraction with the value of 0x800000. if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() && FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) && LowAddSub->getNode()->getOpcode() == ISD::Constant && static_cast(LowAddSub->getNode())->getZExtValue() == 0x80000000) { Ops.push_back(*HiAddSub); if (AddcSubcNode->getOpcode() == ARMISD::SUBC) { FinalOpc = ARMISD::SMMLSR; } else { FinalOpc = ARMISD::SMMLAR; } SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops); DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode); return SDValue(AddeSubeNode, 0); } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC) // SMMLS is generated during instruction selection and the rest of this // function can not handle the case where AddcSubcNode is a SUBC. return SDValue(); // Finish building the operand list for {U/S}MLAL Ops.push_back(*LowAddSub); Ops.push_back(*HiAddSub); SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), DAG.getVTList(MVT::i32, MVT::i32), Ops); // Replace the ADDs' nodes uses by the MLA node's values. SDValue HiMLALResult(MLALNode.getNode(), 1); DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult); SDValue LoMLALResult(MLALNode.getNode(), 0); DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult); // Return original node to notify the driver to stop replacing. return SDValue(AddeSubeNode, 0); } static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // UMAAL is similar to UMLAL except that it adds two unsigned values. // While trying to combine for the other MLAL nodes, first search for the // chance to use UMAAL. Check if Addc uses a node which has already // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde // as the addend, and it's handled in PerformUMLALCombine. if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); // Check that we have a glued ADDC node. SDNode* AddcNode = AddeNode->getOperand(2).getNode(); if (AddcNode->getOpcode() != ARMISD::ADDC) return SDValue(); // Find the converted UMAAL or quit if it doesn't exist. SDNode *UmlalNode = nullptr; SDValue AddHi; if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) { UmlalNode = AddcNode->getOperand(0).getNode(); AddHi = AddcNode->getOperand(1); } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) { UmlalNode = AddcNode->getOperand(1).getNode(); AddHi = AddcNode->getOperand(0); } else { return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget); } // The ADDC should be glued to an ADDE node, which uses the same UMLAL as // the ADDC as well as Zero. if (!isNullConstant(UmlalNode->getOperand(3))) return SDValue(); if ((isNullConstant(AddeNode->getOperand(0)) && AddeNode->getOperand(1).getNode() == UmlalNode) || (AddeNode->getOperand(0).getNode() == UmlalNode && isNullConstant(AddeNode->getOperand(1)))) { SelectionDAG &DAG = DCI.DAG; SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1), UmlalNode->getOperand(2), AddHi }; SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode), DAG.getVTList(MVT::i32, MVT::i32), Ops); // Replace the ADDs' nodes uses by the UMAAL node's values. DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0)); // Return original node to notify the driver to stop replacing. return SDValue(AddeNode, 0); } return SDValue(); } static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP()) return SDValue(); // Check that we have a pair of ADDC and ADDE as operands. // Both addends of the ADDE must be zero. SDNode* AddcNode = N->getOperand(2).getNode(); SDNode* AddeNode = N->getOperand(3).getNode(); if ((AddcNode->getOpcode() == ARMISD::ADDC) && (AddeNode->getOpcode() == ARMISD::ADDE) && isNullConstant(AddeNode->getOperand(0)) && isNullConstant(AddeNode->getOperand(1)) && (AddeNode->getOperand(2).getNode() == AddcNode)) return DAG.getNode(ARMISD::UMAAL, SDLoc(N), DAG.getVTList(MVT::i32, MVT::i32), {N->getOperand(0), N->getOperand(1), AddcNode->getOperand(0), AddcNode->getOperand(1)}); else return SDValue(); } static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { SelectionDAG &DAG(DCI.DAG); if (N->getOpcode() == ARMISD::SUBC) { // (SUBC (ADDE 0, 0, C), 1) -> C SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if (LHS->getOpcode() == ARMISD::ADDE && isNullConstant(LHS->getOperand(0)) && isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) { return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); } } if (Subtarget->isThumb1Only()) { SDValue RHS = N->getOperand(1); if (ConstantSDNode *C = dyn_cast(RHS)) { int32_t imm = C->getSExtValue(); if (imm < 0 && imm > std::numeric_limits::min()) { SDLoc DL(N); RHS = DAG.getConstant(-imm, DL, MVT::i32); unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC : ARMISD::ADDC; return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS); } } } return SDValue(); } static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { if (Subtarget->isThumb1Only()) { SelectionDAG &DAG = DCI.DAG; SDValue RHS = N->getOperand(1); if (ConstantSDNode *C = dyn_cast(RHS)) { int64_t imm = C->getSExtValue(); if (imm < 0) { SDLoc DL(N); // The with-carry-in form matches bitwise not instead of the negation. // Effectively, the inverse interpretation of the carry flag already // accounts for part of the negation. RHS = DAG.getConstant(~imm, DL, MVT::i32); unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE : ARMISD::ADDE; return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS, N->getOperand(2)); } } } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) { return AddCombineTo64bitMLAL(N, DCI, Subtarget); } return SDValue(); } /// PerformADDECombine - Target-specific dag combine transform from /// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or /// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Only ARM and Thumb2 support UMLAL/SMLAL. if (Subtarget->isThumb1Only()) return PerformAddeSubeCombine(N, DCI, Subtarget); // Only perform the checks after legalize when the pattern is available. if (DCI.isBeforeLegalize()) return SDValue(); return AddCombineTo64bitUMAAL(N, DCI, Subtarget); } /// PerformADDCombineWithOperands - Try DAG combinations for an ADD with /// operands N0 and N1. This is a helper for PerformADDCombine that is /// called with the default operands, and if that fails, with commuted /// operands. static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget){ // Attempt to create vpadd for this add. if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget)) return Result; // Attempt to create vpaddl for this add. if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget)) return Result; if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, Subtarget)) return Result; // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c)) if (N0.getNode()->hasOneUse()) if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI)) return Result; return SDValue(); } +bool +ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N, + CombineLevel Level) const { + if (Level == BeforeLegalizeTypes) + return true; + + if (Subtarget->isThumb() && Subtarget->isThumb1Only()) + return true; + + if (N->getOpcode() != ISD::SHL) + return true; + + // Turn off commute-with-shift transform after legalization, so it doesn't + // conflict with PerformSHLSimplify. (We could try to detect when + // PerformSHLSimplify would trigger more precisely, but it isn't + // really necessary.) + return false; +} + static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST) { // Allow the generic combiner to identify potential bswaps. if (DCI.isBeforeLegalize()) return SDValue(); // DAG combiner will fold: // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2) // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2 // Other code patterns that can be also be modified have the following form: // b + ((a << 1) | 510) // b + ((a << 1) & 510) // b + ((a << 1) ^ 510) // b + ((a << 1) + 510) // Many instructions can perform the shift for free, but it requires both // the operands to be registers. If c1 << c2 is too large, a mov immediate // instruction will needed. So, unfold back to the original pattern if: // - if c1 and c2 are small enough that they don't require mov imms. // - the user(s) of the node can perform an shl // No shifted operands for 16-bit instructions. if (ST->isThumb() && ST->isThumb1Only()) return SDValue(); // Check that all the users could perform the shl themselves. for (auto U : N->uses()) { switch(U->getOpcode()) { default: return SDValue(); case ISD::SUB: case ISD::ADD: case ISD::AND: case ISD::OR: case ISD::XOR: case ISD::SETCC: case ARMISD::CMP: // Check that the user isn't already using a constant because there // aren't any instructions that support an immediate operand and a // shifted operand. if (isa(U->getOperand(0)) || isa(U->getOperand(1))) return SDValue(); // Check that it's not already using a shift. if (U->getOperand(0).getOpcode() == ISD::SHL || U->getOperand(1).getOpcode() == ISD::SHL) return SDValue(); break; } } if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR && N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND) return SDValue(); if (N->getOperand(0).getOpcode() != ISD::SHL) return SDValue(); SDValue SHL = N->getOperand(0); auto *C1ShlC2 = dyn_cast(N->getOperand(1)); auto *C2 = dyn_cast(SHL.getOperand(1)); if (!C1ShlC2 || !C2) return SDValue(); APInt C2Int = C2->getAPIntValue(); APInt C1Int = C1ShlC2->getAPIntValue(); // Check that performing a lshr will not lose any information. APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(), C2Int.getBitWidth() - C2->getZExtValue()); if ((C1Int & Mask) != C1Int) return SDValue(); // Shift the first constant. C1Int.lshrInPlace(C2Int); // The immediates are encoded as an 8-bit value that can be rotated. auto LargeImm = [](const APInt &Imm) { unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros(); return Imm.getBitWidth() - Zeros > 8; }; if (LargeImm(C1Int) || LargeImm(C2Int)) return SDValue(); SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); SDValue X = SHL.getOperand(0); SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X, DAG.getConstant(C1Int, dl, MVT::i32)); // Shift left to compensate for the lshr of C1Int. SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1)); LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump(); SHL.dump(); N->dump()); LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump()); - - DAG.ReplaceAllUsesWith(SDValue(N, 0), Res); - return SDValue(N, 0); + return Res; } /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD. /// static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Only works one way, because it needs an immediate operand. if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) return Result; // First try with the default operand order. if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget)) return Result; // If that didn't work, try again with the operands commuted. return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget); } /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB. /// static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) if (N1.getNode()->hasOneUse()) if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI)) return Result; return SDValue(); } /// PerformVMULCombine /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the /// special multiplier accumulator forwarding. /// vmul d3, d0, d2 /// vmla d3, d1, d2 /// is faster than /// vadd d3, d0, d1 /// vmul d3, d3, d2 // However, for (A + B) * (A + B), // vadd d2, d0, d1 // vmul d3, d0, d2 // vmla d3, d1, d2 // is slower than // vadd d2, d0, d1 // vmul d3, d2, d2 static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { if (!Subtarget->hasVMLxForwarding()) return SDValue(); SelectionDAG &DAG = DCI.DAG; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); unsigned Opcode = N0.getOpcode(); if (Opcode != ISD::ADD && Opcode != ISD::SUB && Opcode != ISD::FADD && Opcode != ISD::FSUB) { Opcode = N1.getOpcode(); if (Opcode != ISD::ADD && Opcode != ISD::SUB && Opcode != ISD::FADD && Opcode != ISD::FSUB) return SDValue(); std::swap(N0, N1); } if (N0 == N1) return SDValue(); EVT VT = N->getValueType(0); SDLoc DL(N); SDValue N00 = N0->getOperand(0); SDValue N01 = N0->getOperand(1); return DAG.getNode(Opcode, DL, VT, DAG.getNode(ISD::MUL, DL, VT, N00, N1), DAG.getNode(ISD::MUL, DL, VT, N01, N1)); } static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; if (Subtarget->isThumb1Only()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); EVT VT = N->getValueType(0); if (VT.is64BitVector() || VT.is128BitVector()) return PerformVMULCombine(N, DCI, Subtarget); if (VT != MVT::i32) return SDValue(); ConstantSDNode *C = dyn_cast(N->getOperand(1)); if (!C) return SDValue(); int64_t MulAmt = C->getSExtValue(); unsigned ShiftAmt = countTrailingZeros(MulAmt); ShiftAmt = ShiftAmt & (32 - 1); SDValue V = N->getOperand(0); SDLoc DL(N); SDValue Res; MulAmt >>= ShiftAmt; if (MulAmt >= 0) { if (isPowerOf2_32(MulAmt - 1)) { // (mul x, 2^N + 1) => (add (shl x, N), x) Res = DAG.getNode(ISD::ADD, DL, VT, V, DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Log2_32(MulAmt - 1), DL, MVT::i32))); } else if (isPowerOf2_32(MulAmt + 1)) { // (mul x, 2^N - 1) => (sub (shl x, N), x) Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Log2_32(MulAmt + 1), DL, MVT::i32)), V); } else return SDValue(); } else { uint64_t MulAmtAbs = -MulAmt; if (isPowerOf2_32(MulAmtAbs + 1)) { // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) Res = DAG.getNode(ISD::SUB, DL, VT, V, DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Log2_32(MulAmtAbs + 1), DL, MVT::i32))); } else if (isPowerOf2_32(MulAmtAbs - 1)) { // (mul x, -(2^N + 1)) => - (add (shl x, N), x) Res = DAG.getNode(ISD::ADD, DL, VT, V, DAG.getNode(ISD::SHL, DL, VT, V, DAG.getConstant(Log2_32(MulAmtAbs - 1), DL, MVT::i32))); Res = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, MVT::i32), Res); } else return SDValue(); } if (ShiftAmt != 0) Res = DAG.getNode(ISD::SHL, DL, VT, Res, DAG.getConstant(ShiftAmt, DL, MVT::i32)); // Do not add new nodes to DAG combiner worklist. DCI.CombineTo(N, Res, false); return SDValue(); } static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Allow DAGCombine to pattern-match before we touch the canonical form. if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); if (N->getValueType(0) != MVT::i32) return SDValue(); ConstantSDNode *N1C = dyn_cast(N->getOperand(1)); if (!N1C) return SDValue(); uint32_t C1 = (uint32_t)N1C->getZExtValue(); // Don't transform uxtb/uxth. if (C1 == 255 || C1 == 65535) return SDValue(); SDNode *N0 = N->getOperand(0).getNode(); if (!N0->hasOneUse()) return SDValue(); if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL) return SDValue(); bool LeftShift = N0->getOpcode() == ISD::SHL; ConstantSDNode *N01C = dyn_cast(N0->getOperand(1)); if (!N01C) return SDValue(); uint32_t C2 = (uint32_t)N01C->getZExtValue(); if (!C2 || C2 >= 32) return SDValue(); SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); // We have a pattern of the form "(and (shl x, c2) c1)" or // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to // transform to a pair of shifts, to save materializing c1. // First pattern: right shift, and c1+1 is a power of two. // FIXME: Also check reversed pattern (left shift, and ~c1+1 is a power // of two). // FIXME: Use demanded bits? if (!LeftShift && isMask_32(C1)) { uint32_t C3 = countLeadingZeros(C1); if (C2 < C3) { SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), DAG.getConstant(C3 - C2, DL, MVT::i32)); return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, DAG.getConstant(C3, DL, MVT::i32)); } } // Second pattern: left shift, and (c1>>c2)+1 is a power of two. // FIXME: Also check reversed pattern (right shift, and ~(c1<> C3)) { SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0), DAG.getConstant(C2 + C3, DL, MVT::i32)); return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL, DAG.getConstant(C3, DL, MVT::i32)); } } // FIXME: Transform "(and (shl x, c2) c1)" -> // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than // c1. return SDValue(); } static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Attempt to use immediate-form VBIC BuildVectorSDNode *BVN = dyn_cast(N->getOperand(1)); SDLoc dl(N); EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VbicVT; SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VbicVT, VT.is128BitVector(), OtherModImm); if (Val.getNode()) { SDValue Input = DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0)); SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vbic); } } } if (!Subtarget->isThumb1Only()) { // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI)) return Result; if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) return Result; } if (Subtarget->isThumb1Only()) if (SDValue Result = CombineANDShift(N, DCI, Subtarget)) return Result; return SDValue(); } // Try combining OR nodes to SMULWB, SMULWT. static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { if (!Subtarget->hasV6Ops() || (Subtarget->isThumb() && (!Subtarget->hasThumb2() || !Subtarget->hasDSP()))) return SDValue(); SDValue SRL = OR->getOperand(0); SDValue SHL = OR->getOperand(1); if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) { SRL = OR->getOperand(1); SHL = OR->getOperand(0); } if (!isSRL16(SRL) || !isSHL16(SHL)) return SDValue(); // The first operands to the shifts need to be the two results from the // same smul_lohi node. if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) || SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI) return SDValue(); SDNode *SMULLOHI = SRL.getOperand(0).getNode(); if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) || SHL.getOperand(0) != SDValue(SMULLOHI, 1)) return SDValue(); // Now we have: // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))) // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments. // For SMUWB the 16-bit value will signed extended somehow. // For SMULWT only the SRA is required. // Check both sides of SMUL_LOHI SDValue OpS16 = SMULLOHI->getOperand(0); SDValue OpS32 = SMULLOHI->getOperand(1); SelectionDAG &DAG = DCI.DAG; if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) { OpS16 = OpS32; OpS32 = SMULLOHI->getOperand(0); } SDLoc dl(OR); unsigned Opcode = 0; if (isS16(OpS16, DAG)) Opcode = ARMISD::SMULWB; else if (isSRA16(OpS16)) { Opcode = ARMISD::SMULWT; OpS16 = OpS16->getOperand(0); } else return SDValue(); SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16); DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res); return SDValue(OR, 0); } static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // BFI is only available on V6T2+ if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops()) return SDValue(); EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); // 1) or (and A, mask), val => ARMbfi A, val, mask // iff (val & mask) == val // // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2) // && mask == ~mask2 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2) // && ~mask == mask2 // (i.e., copy a bitfield value into another bitfield of the same width) if (VT != MVT::i32) return SDValue(); SDValue N00 = N0.getOperand(0); // The value and the mask need to be constants so we can verify this is // actually a bitfield set. If the mask is 0xffff, we can do better // via a movt instruction, so don't use BFI in that case. SDValue MaskOp = N0.getOperand(1); ConstantSDNode *MaskC = dyn_cast(MaskOp); if (!MaskC) return SDValue(); unsigned Mask = MaskC->getZExtValue(); if (Mask == 0xffff) return SDValue(); SDValue Res; // Case (1): or (and A, mask), val => ARMbfi A, val, mask ConstantSDNode *N1C = dyn_cast(N1); if (N1C) { unsigned Val = N1C->getZExtValue(); if ((Val & ~Mask) != Val) return SDValue(); if (ARM::isBitFieldInvertedMask(Mask)) { Val >>= countTrailingZeros(~Mask); Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, DAG.getConstant(Val, DL, MVT::i32), DAG.getConstant(Mask, DL, MVT::i32)); DCI.CombineTo(N, Res, false); // Return value from the original node to inform the combiner than N is // now dead. return SDValue(N, 0); } } else if (N1.getOpcode() == ISD::AND) { // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N11C) return SDValue(); unsigned Mask2 = N11C->getZExtValue(); // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern // as is to match. if (ARM::isBitFieldInvertedMask(Mask) && (Mask == ~Mask2)) { // The pack halfword instruction works better for masks that fit it, // so use that when it's available. if (Subtarget->hasDSP() && (Mask == 0xffff || Mask == 0xffff0000)) return SDValue(); // 2a unsigned amt = countTrailingZeros(Mask2); Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0), DAG.getConstant(amt, DL, MVT::i32)); Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res, DAG.getConstant(Mask, DL, MVT::i32)); DCI.CombineTo(N, Res, false); // Return value from the original node to inform the combiner than N is // now dead. return SDValue(N, 0); } else if (ARM::isBitFieldInvertedMask(~Mask) && (~Mask == Mask2)) { // The pack halfword instruction works better for masks that fit it, // so use that when it's available. if (Subtarget->hasDSP() && (Mask2 == 0xffff || Mask2 == 0xffff0000)) return SDValue(); // 2b unsigned lsb = countTrailingZeros(Mask); Res = DAG.getNode(ISD::SRL, DL, VT, N00, DAG.getConstant(lsb, DL, MVT::i32)); Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res, DAG.getConstant(Mask2, DL, MVT::i32)); DCI.CombineTo(N, Res, false); // Return value from the original node to inform the combiner than N is // now dead. return SDValue(N, 0); } } if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) && N00.getOpcode() == ISD::SHL && isa(N00.getOperand(1)) && ARM::isBitFieldInvertedMask(~Mask)) { // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask // where lsb(mask) == #shamt and masked bits of B are known zero. SDValue ShAmt = N00.getOperand(1); unsigned ShAmtC = cast(ShAmt)->getZExtValue(); unsigned LSB = countTrailingZeros(Mask); if (ShAmtC != LSB) return SDValue(); Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0), DAG.getConstant(~Mask, DL, MVT::i32)); DCI.CombineTo(N, Res, false); // Return value from the original node to inform the combiner than N is // now dead. return SDValue(N, 0); } return SDValue(); } /// PerformORCombine - Target-specific dag combine xforms for ISD::OR static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // Attempt to use immediate-form VORR BuildVectorSDNode *BVN = dyn_cast(N->getOperand(1)); SDLoc dl(N); EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN && Subtarget->hasNEON() && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VorrVT; SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VorrVT, VT.is128BitVector(), OtherModImm); if (Val.getNode()) { SDValue Input = DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0)); SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vorr); } } } if (!Subtarget->isThumb1Only()) { // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c)) if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) return Result; if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget)) return Result; } SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant. if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) { // The code below optimizes (or (and X, Y), Z). // The AND operand needs to have a single user to make these optimizations // profitable. if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) return SDValue(); APInt SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; APInt SplatBits0, SplatBits1; BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(1)); BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(1)); // Ensure that the second operand of both ands are constants if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize, HasAnyUndefs) && !HasAnyUndefs) { if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize, HasAnyUndefs) && !HasAnyUndefs) { // Ensure that the bit width of the constants are the same and that // the splat arguments are logical inverses as per the pattern we // are trying to simplify. if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() && SplatBits0 == ~SplatBits1) { // Canonicalize the vector type to make instruction selection // simpler. EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32; SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT, N0->getOperand(1), N0->getOperand(0), N1->getOperand(0)); return DAG.getNode(ISD::BITCAST, dl, VT, Result); } } } } // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when // reasonable. if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget)) return Res; } if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) return Result; return SDValue(); } static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; if(!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); if (!Subtarget->isThumb1Only()) { // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c)) if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI)) return Result; if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget)) return Result; } return SDValue(); } // ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it, // and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and // their position in "to" (Rd). static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) { assert(N->getOpcode() == ARMISD::BFI); SDValue From = N->getOperand(1); ToMask = ~cast(N->getOperand(2))->getAPIntValue(); FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation()); // If the Base came from a SHR #C, we can deduce that it is really testing bit // #C in the base of the SHR. if (From->getOpcode() == ISD::SRL && isa(From->getOperand(1))) { APInt Shift = cast(From->getOperand(1))->getAPIntValue(); assert(Shift.getLimitedValue() < 32 && "Shift too large!"); FromMask <<= Shift.getLimitedValue(31); From = From->getOperand(0); } return From; } // If A and B contain one contiguous set of bits, does A | B == A . B? // // Neither A nor B must be zero. static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) { unsigned LastActiveBitInA = A.countTrailingZeros(); unsigned FirstActiveBitInB = B.getBitWidth() - B.countLeadingZeros() - 1; return LastActiveBitInA - 1 == FirstActiveBitInB; } static SDValue FindBFIToCombineWith(SDNode *N) { // We have a BFI in N. Follow a possible chain of BFIs and find a BFI it can combine with, // if one exists. APInt ToMask, FromMask; SDValue From = ParseBFI(N, ToMask, FromMask); SDValue To = N->getOperand(0); // Now check for a compatible BFI to merge with. We can pass through BFIs that // aren't compatible, but not if they set the same bit in their destination as // we do (or that of any BFI we're going to combine with). SDValue V = To; APInt CombinedToMask = ToMask; while (V.getOpcode() == ARMISD::BFI) { APInt NewToMask, NewFromMask; SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask); if (NewFrom != From) { // This BFI has a different base. Keep going. CombinedToMask |= NewToMask; V = V.getOperand(0); continue; } // Do the written bits conflict with any we've seen so far? if ((NewToMask & CombinedToMask).getBoolValue()) // Conflicting bits - bail out because going further is unsafe. return SDValue(); // Are the new bits contiguous when combined with the old bits? if (BitsProperlyConcatenate(ToMask, NewToMask) && BitsProperlyConcatenate(FromMask, NewFromMask)) return V; if (BitsProperlyConcatenate(NewToMask, ToMask) && BitsProperlyConcatenate(NewFromMask, FromMask)) return V; // We've seen a write to some bits, so track it. CombinedToMask |= NewToMask; // Keep going... V = V.getOperand(0); } return SDValue(); } static SDValue PerformBFICombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue N1 = N->getOperand(1); if (N1.getOpcode() == ISD::AND) { // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff // the bits being cleared by the AND are not demanded by the BFI. ConstantSDNode *N11C = dyn_cast(N1.getOperand(1)); if (!N11C) return SDValue(); unsigned InvMask = cast(N->getOperand(2))->getZExtValue(); unsigned LSB = countTrailingZeros(~InvMask); unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB; assert(Width < static_cast(std::numeric_limits::digits) && "undefined behavior"); unsigned Mask = (1u << Width) - 1; unsigned Mask2 = N11C->getZExtValue(); if ((Mask & (~Mask2)) == 0) return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0), N->getOperand(0), N1.getOperand(0), N->getOperand(2)); } else if (N->getOperand(0).getOpcode() == ARMISD::BFI) { // We have a BFI of a BFI. Walk up the BFI chain to see how long it goes. // Keep track of any consecutive bits set that all come from the same base // value. We can combine these together into a single BFI. SDValue CombineBFI = FindBFIToCombineWith(N); if (CombineBFI == SDValue()) return SDValue(); // We've found a BFI. APInt ToMask1, FromMask1; SDValue From1 = ParseBFI(N, ToMask1, FromMask1); APInt ToMask2, FromMask2; SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2); assert(From1 == From2); (void)From2; // First, unlink CombineBFI. DCI.DAG.ReplaceAllUsesWith(CombineBFI, CombineBFI.getOperand(0)); // Then create a new BFI, combining the two together. APInt NewFromMask = FromMask1 | FromMask2; APInt NewToMask = ToMask1 | ToMask2; EVT VT = N->getValueType(0); SDLoc dl(N); if (NewFromMask[0] == 0) From1 = DCI.DAG.getNode( ISD::SRL, dl, VT, From1, DCI.DAG.getConstant(NewFromMask.countTrailingZeros(), dl, VT)); return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), From1, DCI.DAG.getConstant(~NewToMask, dl, VT)); } return SDValue(); } /// PerformVMOVRRDCombine - Target-specific dag combine xforms for /// ARMISD::VMOVRRD. static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // vmovrrd(vmovdrr x, y) -> x,y SDValue InDouble = N->getOperand(0); if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP()) return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1)); // vmovrrd(load f64) -> (load i32), (load i32) SDNode *InNode = InDouble.getNode(); if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() && InNode->getValueType(0) == MVT::f64 && InNode->getOperand(1).getOpcode() == ISD::FrameIndex && !cast(InNode)->isVolatile()) { // TODO: Should this be done for non-FrameIndex operands? LoadSDNode *LD = cast(InNode); SelectionDAG &DAG = DCI.DAG; SDLoc DL(LD); SDValue BasePtr = LD->getBasePtr(); SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(), LD->getAlignment(), LD->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); SDValue NewLD2 = DAG.getLoad( MVT::i32, DL, NewLD1.getValue(1), OffsetPtr, LD->getPointerInfo(), std::min(4U, LD->getAlignment() / 2), LD->getMemOperand()->getFlags()); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1)); if (DCI.DAG.getDataLayout().isBigEndian()) std::swap (NewLD1, NewLD2); SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2); return Result; } return SDValue(); } /// PerformVMOVDRRCombine - Target-specific dag combine xforms for /// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands. static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) { // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X) SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); if (Op0.getOpcode() == ISD::BITCAST) Op0 = Op0.getOperand(0); if (Op1.getOpcode() == ISD::BITCAST) Op1 = Op1.getOperand(0); if (Op0.getOpcode() == ARMISD::VMOVRRD && Op0.getNode() == Op1.getNode() && Op0.getResNo() == 0 && Op1.getResNo() == 1) return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0.getOperand(0)); return SDValue(); } /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node /// are normal, non-volatile loads. If so, it is profitable to bitcast an /// i64 vector to have f64 elements, since the value can then be loaded /// directly into a VFP register. static bool hasNormalLoadOperand(SDNode *N) { unsigned NumElts = N->getValueType(0).getVectorNumElements(); for (unsigned i = 0; i < NumElts; ++i) { SDNode *Elt = N->getOperand(i).getNode(); if (ISD::isNormalLoad(Elt) && !cast(Elt)->isVolatile()) return true; } return false; } /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for /// ISD::BUILD_VECTOR. static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget) { // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X): // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value // into a pair of GPRs, which is fine when the value is used as a scalar, // but if the i64 value is converted to a vector, we need to undo the VMOVRRD. SelectionDAG &DAG = DCI.DAG; if (N->getNumOperands() == 2) if (SDValue RV = PerformVMOVDRRCombine(N, DAG)) return RV; // Load i64 elements as f64 values so that type legalization does not split // them up into i32 values. EVT VT = N->getValueType(0); if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N)) return SDValue(); SDLoc dl(N); SmallVector Ops; unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 0; i < NumElts; ++i) { SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i)); Ops.push_back(V); // Make the DAGCombiner fold the bitcast. DCI.AddToWorklist(V.getNode()); } EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts); SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops); return DAG.getNode(ISD::BITCAST, dl, VT, BV); } /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR. static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR. // At that time, we may have inserted bitcasts from integer to float. // If these bitcasts have survived DAGCombine, change the lowering of this // BUILD_VECTOR in something more vector friendly, i.e., that does not // force to use floating point types. // Make sure we can change the type of the vector. // This is possible iff: // 1. The vector is only used in a bitcast to a integer type. I.e., // 1.1. Vector is used only once. // 1.2. Use is a bit convert to an integer type. // 2. The size of its operands are 32-bits (64-bits are not legal). EVT VT = N->getValueType(0); EVT EltVT = VT.getVectorElementType(); // Check 1.1. and 2. if (EltVT.getSizeInBits() != 32 || !N->hasOneUse()) return SDValue(); // By construction, the input type must be float. assert(EltVT == MVT::f32 && "Unexpected type!"); // Check 1.2. SDNode *Use = *N->use_begin(); if (Use->getOpcode() != ISD::BITCAST || Use->getValueType(0).isFloatingPoint()) return SDValue(); // Check profitability. // Model is, if more than half of the relevant operands are bitcast from // i32, turn the build_vector into a sequence of insert_vector_elt. // Relevant operands are everything that is not statically // (i.e., at compile time) bitcasted. unsigned NumOfBitCastedElts = 0; unsigned NumElts = VT.getVectorNumElements(); unsigned NumOfRelevantElts = NumElts; for (unsigned Idx = 0; Idx < NumElts; ++Idx) { SDValue Elt = N->getOperand(Idx); if (Elt->getOpcode() == ISD::BITCAST) { // Assume only bit cast to i32 will go away. if (Elt->getOperand(0).getValueType() == MVT::i32) ++NumOfBitCastedElts; } else if (Elt.isUndef() || isa(Elt)) // Constants are statically casted, thus do not count them as // relevant operands. --NumOfRelevantElts; } // Check if more than half of the elements require a non-free bitcast. if (NumOfBitCastedElts <= NumOfRelevantElts / 2) return SDValue(); SelectionDAG &DAG = DCI.DAG; // Create the new vector type. EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); // Check if the type is legal. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isTypeLegal(VecVT)) return SDValue(); // Combine: // ARMISD::BUILD_VECTOR E1, E2, ..., EN. // => BITCAST INSERT_VECTOR_ELT // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1), // (BITCAST EN), N. SDValue Vec = DAG.getUNDEF(VecVT); SDLoc dl(N); for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) { SDValue V = N->getOperand(Idx); if (V.isUndef()) continue; if (V.getOpcode() == ISD::BITCAST && V->getOperand(0).getValueType() == MVT::i32) // Fold obvious case. V = V.getOperand(0); else { V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); // Make the DAGCombiner fold the bitcasts. DCI.AddToWorklist(V.getNode()); } SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx); } Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec); // Make the DAGCombiner fold the bitcasts. DCI.AddToWorklist(Vec.getNode()); return Vec; } /// PerformInsertEltCombine - Target-specific dag combine xforms for /// ISD::INSERT_VECTOR_ELT. static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { // Bitcast an i64 load inserted into a vector to f64. // Otherwise, the i64 value will be legalized to a pair of i32 values. EVT VT = N->getValueType(0); SDNode *Elt = N->getOperand(1).getNode(); if (VT.getVectorElementType() != MVT::i64 || !ISD::isNormalLoad(Elt) || cast(Elt)->isVolatile()) return SDValue(); SelectionDAG &DAG = DCI.DAG; SDLoc dl(N); EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VT.getVectorNumElements()); SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0)); SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1)); // Make the DAGCombiner fold the bitcasts. DCI.AddToWorklist(Vec.getNode()); DCI.AddToWorklist(V.getNode()); SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT, Vec, V, N->getOperand(2)); return DAG.getNode(ISD::BITCAST, dl, VT, InsElt); } /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for /// ISD::VECTOR_SHUFFLE. static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) { // The LLVM shufflevector instruction does not require the shuffle mask // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the // operands do not match the mask length, they are extended by concatenating // them with undef vectors. That is probably the right thing for other // targets, but for NEON it is better to concatenate two double-register // size vector operands into a single quad-register size vector. Do that // transformation here: // shuffle(concat(v1, undef), concat(v2, undef)) -> // shuffle(concat(v1, v2), undef) SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); if (Op0.getOpcode() != ISD::CONCAT_VECTORS || Op1.getOpcode() != ISD::CONCAT_VECTORS || Op0.getNumOperands() != 2 || Op1.getNumOperands() != 2) return SDValue(); SDValue Concat0Op1 = Op0.getOperand(1); SDValue Concat1Op1 = Op1.getOperand(1); if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef()) return SDValue(); // Skip the transformation if any of the types are illegal. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT VT = N->getValueType(0); if (!TLI.isTypeLegal(VT) || !TLI.isTypeLegal(Concat0Op1.getValueType()) || !TLI.isTypeLegal(Concat1Op1.getValueType())) return SDValue(); SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Op0.getOperand(0), Op1.getOperand(0)); // Translate the shuffle mask. SmallVector NewMask; unsigned NumElts = VT.getVectorNumElements(); unsigned HalfElts = NumElts/2; ShuffleVectorSDNode *SVN = cast(N); for (unsigned n = 0; n < NumElts; ++n) { int MaskElt = SVN->getMaskElt(n); int NewElt = -1; if (MaskElt < (int)HalfElts) NewElt = MaskElt; else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts)) NewElt = HalfElts + MaskElt - NumElts; NewMask.push_back(NewElt); } return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat, DAG.getUNDEF(VT), NewMask); } /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, /// NEON load/store intrinsics, and generic vector load/stores, to merge /// base address updates. /// For generic load/stores, the memory type is assumed to be a vector. /// The caller is assumed to have checked legality. static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID || N->getOpcode() == ISD::INTRINSIC_W_CHAIN); const bool isStore = N->getOpcode() == ISD::STORE; const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); SDValue Addr = N->getOperand(AddrOpIdx); MemSDNode *MemN = cast(N); SDLoc dl(N); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // Check that the add is independent of the load/store. Otherwise, folding // it would create a cycle. if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) continue; // Find the new opcode for the updating load/store. bool isLoadOp = true; bool isLaneOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; if (isIntrinsic) { unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default: llvm_unreachable("unexpected intrinsic for Neon base update"); case Intrinsic::arm_neon_vld1: NewOpc = ARMISD::VLD1_UPD; NumVecs = 1; break; case Intrinsic::arm_neon_vld2: NewOpc = ARMISD::VLD2_UPD; NumVecs = 2; break; case Intrinsic::arm_neon_vld3: NewOpc = ARMISD::VLD3_UPD; NumVecs = 3; break; case Intrinsic::arm_neon_vld4: NewOpc = ARMISD::VLD4_UPD; NumVecs = 4; break; case Intrinsic::arm_neon_vld2dup: case Intrinsic::arm_neon_vld3dup: case Intrinsic::arm_neon_vld4dup: // TODO: Support updating VLDxDUP nodes. For now, we just skip // combining base updates for such intrinsics. continue; case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD; NumVecs = 2; isLaneOp = true; break; case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD; NumVecs = 3; isLaneOp = true; break; case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD; NumVecs = 4; isLaneOp = true; break; case Intrinsic::arm_neon_vst1: NewOpc = ARMISD::VST1_UPD; NumVecs = 1; isLoadOp = false; break; case Intrinsic::arm_neon_vst2: NewOpc = ARMISD::VST2_UPD; NumVecs = 2; isLoadOp = false; break; case Intrinsic::arm_neon_vst3: NewOpc = ARMISD::VST3_UPD; NumVecs = 3; isLoadOp = false; break; case Intrinsic::arm_neon_vst4: NewOpc = ARMISD::VST4_UPD; NumVecs = 4; isLoadOp = false; break; case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD; NumVecs = 2; isLoadOp = false; isLaneOp = true; break; case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD; NumVecs = 3; isLoadOp = false; isLaneOp = true; break; case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD; NumVecs = 4; isLoadOp = false; isLaneOp = true; break; } } else { isLaneOp = true; switch (N->getOpcode()) { default: llvm_unreachable("unexpected opcode for Neon base update"); case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break; case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break; case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break; case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break; case ISD::LOAD: NewOpc = ARMISD::VLD1_UPD; NumVecs = 1; isLaneOp = false; break; case ISD::STORE: NewOpc = ARMISD::VST1_UPD; NumVecs = 1; isLaneOp = false; isLoadOp = false; break; } } // Find the size of memory referenced by the load/store. EVT VecTy; if (isLoadOp) { VecTy = N->getValueType(0); } else if (isIntrinsic) { VecTy = N->getOperand(AddrOpIdx+1).getValueType(); } else { assert(isStore && "Node has to be a load, a store, or an intrinsic!"); VecTy = N->getOperand(1).getValueType(); } unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (isLaneOp) NumBytes /= VecTy.getVectorNumElements(); // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); ConstantSDNode *CInc = dyn_cast(Inc.getNode()); if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) { // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two // separate instructions that make it harder to use a non-constant update. continue; } // OK, we found an ADD we can fold into the base update. // Now, create a _UPD node, taking care of not breaking alignment. EVT AlignedVecTy = VecTy; unsigned Alignment = MemN->getAlignment(); // If this is a less-than-standard-aligned load/store, change the type to // match the standard alignment. // The alignment is overlooked when selecting _UPD variants; and it's // easier to introduce bitcasts here than fix that. // There are 3 ways to get to this base-update combine: // - intrinsics: they are assumed to be properly aligned (to the standard // alignment of the memory type), so we don't need to do anything. // - ARMISD::VLDx nodes: they are only generated from the aforementioned // intrinsics, so, likewise, there's nothing to do. // - generic load/store instructions: the alignment is specified as an // explicit operand, rather than implicitly as the standard alignment // of the memory type (like the intrisics). We need to change the // memory type to match the explicit alignment. That way, we don't // generate non-standard-aligned ARMISD::VLDx nodes. if (isa(N)) { if (Alignment == 0) Alignment = 1; if (Alignment < VecTy.getScalarSizeInBits() / 8) { MVT EltTy = MVT::getIntegerVT(Alignment * 8); assert(NumVecs == 1 && "Unexpected multi-element generic load/store."); assert(!isLaneOp && "Unexpected generic load/store lane."); unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8); AlignedVecTy = MVT::getVectorVT(EltTy, NumElts); } // Don't set an explicit alignment on regular load/stores that we want // to transform to VLD/VST 1_UPD nodes. // This matches the behavior of regular load/stores, which only get an // explicit alignment if the MMO alignment is larger than the standard // alignment of the memory type. // Intrinsics, however, always get an explicit alignment, set to the // alignment of the MMO. Alignment = 1; } // Create the new updating load/store node. // First, create an SDVTList for the new updating node's results. EVT Tys[6]; unsigned NumResultVecs = (isLoadOp ? NumVecs : 0); unsigned n; for (n = 0; n < NumResultVecs; ++n) Tys[n] = AlignedVecTy; Tys[n++] = MVT::i32; Tys[n] = MVT::Other; SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2)); // Then, gather the new node's operands. SmallVector Ops; Ops.push_back(N->getOperand(0)); // incoming chain Ops.push_back(N->getOperand(AddrOpIdx)); Ops.push_back(Inc); if (StoreSDNode *StN = dyn_cast(N)) { // Try to match the intrinsic's signature Ops.push_back(StN->getValue()); } else { // Loads (and of course intrinsics) match the intrinsics' signature, // so just add all but the alignment operand. for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i) Ops.push_back(N->getOperand(i)); } // For all node types, the alignment operand is always the last one. Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32)); // If this is a non-standard-aligned STORE, the penultimate operand is the // stored value. Bitcast it to the aligned type. if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) { SDValue &StVal = Ops[Ops.size()-2]; StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal); } EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy; SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT, MemN->getMemOperand()); // Update the uses. SmallVector NewResults; for (unsigned i = 0; i < NumResultVecs; ++i) NewResults.push_back(SDValue(UpdN.getNode(), i)); // If this is an non-standard-aligned LOAD, the first result is the loaded // value. Bitcast it to the expected result type. if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) { SDValue &LdVal = NewResults[0]; LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal); } NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); break; } return SDValue(); } static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); return CombineBaseUpdate(N, DCI); } /// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a /// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic /// are also VDUPLANEs. If so, combine them to a vldN-dup operation and /// return true. static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); // vldN-dup instructions only support 64-bit vectors for N > 1. if (!VT.is64BitVector()) return false; // Check if the VDUPLANE operand is a vldN-dup intrinsic. SDNode *VLD = N->getOperand(0).getNode(); if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN) return false; unsigned NumVecs = 0; unsigned NewOpc = 0; unsigned IntNo = cast(VLD->getOperand(1))->getZExtValue(); if (IntNo == Intrinsic::arm_neon_vld2lane) { NumVecs = 2; NewOpc = ARMISD::VLD2DUP; } else if (IntNo == Intrinsic::arm_neon_vld3lane) { NumVecs = 3; NewOpc = ARMISD::VLD3DUP; } else if (IntNo == Intrinsic::arm_neon_vld4lane) { NumVecs = 4; NewOpc = ARMISD::VLD4DUP; } else { return false; } // First check that all the vldN-lane uses are VDUPLANEs and that the lane // numbers match the load. unsigned VLDLaneNo = cast(VLD->getOperand(NumVecs+3))->getZExtValue(); for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); UI != UE; ++UI) { // Ignore uses of the chain result. if (UI.getUse().getResNo() == NumVecs) continue; SDNode *User = *UI; if (User->getOpcode() != ARMISD::VDUPLANE || VLDLaneNo != cast(User->getOperand(1))->getZExtValue()) return false; } // Create the vldN-dup node. EVT Tys[5]; unsigned n; for (n = 0; n < NumVecs; ++n) Tys[n] = VT; Tys[n] = MVT::Other; SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumVecs+1)); SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) }; MemIntrinsicSDNode *VLDMemInt = cast(VLD); SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, VLDMemInt->getMemoryVT(), VLDMemInt->getMemOperand()); // Update the uses. for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end(); UI != UE; ++UI) { unsigned ResNo = UI.getUse().getResNo(); // Ignore uses of the chain result. if (ResNo == NumVecs) continue; SDNode *User = *UI; DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo)); } // Now the vldN-lane intrinsic is dead except for its chain result. // Update uses of the chain. std::vector VLDDupResults; for (unsigned n = 0; n < NumVecs; ++n) VLDDupResults.push_back(SDValue(VLDDup.getNode(), n)); VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs)); DCI.CombineTo(VLD, VLDDupResults); return true; } /// PerformVDUPLANECombine - Target-specific dag combine xforms for /// ARMISD::VDUPLANE. static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SDValue Op = N->getOperand(0); // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation. if (CombineVLDDUP(N, DCI)) return SDValue(N, 0); // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is // redundant. Ignore bit_converts for now; element sizes are checked below. while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM) return SDValue(); // Make sure the VMOV element size is not bigger than the VDUPLANE elements. unsigned EltSize = Op.getScalarValueSizeInBits(); // The canonical VMOV for a zero vector uses a 32-bit element size. unsigned Imm = cast(Op.getOperand(0))->getZExtValue(); unsigned EltBits; if (ARM_AM::decodeNEONModImm(Imm, EltBits) == 0) EltSize = 8; EVT VT = N->getValueType(0); if (EltSize > VT.getScalarSizeInBits()) return SDValue(); return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op); } /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP. static SDValue PerformVDUPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; SDValue Op = N->getOperand(0); // Match VDUP(LOAD) -> VLD1DUP. // We match this pattern here rather than waiting for isel because the // transform is only legal for unindexed loads. LoadSDNode *LD = dyn_cast(Op.getNode()); if (LD && Op.hasOneUse() && LD->isUnindexed() && LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) { SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1), DAG.getConstant(LD->getAlignment(), SDLoc(N), MVT::i32) }; SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other); SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, SDLoc(N), SDTys, Ops, LD->getMemoryVT(), LD->getMemOperand()); DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1)); return VLDDup; } return SDValue(); } static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); // If this is a legal vector load, try to combine it into a VLD1_UPD. if (ISD::isNormalLoad(N) && VT.isVector() && DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) return CombineBaseUpdate(N, DCI); return SDValue(); } /// PerformSTORECombine - Target-specific dag combine xforms for /// ISD::STORE. static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { StoreSDNode *St = cast(N); if (St->isVolatile()) return SDValue(); // Optimize trunc store (of multiple scalars) to shuffle and store. First, // pack all of the elements in one place. Next, store to memory in fewer // chunks. SDValue StVal = St->getValue(); EVT VT = StVal.getValueType(); if (St->isTruncatingStore() && VT.isVector()) { SelectionDAG &DAG = DCI.DAG; const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT StVT = St->getMemoryVT(); unsigned NumElems = VT.getVectorNumElements(); assert(StVT != VT && "Cannot truncate to the same type"); unsigned FromEltSz = VT.getScalarSizeInBits(); unsigned ToEltSz = StVT.getScalarSizeInBits(); // From, To sizes and ElemCount must be pow of two if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue(); // We are going to use the original vector elt for storing. // Accumulated smaller vector elements must be a multiple of the store size. if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue(); unsigned SizeRatio = FromEltSz / ToEltSz; assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits()); // Create a type on which we perform the shuffle. EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(), NumElems*SizeRatio); assert(WideVecVT.getSizeInBits() == VT.getSizeInBits()); SDLoc DL(St); SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal); SmallVector ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1 : i * SizeRatio; // Can't shuffle using an illegal type. if (!TLI.isTypeLegal(WideVecVT)) return SDValue(); SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec); // At this point all of the data is stored at the bottom of the // register. We now need to save it to mem. // Find the largest store unit MVT StoreType = MVT::i8; for (MVT Tp : MVT::integer_valuetypes()) { if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz) StoreType = Tp; } // Didn't find a legal store type. if (!TLI.isTypeLegal(StoreType)) return SDValue(); // Bitcast the original vector into a vector of store-size units EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(), StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits()); assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits()); SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff); SmallVector Chains; SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL, TLI.getPointerTy(DAG.getDataLayout())); SDValue BasePtr = St->getBasePtr(); // Perform one or more big stores into memory. unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits(); for (unsigned I = 0; I < E; I++) { SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType, ShuffWide, DAG.getIntPtrConstant(I, DL)); SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment); Chains.push_back(Ch); } return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains); } if (!ISD::isNormalStore(St)) return SDValue(); // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and // ARM stores of arguments in the same cache line. if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR && StVal.getNode()->hasOneUse()) { SelectionDAG &DAG = DCI.DAG; bool isBigEndian = DAG.getDataLayout().isBigEndian(); SDLoc DL(St); SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore( St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0), BasePtr, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr, DAG.getConstant(4, DL, MVT::i32)); return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(isBigEndian ? 0 : 1), OffsetPtr, St->getPointerInfo(), std::min(4U, St->getAlignment() / 2), St->getMemOperand()->getFlags()); } if (StVal.getValueType() == MVT::i64 && StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) { // Bitcast an i64 store extracted from a vector to f64. // Otherwise, the i64 value will be legalized to a pair of i32 values. SelectionDAG &DAG = DCI.DAG; SDLoc dl(StVal); SDValue IntVec = StVal.getOperand(0); EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, IntVec.getValueType().getVectorNumElements()); SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec); SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Vec, StVal.getOperand(1)); dl = SDLoc(N); SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt); // Make the DAGCombiner fold the bitcasts. DCI.AddToWorklist(Vec.getNode()); DCI.AddToWorklist(ExtElt.getNode()); DCI.AddToWorklist(V.getNode()); return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(), St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags(), St->getAAInfo()); } // If this is a legal vector store, try to combine it into a VST1_UPD. if (ISD::isNormalStore(N) && VT.isVector() && DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT)) return CombineBaseUpdate(N, DCI); return SDValue(); } /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) /// can replace combinations of VMUL and VCVT (floating-point to integer) /// when the VMUL has a constant operand that is a power of 2. /// /// Example (assume d17 = ): /// vmul.f32 d16, d17, d16 /// vcvt.s32.f32 d16, d16 /// becomes: /// vcvt.s32.f32 d16, d16, #3 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL) return SDValue(); SDValue ConstVec = Op->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT FloatTy = Op.getSimpleValueType().getVectorElementType(); uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = N->getSimpleValueType(0).getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from f32 to i32. We can handle // smaller integers by generating an extra truncate, but larger ones would // be lossy. We also can't handle more then 4 lanes, since these intructions // only support v2i32/v4i32 types. return SDValue(); } BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); if (C == -1 || C == 0 || C > 32) return SDValue(); SDLoc dl(N); bool isSigned = N->getOpcode() == ISD::FP_TO_SINT; unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs : Intrinsic::arm_neon_vcvtfp2fxu; SDValue FixConv = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0), DAG.getConstant(C, dl, MVT::i32)); if (IntBits < FloatBits) FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv); return FixConv; } /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD) /// can replace combinations of VCVT (integer to floating-point) and VDIV /// when the VDIV has a constant operand that is a power of 2. /// /// Example (assume d17 = ): /// vcvt.f32.s32 d16, d16 /// vdiv.f32 d16, d17, d16 /// becomes: /// vcvt.f32.s32 d16, d16, #3 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { if (!Subtarget->hasNEON()) return SDValue(); SDValue Op = N->getOperand(0); unsigned OpOpcode = Op.getNode()->getOpcode(); if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() || (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP)) return SDValue(); SDValue ConstVec = N->getOperand(1); if (!isa(ConstVec)) return SDValue(); MVT FloatTy = N->getSimpleValueType(0).getVectorElementType(); uint32_t FloatBits = FloatTy.getSizeInBits(); MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType(); uint32_t IntBits = IntTy.getSizeInBits(); unsigned NumLanes = Op.getValueType().getVectorNumElements(); if (FloatBits != 32 || IntBits > 32 || NumLanes > 4) { // These instructions only exist converting from i32 to f32. We can handle // smaller integers by generating an extra extend, but larger ones would // be lossy. We also can't handle more then 4 lanes, since these intructions // only support v2i32/v4i32 types. return SDValue(); } BitVector UndefElements; BuildVectorSDNode *BV = cast(ConstVec); int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33); if (C == -1 || C == 0 || C > 32) return SDValue(); SDLoc dl(N); bool isSigned = OpOpcode == ISD::SINT_TO_FP; SDValue ConvInput = Op.getOperand(0); if (IntBits < FloatBits) ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput); unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp : Intrinsic::arm_neon_vcvtfxu2fp; return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), ConvInput, DAG.getConstant(C, dl, MVT::i32)); } /// Getvshiftimm - Check if this is a valid build_vector for the immediate /// operand of a vector shift operation, where all the elements of the /// build_vector must have the same constant integer value. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { // Ignore bit_converts. while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, ElementBits) || SplatBitSize > ElementBits) return false; Cnt = SplatBits.getSExtValue(); return true; } /// isVShiftLImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift left operation. That value must be in the range: /// 0 <= Value < ElementBits for a left shift; or /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getScalarSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift right operation. For a shift opcode, the value /// is positive, but for an intrinsic the value count must be negative. The /// absolute value must be in the range: /// 1 <= |Value| <= ElementBits for a right shift; or /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); int64_t ElementBits = VT.getScalarSizeInBits(); if (! getVShiftImm(Op, ElementBits, Cnt)) return false; if (!isIntrinsic) return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits)); if (Cnt >= -(isNarrow ? ElementBits/2 : ElementBits) && Cnt <= -1) { Cnt = -Cnt; return true; } return false; } /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics. static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) { unsigned IntNo = cast(N->getOperand(0))->getZExtValue(); switch (IntNo) { default: // Don't do anything for most intrinsics. break; // Vector shifts: check for immediate versions and lower them. // Note: This is done during DAG combining instead of DAG legalizing because // the build_vectors for 64-bit vector element shift counts are generally // not legal, and it is hard to see their values after they get legalized to // loads from a constant pool. case Intrinsic::arm_neon_vshifts: case Intrinsic::arm_neon_vshiftu: case Intrinsic::arm_neon_vrshifts: case Intrinsic::arm_neon_vrshiftu: case Intrinsic::arm_neon_vrshiftn: case Intrinsic::arm_neon_vqshifts: case Intrinsic::arm_neon_vqshiftu: case Intrinsic::arm_neon_vqshiftsu: case Intrinsic::arm_neon_vqshiftns: case Intrinsic::arm_neon_vqshiftnu: case Intrinsic::arm_neon_vqshiftnsu: case Intrinsic::arm_neon_vqrshiftns: case Intrinsic::arm_neon_vqrshiftnu: case Intrinsic::arm_neon_vqrshiftnsu: { EVT VT = N->getOperand(1).getValueType(); int64_t Cnt; unsigned VShiftOpc = 0; switch (IntNo) { case Intrinsic::arm_neon_vshifts: case Intrinsic::arm_neon_vshiftu: if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) { VShiftOpc = ARMISD::VSHL; break; } if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) { VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRs : ARMISD::VSHRu); break; } return SDValue(); case Intrinsic::arm_neon_vrshifts: case Intrinsic::arm_neon_vrshiftu: if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) break; return SDValue(); case Intrinsic::arm_neon_vqshifts: case Intrinsic::arm_neon_vqshiftu: if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) break; return SDValue(); case Intrinsic::arm_neon_vqshiftsu: if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) break; llvm_unreachable("invalid shift count for vqshlu intrinsic"); case Intrinsic::arm_neon_vrshiftn: case Intrinsic::arm_neon_vqshiftns: case Intrinsic::arm_neon_vqshiftnu: case Intrinsic::arm_neon_vqshiftnsu: case Intrinsic::arm_neon_vqrshiftns: case Intrinsic::arm_neon_vqrshiftnu: case Intrinsic::arm_neon_vqrshiftnsu: // Narrowing shifts require an immediate right shift. if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt)) break; llvm_unreachable("invalid shift count for narrowing vector shift " "intrinsic"); default: llvm_unreachable("unhandled vector shift"); } switch (IntNo) { case Intrinsic::arm_neon_vshifts: case Intrinsic::arm_neon_vshiftu: // Opcode already set above. break; case Intrinsic::arm_neon_vrshifts: VShiftOpc = ARMISD::VRSHRs; break; case Intrinsic::arm_neon_vrshiftu: VShiftOpc = ARMISD::VRSHRu; break; case Intrinsic::arm_neon_vrshiftn: VShiftOpc = ARMISD::VRSHRN; break; case Intrinsic::arm_neon_vqshifts: VShiftOpc = ARMISD::VQSHLs; break; case Intrinsic::arm_neon_vqshiftu: VShiftOpc = ARMISD::VQSHLu; break; case Intrinsic::arm_neon_vqshiftsu: VShiftOpc = ARMISD::VQSHLsu; break; case Intrinsic::arm_neon_vqshiftns: VShiftOpc = ARMISD::VQSHRNs; break; case Intrinsic::arm_neon_vqshiftnu: VShiftOpc = ARMISD::VQSHRNu; break; case Intrinsic::arm_neon_vqshiftnsu: VShiftOpc = ARMISD::VQSHRNsu; break; case Intrinsic::arm_neon_vqrshiftns: VShiftOpc = ARMISD::VQRSHRNs; break; case Intrinsic::arm_neon_vqrshiftnu: VShiftOpc = ARMISD::VQRSHRNu; break; case Intrinsic::arm_neon_vqrshiftnsu: VShiftOpc = ARMISD::VQRSHRNsu; break; } SDLoc dl(N); return DAG.getNode(VShiftOpc, dl, N->getValueType(0), N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32)); } case Intrinsic::arm_neon_vshiftins: { EVT VT = N->getOperand(1).getValueType(); int64_t Cnt; unsigned VShiftOpc = 0; if (isVShiftLImm(N->getOperand(3), VT, false, Cnt)) VShiftOpc = ARMISD::VSLI; else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt)) VShiftOpc = ARMISD::VSRI; else { llvm_unreachable("invalid shift count for vsli/vsri intrinsic"); } SDLoc dl(N); return DAG.getNode(VShiftOpc, dl, N->getValueType(0), N->getOperand(1), N->getOperand(2), DAG.getConstant(Cnt, dl, MVT::i32)); } case Intrinsic::arm_neon_vqrshifts: case Intrinsic::arm_neon_vqrshiftu: // No immediate versions of these to check for. break; } return SDValue(); } /// PerformShiftCombine - Checks for immediate versions of vector shifts and /// lowers them. As with the vector shift intrinsics, this is done during DAG /// combining instead of DAG legalizing because the build_vectors for 64-bit /// vector element shift counts are generally not legal, and it is hard to see /// their values after they get legalized to loads from a constant pool. static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { EVT VT = N->getValueType(0); if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) { // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16. SDValue N1 = N->getOperand(1); if (ConstantSDNode *C = dyn_cast(N1)) { SDValue N0 = N->getOperand(0); if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP && DAG.MaskedValueIsZero(N0.getOperand(0), APInt::getHighBitsSet(32, 16))) return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1); } } // Nothing to be done for scalar shifts. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!VT.isVector() || !TLI.isTypeLegal(VT)) return SDValue(); assert(ST->hasNEON() && "unexpected vector shift"); int64_t Cnt; switch (N->getOpcode()) { default: llvm_unreachable("unexpected shift opcode"); case ISD::SHL: if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) { SDLoc dl(N); return DAG.getNode(ARMISD::VSHL, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); } break; case ISD::SRA: case ISD::SRL: if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) { unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ? ARMISD::VSHRs : ARMISD::VSHRu); SDLoc dl(N); return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), DAG.getConstant(Cnt, dl, MVT::i32)); } } return SDValue(); } /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND. static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST) { SDValue N0 = N->getOperand(0); // Check for sign- and zero-extensions of vector extract operations of 8- // and 16-bit vector elements. NEON supports these directly. They are // handled during DAG combining because type legalization will promote them // to 32-bit types and it is messy to recognize the operations after that. if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { SDValue Vec = N0.getOperand(0); SDValue Lane = N0.getOperand(1); EVT VT = N->getValueType(0); EVT EltVT = N0.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (VT == MVT::i32 && (EltVT == MVT::i8 || EltVT == MVT::i16) && TLI.isTypeLegal(Vec.getValueType()) && isa(Lane)) { unsigned Opc = 0; switch (N->getOpcode()) { default: llvm_unreachable("unexpected opcode"); case ISD::SIGN_EXTEND: Opc = ARMISD::VGETLANEs; break; case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: Opc = ARMISD::VGETLANEu; break; } return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane); } } return SDValue(); } static const APInt *isPowerOf2Constant(SDValue V) { ConstantSDNode *C = dyn_cast(V); if (!C) return nullptr; const APInt *CV = &C->getAPIntValue(); return CV->isPowerOf2() ? CV : nullptr; } SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const { // If we have a CMOV, OR and AND combination such as: // if (x & CN) // y |= CM; // // And: // * CN is a single bit; // * All bits covered by CM are known zero in y // // Then we can convert this into a sequence of BFI instructions. This will // always be a win if CM is a single bit, will always be no worse than the // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is // three bits (due to the extra IT instruction). SDValue Op0 = CMOV->getOperand(0); SDValue Op1 = CMOV->getOperand(1); auto CCNode = cast(CMOV->getOperand(2)); auto CC = CCNode->getAPIntValue().getLimitedValue(); SDValue CmpZ = CMOV->getOperand(4); // The compare must be against zero. if (!isNullConstant(CmpZ->getOperand(1))) return SDValue(); assert(CmpZ->getOpcode() == ARMISD::CMPZ); SDValue And = CmpZ->getOperand(0); if (And->getOpcode() != ISD::AND) return SDValue(); const APInt *AndC = isPowerOf2Constant(And->getOperand(1)); if (!AndC) return SDValue(); SDValue X = And->getOperand(0); if (CC == ARMCC::EQ) { // We're performing an "equal to zero" compare. Swap the operands so we // canonicalize on a "not equal to zero" compare. std::swap(Op0, Op1); } else { assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?"); } if (Op1->getOpcode() != ISD::OR) return SDValue(); ConstantSDNode *OrC = dyn_cast(Op1->getOperand(1)); if (!OrC) return SDValue(); SDValue Y = Op1->getOperand(0); if (Op0 != Y) return SDValue(); // Now, is it profitable to continue? APInt OrCI = OrC->getAPIntValue(); unsigned Heuristic = Subtarget->isThumb() ? 3 : 2; if (OrCI.countPopulation() > Heuristic) return SDValue(); // Lastly, can we determine that the bits defined by OrCI // are zero in Y? KnownBits Known; DAG.computeKnownBits(Y, Known); if ((OrCI & Known.Zero) != OrCI) return SDValue(); // OK, we can do the combine. SDValue V = Y; SDLoc dl(X); EVT VT = X.getValueType(); unsigned BitInX = AndC->logBase2(); if (BitInX != 0) { // We must shift X first. X = DAG.getNode(ISD::SRL, dl, VT, X, DAG.getConstant(BitInX, dl, VT)); } for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits(); BitInY < NumActiveBits; ++BitInY) { if (OrCI[BitInY] == 0) continue; APInt Mask(VT.getSizeInBits(), 0); Mask.setBit(BitInY); V = DAG.getNode(ARMISD::BFI, dl, VT, V, X, // Confusingly, the operand is an *inverted* mask. DAG.getConstant(~Mask, dl, VT)); } return V; } /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND. SDValue ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { SDValue Cmp = N->getOperand(4); if (Cmp.getOpcode() != ARMISD::CMPZ) // Only looking at NE cases. return SDValue(); EVT VT = N->getValueType(0); SDLoc dl(N); SDValue LHS = Cmp.getOperand(0); SDValue RHS = Cmp.getOperand(1); SDValue Chain = N->getOperand(0); SDValue BB = N->getOperand(1); SDValue ARMcc = N->getOperand(2); ARMCC::CondCodes CC = (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0)) // -> (brcond Chain BB CC CPSR Cmp) if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() && LHS->getOperand(0)->getOpcode() == ARMISD::CMOV && LHS->getOperand(0)->hasOneUse()) { auto *LHS00C = dyn_cast(LHS->getOperand(0)->getOperand(0)); auto *LHS01C = dyn_cast(LHS->getOperand(0)->getOperand(1)); auto *LHS1C = dyn_cast(LHS->getOperand(1)); auto *RHSC = dyn_cast(RHS); if ((LHS00C && LHS00C->getZExtValue() == 0) && (LHS01C && LHS01C->getZExtValue() == 1) && (LHS1C && LHS1C->getZExtValue() == 1) && (RHSC && RHSC->getZExtValue() == 0)) { return DAG.getNode( ARMISD::BRCOND, dl, VT, Chain, BB, LHS->getOperand(0)->getOperand(2), LHS->getOperand(0)->getOperand(3), LHS->getOperand(0)->getOperand(4)); } } return SDValue(); } /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV. SDValue ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const { SDValue Cmp = N->getOperand(4); if (Cmp.getOpcode() != ARMISD::CMPZ) // Only looking at EQ and NE cases. return SDValue(); EVT VT = N->getValueType(0); SDLoc dl(N); SDValue LHS = Cmp.getOperand(0); SDValue RHS = Cmp.getOperand(1); SDValue FalseVal = N->getOperand(0); SDValue TrueVal = N->getOperand(1); SDValue ARMcc = N->getOperand(2); ARMCC::CondCodes CC = (ARMCC::CondCodes)cast(ARMcc)->getZExtValue(); // BFI is only available on V6T2+. if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) { SDValue R = PerformCMOVToBFICombine(N, DAG); if (R) return R; } // Simplify // mov r1, r0 // cmp r1, x // mov r0, y // moveq r0, x // to // cmp r0, x // movne r0, y // // mov r1, r0 // cmp r1, x // mov r0, x // movne r0, y // to // cmp r0, x // movne r0, y /// FIXME: Turn this into a target neutral optimization? SDValue Res; if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) { Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, N->getOperand(3), Cmp); } else if (CC == ARMCC::EQ && TrueVal == RHS) { SDValue ARMcc; SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl); Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, N->getOperand(3), NewCmp); } // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0)) // -> (cmov F T CC CPSR Cmp) if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) { auto *LHS0C = dyn_cast(LHS->getOperand(0)); auto *LHS1C = dyn_cast(LHS->getOperand(1)); auto *RHSC = dyn_cast(RHS); if ((LHS0C && LHS0C->getZExtValue() == 0) && (LHS1C && LHS1C->getZExtValue() == 1) && (RHSC && RHSC->getZExtValue() == 0)) { return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, LHS->getOperand(2), LHS->getOperand(3), LHS->getOperand(4)); } } if (!VT.isInteger()) return SDValue(); // Materialize a boolean comparison for integers so we can avoid branching. if (isNullConstant(FalseVal)) { if (CC == ARMCC::EQ && isOneConstant(TrueVal)) { if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) { // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it // right 5 bits will make that 32 be 1, otherwise it will be 0. // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub), DAG.getConstant(5, dl, MVT::i32)); } else { // CMOV 0, 1, ==, (CMPZ x, y) -> // (ADDCARRY (SUB x, y), t:0, t:1) // where t = (SUBCARRY 0, (SUB x, y), 0) // // The SUBCARRY computes 0 - (x - y) and this will give a borrow when // x != y. In other words, a carry C == 1 when x == y, C == 0 // otherwise. // The final ADDCARRY computes // x - y + (0 - (x - y)) + C == C SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); SDVTList VTs = DAG.getVTList(VT, MVT::i32); SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub); // ISD::SUBCARRY returns a borrow but we want the carry here // actually. SDValue Carry = DAG.getNode(ISD::SUB, dl, MVT::i32, DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1)); Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry); } } else if (CC == ARMCC::NE && LHS != RHS && (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) { // This seems pointless but will allow us to combine it further below. // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y) SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc, N->getOperand(3), Cmp); } } else if (isNullConstant(TrueVal)) { if (CC == ARMCC::EQ && LHS != RHS && (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) { // This seems pointless but will allow us to combine it further below // Note that we change == for != as this is the dual for the case above. // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y) SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS); Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal, DAG.getConstant(ARMCC::NE, dl, MVT::i32), N->getOperand(3), Cmp); } } // On Thumb1, the DAG above may be further combined if z is a power of 2 // (z == 2 ^ K). // CMOV (SUB x, y), z, !=, (CMPZ x, y) -> // merge t3, t4 // where t1 = (SUBCARRY (SUB x, y), z, 0) // t2 = (SUBCARRY (SUB x, y), t1:0, t1:1) // t3 = if K != 0 then (SHL t2:0, K) else t2:0 // t4 = (SUB 1, t2:1) [ we want a carry, not a borrow ] const APInt *TrueConst; if (Subtarget->isThumb1Only() && CC == ARMCC::NE && (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) && (FalseVal.getOperand(1) == RHS) && (TrueConst = isPowerOf2Constant(TrueVal))) { SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned ShiftAmount = TrueConst->logBase2(); if (ShiftAmount) TrueVal = DAG.getConstant(1, dl, VT); SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal); Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1)); // Make it a carry, not a borrow. SDValue Carry = DAG.getNode( ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1)); Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry); if (ShiftAmount) Res = DAG.getNode(ISD::SHL, dl, VT, Res, DAG.getConstant(ShiftAmount, dl, MVT::i32)); } if (Res.getNode()) { KnownBits Known; DAG.computeKnownBits(SDValue(N,0), Known); // Capture demanded bits information that would be otherwise lost. if (Known.Zero == 0xfffffffe) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, DAG.getValueType(MVT::i1)); else if (Known.Zero == 0xffffff00) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, DAG.getValueType(MVT::i8)); else if (Known.Zero == 0xffff0000) Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res, DAG.getValueType(MVT::i16)); } return Res; } SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { default: break; case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget); case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget); case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget); case ISD::SUB: return PerformSUBCombine(N, DCI); case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget); case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); case ARMISD::BFI: return PerformBFICombine(N, DCI); case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); case ISD::STORE: return PerformSTORECombine(N, DCI); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG); case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI); case ARMISD::VDUP: return PerformVDUPCombine(N, DCI); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return PerformVCVTCombine(N, DCI.DAG, Subtarget); case ISD::FDIV: return PerformVDIVCombine(N, DCI.DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget); case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget); case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG); case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG); case ISD::LOAD: return PerformLOADCombine(N, DCI); case ARMISD::VLD1DUP: case ARMISD::VLD2DUP: case ARMISD::VLD3DUP: case ARMISD::VLD4DUP: return PerformVLDCombine(N, DCI); case ARMISD::BUILD_VECTOR: return PerformARMBUILD_VECTORCombine(N, DCI); case ARMISD::SMULWB: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) return SDValue(); break; } case ARMISD::SMULWT: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) return SDValue(); break; } case ARMISD::SMLALBB: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16); if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) return SDValue(); break; } case ARMISD::SMLALBT: { unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits(); APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits(); APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) || (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI))) return SDValue(); break; } case ARMISD::SMLALTB: { unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits(); APInt HighMask = APInt::getHighBitsSet(HighWidth, 16); unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits(); APInt LowMask = APInt::getLowBitsSet(LowWidth, 16); if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) || (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI))) return SDValue(); break; } case ARMISD::SMLALTT: { unsigned BitWidth = N->getValueType(0).getSizeInBits(); APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16); if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) || (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))) return SDValue(); break; } case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { case Intrinsic::arm_neon_vld1: case Intrinsic::arm_neon_vld1x2: case Intrinsic::arm_neon_vld1x3: case Intrinsic::arm_neon_vld1x4: case Intrinsic::arm_neon_vld2: case Intrinsic::arm_neon_vld3: case Intrinsic::arm_neon_vld4: case Intrinsic::arm_neon_vld2lane: case Intrinsic::arm_neon_vld3lane: case Intrinsic::arm_neon_vld4lane: case Intrinsic::arm_neon_vld2dup: case Intrinsic::arm_neon_vld3dup: case Intrinsic::arm_neon_vld4dup: case Intrinsic::arm_neon_vst1: case Intrinsic::arm_neon_vst1x2: case Intrinsic::arm_neon_vst1x3: case Intrinsic::arm_neon_vst1x4: case Intrinsic::arm_neon_vst2: case Intrinsic::arm_neon_vst3: case Intrinsic::arm_neon_vst4: case Intrinsic::arm_neon_vst2lane: case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: return PerformVLDCombine(N, DCI); default: break; } break; } return SDValue(); } bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const { return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE); } bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned, unsigned, bool *Fast) const { // Depends what it gets converted into if the type is weird. if (!VT.isSimple()) return false; // The AllowsUnaliged flag models the SCTLR.A setting in ARM cpus bool AllowsUnaligned = Subtarget->allowsUnalignedMem(); switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i8: case MVT::i16: case MVT::i32: { // Unaligned access can use (for example) LRDB, LRDH, LDR if (AllowsUnaligned) { if (Fast) *Fast = Subtarget->hasV7Ops(); return true; } return false; } case MVT::f64: case MVT::v2f64: { // For any little-endian targets with neon, we can support unaligned ld/st // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8. // A big-endian target may also explicitly support unaligned accesses if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) { if (Fast) *Fast = true; return true; } return false; } } } static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, unsigned AlignCheck) { return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && (DstAlign == 0 || DstAlign % AlignCheck == 0)); } EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { const Function &F = MF.getFunction(); // See if we can use NEON instructions for this... if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && !F.hasFnAttribute(Attribute::NoImplicitFloat)) { bool Fast; if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) || (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1, &Fast) && Fast))) { return MVT::v2f64; } else if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) || (allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) && Fast))) { return MVT::f64; } } // Let the target-independent logic figure it out. return MVT::Other; } // 64-bit integers are split into their high and low parts and held in two // different registers, so the trunc is free since the low register can just // be used. bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const { if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy()) return false; unsigned SrcBits = SrcTy->getPrimitiveSizeInBits(); unsigned DestBits = DstTy->getPrimitiveSizeInBits(); return (SrcBits == 64 && DestBits == 32); } bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const { if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() || !DstVT.isInteger()) return false; unsigned SrcBits = SrcVT.getSizeInBits(); unsigned DestBits = DstVT.getSizeInBits(); return (SrcBits == 64 && DestBits == 32); } bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const { if (Val.getOpcode() != ISD::LOAD) return false; EVT VT1 = Val.getValueType(); if (!VT1.isSimple() || !VT1.isInteger() || !VT2.isSimple() || !VT2.isInteger()) return false; switch (VT1.getSimpleVT().SimpleTy) { default: break; case MVT::i1: case MVT::i8: case MVT::i16: // 8-bit and 16-bit loads implicitly zero-extend to 32-bits. return true; } return false; } bool ARMTargetLowering::isFNegFree(EVT VT) const { if (!VT.isSimple()) return false; // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that // negate values directly (fneg is free). So, we don't want to let the DAG // combiner rewrite fneg into xors and some other instructions. For f16 and // FullFP16 argument passing, some bitcast nodes may be introduced, // triggering this DAG combine rewrite, so we are avoiding that with this. switch (VT.getSimpleVT().SimpleTy) { default: break; case MVT::f16: return Subtarget->hasFullFP16(); } return false; } bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { EVT VT = ExtVal.getValueType(); if (!isTypeLegal(VT)) return false; // Don't create a loadext if we can fold the extension into a wide/long // instruction. // If there's more than one user instruction, the loadext is desirable no // matter what. There can be two uses by the same instruction. if (ExtVal->use_empty() || !ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode())) return true; SDNode *U = *ExtVal->use_begin(); if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB || U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHL)) return false; return true; } bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; if (!isTypeLegal(EVT::getEVT(Ty1))) return false; assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); // Assuming the caller doesn't have a zeroext or signext return parameter, // truncation all the way down to i1 is valid. return true; } int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const { if (isLegalAddressingMode(DL, AM, Ty, AS)) { if (Subtarget->hasFPAO()) return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster return 0; } return -1; } static bool isLegalT1AddressImmediate(int64_t V, EVT VT) { if (V < 0) return false; unsigned Scale = 1; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: // Scale == 1; break; case MVT::i16: // Scale == 2; Scale = 2; break; case MVT::i32: // Scale == 4; Scale = 4; break; } if ((V & (Scale - 1)) != 0) return false; V /= Scale; return V == (V & ((1LL << 5) - 1)); } static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget) { bool isNeg = false; if (V < 0) { isNeg = true; V = - V; } switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: case MVT::i16: case MVT::i32: // + imm12 or - imm8 if (isNeg) return V == (V & ((1LL << 8) - 1)); return V == (V & ((1LL << 12) - 1)); case MVT::f32: case MVT::f64: // Same as ARM mode. FIXME: NEON? if (!Subtarget->hasVFP2()) return false; if ((V & 3) != 0) return false; V >>= 2; return V == (V & ((1LL << 8) - 1)); } } /// isLegalAddressImmediate - Return true if the integer value can be used /// as the offset of the target addressing mode for load / store of the /// given type. static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget) { if (V == 0) return true; if (!VT.isSimple()) return false; if (Subtarget->isThumb1Only()) return isLegalT1AddressImmediate(V, VT); else if (Subtarget->isThumb2()) return isLegalT2AddressImmediate(V, VT, Subtarget); // ARM mode. if (V < 0) V = - V; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: case MVT::i32: // +- imm12 return V == (V & ((1LL << 12) - 1)); case MVT::i16: // +- imm8 return V == (V & ((1LL << 8) - 1)); case MVT::f32: case MVT::f64: if (!Subtarget->hasVFP2()) // FIXME: NEON? return false; if ((V & 3) != 0) return false; V >>= 2; return V == (V & ((1LL << 8) - 1)); } } bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const { int Scale = AM.Scale; if (Scale < 0) return false; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: case MVT::i16: case MVT::i32: if (Scale == 1) return true; // r + r << imm Scale = Scale & ~1; return Scale == 2 || Scale == 4 || Scale == 8; case MVT::i64: // FIXME: What are we trying to model here? ldrd doesn't have an r + r // version in Thumb mode. // r + r if (Scale == 1) return true; // r * 2 (this can be lowered to r + r). if (!AM.HasBaseReg && Scale == 2) return true; return false; case MVT::isVoid: // Note, we allow "void" uses (basically, uses that aren't loads or // stores), because arm allows folding a scale into many arithmetic // operations. This should be made more precise and revisited later. // Allow r << imm, but the imm has to be a multiple of two. if (Scale & 1) return false; return isPowerOf2_32(Scale); } } bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const { const int Scale = AM.Scale; // Negative scales are not supported in Thumb1. if (Scale < 0) return false; // Thumb1 addressing modes do not support register scaling excepting the // following cases: // 1. Scale == 1 means no scaling. // 2. Scale == 2 this can be lowered to r + r if there is no base register. return (Scale == 1) || (!AM.HasBaseReg && Scale == 2); } /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I) const { EVT VT = getValueType(DL, Ty, true); if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget)) return false; // Can never fold addr of global into load/store. if (AM.BaseGV) return false; switch (AM.Scale) { case 0: // no scale reg, must be "r+i" or "r", or "i". break; default: // ARM doesn't support any R+R*scale+imm addr modes. if (AM.BaseOffs) return false; if (!VT.isSimple()) return false; if (Subtarget->isThumb1Only()) return isLegalT1ScaledAddressingMode(AM, VT); if (Subtarget->isThumb2()) return isLegalT2ScaledAddressingMode(AM, VT); int Scale = AM.Scale; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: case MVT::i32: if (Scale < 0) Scale = -Scale; if (Scale == 1) return true; // r + r << imm return isPowerOf2_32(Scale & ~1); case MVT::i16: case MVT::i64: // r +/- r if (Scale == 1 || (AM.HasBaseReg && Scale == -1)) return true; // r * 2 (this can be lowered to r + r). if (!AM.HasBaseReg && Scale == 2) return true; return false; case MVT::isVoid: // Note, we allow "void" uses (basically, uses that aren't loads or // stores), because arm allows folding a scale into many arithmetic // operations. This should be made more precise and revisited later. // Allow r << imm, but the imm has to be a multiple of two. if (Scale & 1) return false; return isPowerOf2_32(Scale); } } return true; } /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can compare /// a register against the immediate without having to materialize the /// immediate into a register. bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const { // Thumb2 and ARM modes can use cmn for negative immediates. if (!Subtarget->isThumb()) return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 || ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1; if (Subtarget->isThumb2()) return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 || ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1; // Thumb1 doesn't have cmn, and only 8-bit immediates. return Imm >= 0 && Imm <= 255; } /// isLegalAddImmediate - Return true if the specified immediate is a legal add /// *or sub* immediate, that is the target has add or sub instructions which can /// add a register with the immediate without having to materialize the /// immediate into a register. bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const { // Same encoding for add/sub, just flip the sign. int64_t AbsImm = std::abs(Imm); if (!Subtarget->isThumb()) return ARM_AM::getSOImmVal(AbsImm) != -1; if (Subtarget->isThumb2()) return ARM_AM::getT2SOImmVal(AbsImm) != -1; // Thumb1 only has 8-bit unsigned immediate. return AbsImm >= 0 && AbsImm <= 255; } static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG) { if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) return false; if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) { // AddressingMode 3 Base = Ptr->getOperand(0); if (ConstantSDNode *RHS = dyn_cast(Ptr->getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); if (RHSC < 0 && RHSC > -256) { assert(Ptr->getOpcode() == ISD::ADD); isInc = false; Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); return true; } } isInc = (Ptr->getOpcode() == ISD::ADD); Offset = Ptr->getOperand(1); return true; } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) { // AddressingMode 2 if (ConstantSDNode *RHS = dyn_cast(Ptr->getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); if (RHSC < 0 && RHSC > -0x1000) { assert(Ptr->getOpcode() == ISD::ADD); isInc = false; Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); Base = Ptr->getOperand(0); return true; } } if (Ptr->getOpcode() == ISD::ADD) { isInc = true; ARM_AM::ShiftOpc ShOpcVal= ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()); if (ShOpcVal != ARM_AM::no_shift) { Base = Ptr->getOperand(1); Offset = Ptr->getOperand(0); } else { Base = Ptr->getOperand(0); Offset = Ptr->getOperand(1); } return true; } isInc = (Ptr->getOpcode() == ISD::ADD); Base = Ptr->getOperand(0); Offset = Ptr->getOperand(1); return true; } // FIXME: Use VLDM / VSTM to emulate indexed FP load / store. return false; } static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG) { if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB) return false; Base = Ptr->getOperand(0); if (ConstantSDNode *RHS = dyn_cast(Ptr->getOperand(1))) { int RHSC = (int)RHS->getZExtValue(); if (RHSC < 0 && RHSC > -0x100) { // 8 bits. assert(Ptr->getOpcode() == ISD::ADD); isInc = false; Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0)); return true; } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero. isInc = Ptr->getOpcode() == ISD::ADD; Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0)); return true; } } return false; } /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. bool ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { if (Subtarget->isThumb1Only()) return false; EVT VT; SDValue Ptr; bool isSEXTLoad = false; if (LoadSDNode *LD = dyn_cast(N)) { Ptr = LD->getBasePtr(); VT = LD->getMemoryVT(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; } else if (StoreSDNode *ST = dyn_cast(N)) { Ptr = ST->getBasePtr(); VT = ST->getMemoryVT(); } else return false; bool isInc; bool isLegal = false; if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, Offset, isInc, DAG); else isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base, Offset, isInc, DAG); if (!isLegal) return false; AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC; return true; } /// getPostIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if this node can be /// combined with a load / store to form a post-indexed load / store. bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; bool isSEXTLoad = false, isNonExt; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD; isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD; } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); isNonExt = !ST->isTruncatingStore(); } else return false; if (Subtarget->isThumb1Only()) { // Thumb-1 can do a limited post-inc load or store as an updating LDM. It // must be non-extending/truncating, i32, with an offset of 4. assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!"); if (Op->getOpcode() != ISD::ADD || !isNonExt) return false; auto *RHS = dyn_cast(Op->getOperand(1)); if (!RHS || RHS->getZExtValue() != 4) return false; Offset = Op->getOperand(1); Base = Op->getOperand(0); AM = ISD::POST_INC; return true; } bool isInc; bool isLegal = false; if (Subtarget->isThumb2()) isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, isInc, DAG); else isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset, isInc, DAG); if (!isLegal) return false; if (Ptr != Base) { // Swap base ptr and offset to catch more post-index load / store when // it's legal. In Thumb2 mode, offset must be an immediate. if (Ptr == Offset && Op->getOpcode() == ISD::ADD && !Subtarget->isThumb2()) std::swap(Base, Offset); // Post-indexed load / store update the base pointer. if (Ptr != Base) return false; } AM = isInc ? ISD::POST_INC : ISD::POST_DEC; return true; } void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { unsigned BitWidth = Known.getBitWidth(); Known.resetAll(); switch (Op.getOpcode()) { default: break; case ARMISD::ADDC: case ARMISD::ADDE: case ARMISD::SUBC: case ARMISD::SUBE: // Special cases when we convert a carry to a boolean. if (Op.getResNo() == 0) { SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); // (ADDE 0, 0, C) will give us a single bit. if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) && isNullConstant(RHS)) { Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1); return; } } break; case ARMISD::CMOV: { // Bits are known zero/one if known on the LHS and RHS. DAG.computeKnownBits(Op.getOperand(0), Known, Depth+1); if (Known.isUnknown()) return; KnownBits KnownRHS; DAG.computeKnownBits(Op.getOperand(1), KnownRHS, Depth+1); Known.Zero &= KnownRHS.Zero; Known.One &= KnownRHS.One; return; } case ISD::INTRINSIC_W_CHAIN: { ConstantSDNode *CN = cast(Op->getOperand(1)); Intrinsic::ID IntID = static_cast(CN->getZExtValue()); switch (IntID) { default: return; case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { EVT VT = cast(Op)->getMemoryVT(); unsigned MemBits = VT.getScalarSizeInBits(); Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits); return; } } } case ARMISD::BFI: { // Conservatively, we can recurse down the first operand // and just mask out all affected bits. DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1); // The operand to BFI is already a mask suitable for removing the bits it // sets. ConstantSDNode *CI = cast(Op.getOperand(2)); const APInt &Mask = CI->getAPIntValue(); Known.Zero &= Mask; Known.One &= Mask; return; } } } //===----------------------------------------------------------------------===// // ARM Inline Assembly Support //===----------------------------------------------------------------------===// bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const { // Looking for "rev" which is V6+. if (!Subtarget->hasV6Ops()) return false; InlineAsm *IA = cast(CI->getCalledValue()); std::string AsmStr = IA->getAsmString(); SmallVector AsmPieces; SplitString(AsmStr, AsmPieces, ";\n"); switch (AsmPieces.size()) { default: return false; case 1: AsmStr = AsmPieces[0]; AsmPieces.clear(); SplitString(AsmStr, AsmPieces, " \t,"); // rev $0, $1 if (AsmPieces.size() == 3 && AsmPieces[0] == "rev" && AsmPieces[1] == "$0" && AsmPieces[2] == "$1" && IA->getConstraintString().compare(0, 4, "=l,l") == 0) { IntegerType *Ty = dyn_cast(CI->getType()); if (Ty && Ty->getBitWidth() == 32) return IntrinsicLowering::LowerToByteSwap(CI); } break; } return false; } const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const { // At this point, we have to lower this constraint to something else, so we // lower it to an "r" or "w". However, by doing this we will force the result // to be in register, while the X constraint is much more permissive. // // Although we are correct (we are free to emit anything, without // constraints), we might break use cases that would expect us to be more // efficient and emit something else. if (!Subtarget->hasVFP2()) return "r"; if (ConstraintVT.isFloatingPoint()) return "w"; if (ConstraintVT.isVector() && Subtarget->hasNEON() && (ConstraintVT.getSizeInBits() == 64 || ConstraintVT.getSizeInBits() == 128)) return "w"; return "r"; } /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. ARMTargetLowering::ConstraintType ARMTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'l': return C_RegisterClass; case 'w': return C_RegisterClass; case 'h': return C_RegisterClass; case 'x': return C_RegisterClass; case 't': return C_RegisterClass; case 'j': return C_Other; // Constant for movw. // An address with a single base register. Due to the way we // currently handle addresses it is the same as an 'r' memory constraint. case 'Q': return C_Memory; } } else if (Constraint.size() == 2) { switch (Constraint[0]) { default: break; // All 'U+' constraints are addresses. case 'U': return C_Memory; } } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight ARMTargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); break; case 'l': if (type->isIntegerTy()) { if (Subtarget->isThumb()) weight = CW_SpecificReg; else weight = CW_Register; } break; case 'w': if (type->isFloatingPointTy()) weight = CW_Register; break; } return weight; } using RCPair = std::pair; RCPair ARMTargetLowering::getRegForInlineAsmConstraint( const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const { if (Constraint.size() == 1) { // GCC ARM Constraint Letters switch (Constraint[0]) { case 'l': // Low regs or general regs. if (Subtarget->isThumb()) return RCPair(0U, &ARM::tGPRRegClass); return RCPair(0U, &ARM::GPRRegClass); case 'h': // High regs or no regs. if (Subtarget->isThumb()) return RCPair(0U, &ARM::hGPRRegClass); break; case 'r': if (Subtarget->isThumb1Only()) return RCPair(0U, &ARM::tGPRRegClass); return RCPair(0U, &ARM::GPRRegClass); case 'w': if (VT == MVT::Other) break; if (VT == MVT::f32) return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) return RCPair(0U, &ARM::DPRRegClass); if (VT.getSizeInBits() == 128) return RCPair(0U, &ARM::QPRRegClass); break; case 'x': if (VT == MVT::Other) break; if (VT == MVT::f32) return RCPair(0U, &ARM::SPR_8RegClass); if (VT.getSizeInBits() == 64) return RCPair(0U, &ARM::DPR_8RegClass); if (VT.getSizeInBits() == 128) return RCPair(0U, &ARM::QPR_8RegClass); break; case 't': if (VT == MVT::Other) break; if (VT == MVT::f32 || VT == MVT::i32) return RCPair(0U, &ARM::SPRRegClass); if (VT.getSizeInBits() == 64) return RCPair(0U, &ARM::DPR_VFP2RegClass); if (VT.getSizeInBits() == 128) return RCPair(0U, &ARM::QPR_VFP2RegClass); break; } } if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass); return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector&Ops, SelectionDAG &DAG) const { SDValue Result; // Currently only support length 1 constraints. if (Constraint.length() != 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; case 'j': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': ConstantSDNode *C = dyn_cast(Op); if (!C) return; int64_t CVal64 = C->getSExtValue(); int CVal = (int) CVal64; // None of these constraints allow values larger than 32 bits. Check // that the value fits in an int. if (CVal != CVal64) return; switch (ConstraintLetter) { case 'j': // Constant suitable for movw, must be between 0 and // 65535. if (Subtarget->hasV6T2Ops()) if (CVal >= 0 && CVal <= 65535) break; return; case 'I': if (Subtarget->isThumb1Only()) { // This must be a constant between 0 and 255, for ADD // immediates. if (CVal >= 0 && CVal <= 255) break; } else if (Subtarget->isThumb2()) { // A constant that can be used as an immediate value in a // data-processing instruction. if (ARM_AM::getT2SOImmVal(CVal) != -1) break; } else { // A constant that can be used as an immediate value in a // data-processing instruction. if (ARM_AM::getSOImmVal(CVal) != -1) break; } return; case 'J': if (Subtarget->isThumb1Only()) { // This must be a constant between -255 and -1, for negated ADD // immediates. This can be used in GCC with an "n" modifier that // prints the negated value, for use with SUB instructions. It is // not useful otherwise but is implemented for compatibility. if (CVal >= -255 && CVal <= -1) break; } else { // This must be a constant between -4095 and 4095. It is not clear // what this constraint is intended for. Implemented for // compatibility with GCC. if (CVal >= -4095 && CVal <= 4095) break; } return; case 'K': if (Subtarget->isThumb1Only()) { // A 32-bit value where only one byte has a nonzero value. Exclude // zero to match GCC. This constraint is used by GCC internally for // constants that can be loaded with a move/shift combination. // It is not useful otherwise but is implemented for compatibility. if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal)) break; } else if (Subtarget->isThumb2()) { // A constant whose bitwise inverse can be used as an immediate // value in a data-processing instruction. This can be used in GCC // with a "B" modifier that prints the inverted value, for use with // BIC and MVN instructions. It is not useful otherwise but is // implemented for compatibility. if (ARM_AM::getT2SOImmVal(~CVal) != -1) break; } else { // A constant whose bitwise inverse can be used as an immediate // value in a data-processing instruction. This can be used in GCC // with a "B" modifier that prints the inverted value, for use with // BIC and MVN instructions. It is not useful otherwise but is // implemented for compatibility. if (ARM_AM::getSOImmVal(~CVal) != -1) break; } return; case 'L': if (Subtarget->isThumb1Only()) { // This must be a constant between -7 and 7, // for 3-operand ADD/SUB immediate instructions. if (CVal >= -7 && CVal < 7) break; } else if (Subtarget->isThumb2()) { // A constant whose negation can be used as an immediate value in a // data-processing instruction. This can be used in GCC with an "n" // modifier that prints the negated value, for use with SUB // instructions. It is not useful otherwise but is implemented for // compatibility. if (ARM_AM::getT2SOImmVal(-CVal) != -1) break; } else { // A constant whose negation can be used as an immediate value in a // data-processing instruction. This can be used in GCC with an "n" // modifier that prints the negated value, for use with SUB // instructions. It is not useful otherwise but is implemented for // compatibility. if (ARM_AM::getSOImmVal(-CVal) != -1) break; } return; case 'M': if (Subtarget->isThumb1Only()) { // This must be a multiple of 4 between 0 and 1020, for // ADD sp + immediate. if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0)) break; } else { // A power of two or a constant between 0 and 32. This is used in // GCC for the shift amount on shifted register operands, but it is // useful in general for any shift amounts. if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0)) break; } return; case 'N': if (Subtarget->isThumb()) { // FIXME thumb2 // This must be a constant between 0 and 31, for shift amounts. if (CVal >= 0 && CVal <= 31) break; } return; case 'O': if (Subtarget->isThumb()) { // FIXME thumb2 // This must be a multiple of 4 between -508 and 508, for // ADD/SUB sp = sp + immediate. if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0)) break; } return; } Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType()); break; } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } static RTLIB::Libcall getDivRemLibcall( const SDNode *N, MVT::SimpleValueType SVT) { assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && "Unhandled Opcode in getDivRemLibcall"); bool isSigned = N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::SREM; RTLIB::Libcall LC; switch (SVT) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break; case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break; case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break; case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break; } return LC; } static TargetLowering::ArgListTy getDivRemArgList( const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) { assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM || N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) && "Unhandled Opcode in getDivRemArgList"); bool isSigned = N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::SREM; TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { EVT ArgVT = N->getOperand(i).getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*Context); Entry.Node = N->getOperand(i); Entry.Ty = ArgTy; Entry.IsSExt = isSigned; Entry.IsZExt = !isSigned; Args.push_back(Entry); } if (Subtarget->isTargetWindows() && Args.size() >= 2) std::swap(Args[0], Args[1]); return Args; } SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() || Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() || Subtarget->isTargetWindows()) && "Register-based DivRem lowering only"); unsigned Opcode = Op->getOpcode(); assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) && "Invalid opcode for Div/Rem lowering"); bool isSigned = (Opcode == ISD::SDIVREM); EVT VT = Op->getValueType(0); Type *Ty = VT.getTypeForEVT(*DAG.getContext()); SDLoc dl(Op); // If the target has hardware divide, use divide + multiply + subtract: // div = a / b // rem = a - b * div // return {div, rem} // This should be lowered into UDIV/SDIV + MLS later on. bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode() : Subtarget->hasDivideInARMMode(); if (hasDivide && Op->getValueType(0).isSimple() && Op->getSimpleValueType(0) == MVT::i32) { unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV; const SDValue Dividend = Op->getOperand(0); const SDValue Divisor = Op->getOperand(1); SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor); SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor); SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul); SDValue Values[2] = {Div, Rem}; return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values); } RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(), VT.getSimpleVT().SimpleTy); SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args = getDivRemArgList(Op.getNode(), DAG.getContext(), Subtarget); SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); Type *RetTy = StructType::get(Ty, Ty); if (Subtarget->isTargetWindows()) InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(InChain) .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args)) .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned); std::pair CallInfo = LowerCallTo(CLI); return CallInfo.first; } // Lowers REM using divmod helpers // see RTABI section 4.2/4.3 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const { // Build return types (div and rem) std::vector RetTyParams; Type *RetTyElement; switch (N->getValueType(0).getSimpleVT().SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break; case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break; case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break; case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break; } RetTyParams.push_back(RetTyElement); RetTyParams.push_back(RetTyElement); ArrayRef ret = ArrayRef(RetTyParams); Type *RetTy = StructType::get(*DAG.getContext(), ret); RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT(). SimpleTy); SDValue InChain = DAG.getEntryNode(); TargetLowering::ArgListTy Args = getDivRemArgList(N, DAG.getContext(), Subtarget); bool isSigned = N->getOpcode() == ISD::SREM; SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); if (Subtarget->isTargetWindows()) InChain = WinDBZCheckDenominator(DAG, N, InChain); // Lower call CallLoweringInfo CLI(DAG); CLI.setChain(InChain) .setCallee(CallingConv::ARM_AAPCS, RetTy, Callee, std::move(Args)) .setSExtResult(isSigned).setZExtResult(!isSigned).setDebugLoc(SDLoc(N)); std::pair CallResult = LowerCallTo(CLI); // Return second (rem) result operand (first contains div) SDNode *ResNode = CallResult.first.getNode(); assert(ResNode->getNumOperands() == 2 && "divmod should return two operands"); return ResNode->getOperand(1); } SDValue ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetWindows() && "unsupported target platform"); SDLoc DL(Op); // Get the inputs. SDValue Chain = Op.getOperand(0); SDValue Size = Op.getOperand(1); if (DAG.getMachineFunction().getFunction().hasFnAttribute( "no-stack-arg-probe")) { unsigned Align = cast(Op.getOperand(2))->getZExtValue(); SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); Chain = SP.getValue(1); SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size); if (Align) SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0), DAG.getConstant(-(uint64_t)Align, DL, MVT::i32)); Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP); SDValue Ops[2] = { SP, Chain }; return DAG.getMergeValues(Ops, DL); } SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size, DAG.getConstant(2, DL, MVT::i32)); SDValue Flag; Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag); Flag = Chain.getValue(1); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag); SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32); Chain = NewSP.getValue(1); SDValue Ops[2] = { NewSP, Chain }; return DAG.getMergeValues(Ops, DL); } SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && "Unexpected type for custom-lowering FP_EXTEND"); RTLIB::Libcall LC; LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, SDLoc(Op)).first; } SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOperand(0).getValueType() == MVT::f64 && Subtarget->isFPOnlySP() && "Unexpected type for custom-lowering FP_ROUND"); RTLIB::Libcall LC; LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); SDValue SrcVal = Op.getOperand(0); return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false, SDLoc(Op)).first; } bool ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // The ARM target isn't yet aware of offsets. return false; } bool ARM::isBitFieldInvertedMask(unsigned v) { if (v == 0xffffffff) return false; // there can be 1's on either or both "outsides", all the "inside" // bits must be 0's return isShiftedMask_32(~v); } /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { if (!Subtarget->hasVFP3()) return false; if (VT == MVT::f16 && Subtarget->hasFullFP16()) return ARM_AM::getFP16Imm(Imm) != -1; if (VT == MVT::f32) return ARM_AM::getFP32Imm(Imm) != -1; if (VT == MVT::f64 && !Subtarget->isFPOnlySP()) return ARM_AM::getFP64Imm(Imm) != -1; return false; } /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const { switch (Intrinsic) { case Intrinsic::arm_neon_vld1: case Intrinsic::arm_neon_vld2: case Intrinsic::arm_neon_vld3: case Intrinsic::arm_neon_vld4: case Intrinsic::arm_neon_vld2lane: case Intrinsic::arm_neon_vld3lane: case Intrinsic::arm_neon_vld4lane: case Intrinsic::arm_neon_vld2dup: case Intrinsic::arm_neon_vld3dup: case Intrinsic::arm_neon_vld4dup: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); Info.align = cast(AlignArg)->getZExtValue(); // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::arm_neon_vld1x2: case Intrinsic::arm_neon_vld1x3: case Intrinsic::arm_neon_vld1x4: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align = 0; // volatile loads with NEON intrinsics not supported Info.flags = MachineMemOperand::MOLoad; return true; } case Intrinsic::arm_neon_vst1: case Intrinsic::arm_neon_vst2: case Intrinsic::arm_neon_vst3: case Intrinsic::arm_neon_vst4: case Intrinsic::arm_neon_vst2lane: case Intrinsic::arm_neon_vst3lane: case Intrinsic::arm_neon_vst4lane: { Info.opc = ISD::INTRINSIC_VOID; // Conservatively set memVT to the entire set of vectors stored. auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); unsigned NumElts = 0; for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1); Info.align = cast(AlignArg)->getZExtValue(); // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; } case Intrinsic::arm_neon_vst1x2: case Intrinsic::arm_neon_vst1x3: case Intrinsic::arm_neon_vst1x4: { Info.opc = ISD::INTRINSIC_VOID; // Conservatively set memVT to the entire set of vectors stored. auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); unsigned NumElts = 0; for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; NumElts += DL.getTypeSizeInBits(ArgTy) / 64; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = 0; // volatile stores with NEON intrinsics not supported Info.flags = MachineMemOperand::MOStore; return true; } case Intrinsic::arm_ldaex: case Intrinsic::arm_ldrex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; } case Intrinsic::arm_stlex: case Intrinsic::arm_strex: { auto &DL = I.getCalledFunction()->getParent()->getDataLayout(); PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.align = DL.getABITypeAlignment(PtrTy->getElementType()); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; } case Intrinsic::arm_stlexd: case Intrinsic::arm_strexd: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; Info.align = 8; Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; case Intrinsic::arm_ldaexd: case Intrinsic::arm_ldrexd: Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i64; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = 8; Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile; return true; default: break; } return false; } /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned Bits = Ty->getPrimitiveSizeInBits(); if (Bits == 0 || Bits > 32) return false; return true; } bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; return (Index == 0 || Index == ResVT.getVectorNumElements()); } Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); // First, if the target has no DMB, see what fallback we can use. if (!Subtarget->hasDataBarrier()) { // Some ARMv6 cpus can support data barriers with an mcr instruction. // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(7), Builder.getInt32(10), Builder.getInt32(5)}; return Builder.CreateCall(MCR, args); } else { // Instead of using barriers, atomic accesses on these subtargets use // libcalls. llvm_unreachable("makeDMB on a target so old that it has no barriers"); } } else { Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); // Only a full system barrier exists in the M-class architectures. Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; Constant *CDomain = Builder.getInt32(Domain); return Builder.CreateCall(DMB, CDomain); } } // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: llvm_unreachable("Invalid fence: unordered/non-atomic"); case AtomicOrdering::Monotonic: case AtomicOrdering::Acquire: return nullptr; // Nothing to do case AtomicOrdering::SequentiallyConsistent: if (!Inst->hasAtomicStore()) return nullptr; // Nothing to do LLVM_FALLTHROUGH; case AtomicOrdering::Release: case AtomicOrdering::AcquireRelease: if (Subtarget->preferISHSTBarriers()) return makeDMB(Builder, ARM_MB::ISHST); // FIXME: add a comment with a link to documentation justifying this. else return makeDMB(Builder, ARM_MB::ISH); } llvm_unreachable("Unknown fence ordering in emitLeadingFence"); } Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: llvm_unreachable("Invalid fence: unordered/not-atomic"); case AtomicOrdering::Monotonic: case AtomicOrdering::Release: return nullptr; // Nothing to do case AtomicOrdering::Acquire: case AtomicOrdering::AcquireRelease: case AtomicOrdering::SequentiallyConsistent: return makeDMB(Builder, ARM_MB::ISH); } llvm_unreachable("Unknown fence ordering in emitTrailingFence"); } // Loads and stores less than 64-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit // anything for those. bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); return (Size == 64) && !Subtarget->isMClass(); } // Loads and stores less than 64-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit // anything for those. // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that // guarantee, see DDI0406C ARM architecture reference manual, // sections A8.8.72-74 LDRD) TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); return ((Size == 64) && !Subtarget->isMClass()) ? AtomicExpansionKind::LLOnly : AtomicExpansionKind::None; } // For the real atomic operations, we have ldrex/strex up to 32 bits, // and up to 64 bits on the non-M profiles TargetLowering::AtomicExpansionKind ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); bool hasAtomicRMW = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); return (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None; } bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR( AtomicCmpXchgInst *AI) const { // At -O0, fast-regalloc cannot cope with the live vregs necessary to // implement cmpxchg without spilling. If the address being exchanged is also // on the stack and close enough to the spill slot, this can lead to a // situation where the monitor always gets cleared and the atomic operation // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead. bool hasAtomicCmpXchg = !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps(); return getTargetMachine().getOptLevel() != 0 && hasAtomicCmpXchg; } bool ARMTargetLowering::shouldInsertFencesForAtomic( const Instruction *I) const { return InsertFencesForAtomic; } // This has so far only been implemented for MachO. bool ARMTargetLowering::useLoadStackGuardNode() const { return Subtarget->isTargetMachO(); } bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const { // If we do not have NEON, vector types are not natively supported. if (!Subtarget->hasNEON()) return false; // Floating point values and vector values map to the same register file. // Therefore, although we could do a store extract of a vector type, this is // better to leave at float as we have more freedom in the addressing mode for // those. if (VectorTy->isFPOrFPVectorTy()) return false; // If the index is unknown at compile time, this is very expensive to lower // and it is not possible to combine the store with the extract. if (!isa(Idx)) return false; assert(VectorTy->isVectorTy() && "VectorTy is not a vector type"); unsigned BitWidth = cast(VectorTy)->getBitWidth(); // We can do a store + vector extract on any vector that fits perfectly in a D // or Q register. if (BitWidth == 64 || BitWidth == 128) { Cost = 0; return true; } return false; } bool ARMTargetLowering::isCheapToSpeculateCttz() const { return Subtarget->hasV6T2Ops(); } bool ARMTargetLowering::isCheapToSpeculateCtlz() const { return Subtarget->hasV6T2Ops(); } Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast(Addr->getType())->getElementType(); bool IsAcquire = isAcquireOrStronger(Ord); // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i32, i32} and we have to recombine them into a // single i64 here. if (ValTy->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; Function *Ldrex = Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); if (!Subtarget->isLittle()) std::swap (Lo, Hi); Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); return Builder.CreateOr( Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64"); } Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateTruncOrBitCast( Builder.CreateCall(Ldrex, Addr), cast(Addr->getType())->getElementType()); } void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilder<> &Builder) const { if (!Subtarget->hasV7Ops()) return; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); } Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); bool IsRelease = isReleaseOrStronger(Ord); // Since the intrinsics must have legal type, the i64 intrinsics take two // parameters: "i32, i32". We must marshal Val into the appropriate form // before the call. if (Val->getType()->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; Function *Strex = Intrinsic::getDeclaration(M, Int); Type *Int32Ty = Type::getInt32Ty(M->getContext()); Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi"); if (!Subtarget->isLittle()) std::swap(Lo, Hi); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); return Builder.CreateCall(Strex, {Lo, Hi, Addr}); } Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex; Type *Tys[] = { Addr->getType() }; Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateCall( Strex, {Builder.CreateZExtOrBitCast( Val, Strex->getFunctionType()->getParamType(0)), Addr}); } /// A helper function for determining the number of interleaved accesses we /// will generate when lowering accesses of the given type. unsigned ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const { return (DL.getTypeSizeInBits(VecTy) + 127) / 128; } bool ARMTargetLowering::isLegalInterleavedAccessType( VectorType *VecTy, const DataLayout &DL) const { unsigned VecSize = DL.getTypeSizeInBits(VecTy); unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType()); // Ensure the vector doesn't have f16 elements. Even though we could do an // i16 vldN, we can't hold the f16 vectors and will end up converting via // f32. if (VecTy->getElementType()->isHalfTy()) return false; // Ensure the number of vector elements is greater than 1. if (VecTy->getNumElements() < 2) return false; // Ensure the element type is legal. if (ElSize != 8 && ElSize != 16 && ElSize != 32) return false; // Ensure the total vector size is 64 or a multiple of 128. Types larger than // 128 will be split into multiple interleaved accesses. return VecSize == 64 || VecSize % 128 == 0; } /// Lower an interleaved load into a vldN intrinsic. /// /// E.g. Lower an interleaved load (Factor = 2): /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements /// /// Into: /// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4) /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1 bool ARMTargetLowering::lowerInterleavedLoad( LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); assert(!Shuffles.empty() && "Empty shufflevector input"); assert(Shuffles.size() == Indices.size() && "Unmatched number of shufflevectors and indices"); VectorType *VecTy = Shuffles[0]->getType(); Type *EltTy = VecTy->getVectorElementType(); const DataLayout &DL = LI->getModule()->getDataLayout(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL)) return false; unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL); // A pointer vector can not be the return type of the ldN intrinsics. Need to // load integer vectors first and then convert to pointer vectors. if (EltTy->isPointerTy()) VecTy = VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements()); IRBuilder<> Builder(LI); // The base address of the load. Value *BaseAddr = LI->getPointerOperand(); if (NumLoads > 1) { // If we're going to generate more than one load, reset the sub-vector type // to something legal. VecTy = VectorType::get(VecTy->getVectorElementType(), VecTy->getVectorNumElements() / NumLoads); // We will compute the pointer operand of each load from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( BaseAddr, VecTy->getVectorElementType()->getPointerTo( LI->getPointerAddressSpace())); } assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!"); Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace()); Type *Tys[] = {VecTy, Int8Ptr}; static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; Function *VldnFunc = Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); // Holds sub-vectors extracted from the load intrinsic return values. The // sub-vectors are associated with the shufflevector instructions they will // replace. DenseMap> SubVecs; for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) { // If we're generating more than one load, compute the base address of // subsequent loads as an offset from the previous. if (LoadCount > 0) BaseAddr = Builder.CreateConstGEP1_32( BaseAddr, VecTy->getVectorNumElements() * Factor); SmallVector Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); Ops.push_back(Builder.getInt32(LI->getAlignment())); CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN"); // Replace uses of each shufflevector with the corresponding vector loaded // by ldN. for (unsigned i = 0; i < Shuffles.size(); i++) { ShuffleVectorInst *SV = Shuffles[i]; unsigned Index = Indices[i]; Value *SubVec = Builder.CreateExtractValue(VldN, Index); // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) SubVec = Builder.CreateIntToPtr( SubVec, VectorType::get(SV->getType()->getVectorElementType(), VecTy->getVectorNumElements())); SubVecs[SV].push_back(SubVec); } } // Replace uses of the shufflevector instructions with the sub-vectors // returned by the load intrinsic. If a shufflevector instruction is // associated with more than one sub-vector, those sub-vectors will be // concatenated into a single wide vector. for (ShuffleVectorInst *SVI : Shuffles) { auto &SubVec = SubVecs[SVI]; auto *WideVec = SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0]; SVI->replaceAllUsesWith(WideVec); } return true; } /// Lower an interleaved store into a vstN intrinsic. /// /// E.g. Lower an interleaved store (Factor = 3): /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1, /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> /// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4 /// /// Into: /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3> /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7> /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11> /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) /// /// Note that the new shufflevectors will be removed and we'll only generate one /// vst3 instruction in CodeGen. /// /// Example for a more general valid mask (Factor 3). Lower: /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1, /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19> /// store <12 x i32> %i.vec, <12 x i32>* %ptr /// /// Into: /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7> /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35> /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19> /// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4) bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && "Invalid interleave factor"); VectorType *VecTy = SVI->getType(); assert(VecTy->getVectorNumElements() % Factor == 0 && "Invalid interleaved store"); unsigned LaneLen = VecTy->getVectorNumElements() / Factor; Type *EltTy = VecTy->getVectorElementType(); VectorType *SubVecTy = VectorType::get(EltTy, LaneLen); const DataLayout &DL = SI->getModule()->getDataLayout(); // Skip if we do not have NEON and skip illegal vector types. We can // "legalize" wide vector types into multiple interleaved accesses as long as // the vector types are divisible by 128. if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL)) return false; unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL); Value *Op0 = SVI->getOperand(0); Value *Op1 = SVI->getOperand(1); IRBuilder<> Builder(SI); // StN intrinsics don't support pointer vectors as arguments. Convert pointer // vectors to integer vectors. if (EltTy->isPointerTy()) { Type *IntTy = DL.getIntPtrType(EltTy); // Convert to the corresponding integer vector. Type *IntVecTy = VectorType::get(IntTy, Op0->getType()->getVectorNumElements()); Op0 = Builder.CreatePtrToInt(Op0, IntVecTy); Op1 = Builder.CreatePtrToInt(Op1, IntVecTy); SubVecTy = VectorType::get(IntTy, LaneLen); } // The base address of the store. Value *BaseAddr = SI->getPointerOperand(); if (NumStores > 1) { // If we're going to generate more than one store, reset the lane length // and sub-vector type to something legal. LaneLen /= NumStores; SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen); // We will compute the pointer operand of each store from the original base // address using GEPs. Cast the base address to a pointer to the scalar // element type. BaseAddr = Builder.CreateBitCast( BaseAddr, SubVecTy->getVectorElementType()->getPointerTo( SI->getPointerAddressSpace())); } assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!"); auto Mask = SVI->getShuffleMask(); Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace()); Type *Tys[] = {Int8Ptr, SubVecTy}; static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2, Intrinsic::arm_neon_vst3, Intrinsic::arm_neon_vst4}; for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) { // If we generating more than one store, we compute the base address of // subsequent stores as an offset from the previous. if (StoreCount > 0) BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor); SmallVector Ops; Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr)); Function *VstNFunc = Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys); // Split the shufflevector operands into sub vectors for the new vstN call. for (unsigned i = 0; i < Factor; i++) { unsigned IdxI = StoreCount * LaneLen * Factor + i; if (Mask[IdxI] >= 0) { Ops.push_back(Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0))); } else { unsigned StartMask = 0; for (unsigned j = 1; j < LaneLen; j++) { unsigned IdxJ = StoreCount * LaneLen * Factor + j; if (Mask[IdxJ * Factor + IdxI] >= 0) { StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ; break; } } // Note: If all elements in a chunk are undefs, StartMask=0! // Note: Filling undef gaps with random elements is ok, since // those elements were being written anyway (with undefs). // In the case of all undefs we're defaulting to using elems from 0 // Note: StartMask cannot be negative, it's checked in // isReInterleaveMask Ops.push_back(Builder.CreateShuffleVector( Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0))); } } Ops.push_back(Builder.getInt32(SI->getAlignment())); Builder.CreateCall(VstNFunc, Ops); } return true; } enum HABaseType { HA_UNKNOWN = 0, HA_FLOAT, HA_DOUBLE, HA_VECT64, HA_VECT128 }; static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members) { if (auto *ST = dyn_cast(Ty)) { for (unsigned i = 0; i < ST->getNumElements(); ++i) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers)) return false; Members += SubMembers; } } else if (auto *AT = dyn_cast(Ty)) { uint64_t SubMembers = 0; if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers)) return false; Members += SubMembers * AT->getNumElements(); } else if (Ty->isFloatTy()) { if (Base != HA_UNKNOWN && Base != HA_FLOAT) return false; Members = 1; Base = HA_FLOAT; } else if (Ty->isDoubleTy()) { if (Base != HA_UNKNOWN && Base != HA_DOUBLE) return false; Members = 1; Base = HA_DOUBLE; } else if (auto *VT = dyn_cast(Ty)) { Members = 1; switch (Base) { case HA_FLOAT: case HA_DOUBLE: return false; case HA_VECT64: return VT->getBitWidth() == 64; case HA_VECT128: return VT->getBitWidth() == 128; case HA_UNKNOWN: switch (VT->getBitWidth()) { case 64: Base = HA_VECT64; return true; case 128: Base = HA_VECT128; return true; default: return false; } } } return (Members > 0 && Members <= 4); } /// Return the correct alignment for the current calling convention. unsigned ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy, DataLayout DL) const { if (!ArgTy->isVectorTy()) return DL.getABITypeAlignment(ArgTy); // Avoid over-aligning vector parameters. It would require realigning the // stack and waste space for no real benefit. return std::min(DL.getABITypeAlignment(ArgTy), DL.getStackAlignment()); } /// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of /// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when /// passing according to AAPCS rules. bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { if (getEffectiveCallingConv(CallConv, isVarArg) != CallingConv::ARM_AAPCS_VFP) return false; HABaseType Base = HA_UNKNOWN; uint64_t Members = 0; bool IsHA = isHomogeneousAggregate(Ty, Base, Members); LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump()); bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy(); return IsHA || IsIntArray; } unsigned ARMTargetLowering::getExceptionPointerRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0; } unsigned ARMTargetLowering::getExceptionSelectorRegister( const Constant *PersonalityFn) const { // Platforms which do not use SjLj EH may return values in these registers // via the personality function. return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1; } void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { // Update IsSplitCSR in ARMFunctionInfo. ARMFunctionInfo *AFI = Entry->getParent()->getInfo(); AFI->setIsSplitCSR(true); } void ARMTargetLowering::insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const { const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo(); const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); if (!IStart) return; const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); MachineBasicBlock::iterator MBBI = Entry->begin(); for (const MCPhysReg *I = IStart; *I; ++I) { const TargetRegisterClass *RC = nullptr; if (ARM::GPRRegClass.contains(*I)) RC = &ARM::GPRRegClass; else if (ARM::DPRRegClass.contains(*I)) RC = &ARM::DPRRegClass; else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); unsigned NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. // FIXME: this currently does not emit CFI pseudo-instructions, it works // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) .addReg(*I); // Insert the copy-back instructions right before the terminator. for (auto *Exit : Exits) BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), TII->get(TargetOpcode::COPY), *I) .addReg(NewVR); } } void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const { MF.getFrameInfo().computeMaxCallFrameSize(MF); TargetLoweringBase::finalizeLowering(MF); } Index: head/contrib/llvm/lib/Target/ARM/ARMISelLowering.h =================================================================== --- head/contrib/llvm/lib/Target/ARM/ARMISelLowering.h (revision 344055) +++ head/contrib/llvm/lib/Target/ARM/ARMISelLowering.h (revision 344056) @@ -1,809 +1,812 @@ //===- ARMISelLowering.h - ARM DAG Lowering Interface -----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that ARM uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H #define LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H #include "MCTargetDesc/ARMBaseInfo.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InlineAsm.h" #include "llvm/Support/CodeGen.h" #include "llvm/Support/MachineValueType.h" #include namespace llvm { class ARMSubtarget; class DataLayout; class FastISel; class FunctionLoweringInfo; class GlobalValue; class InstrItineraryData; class Instruction; class MachineBasicBlock; class MachineInstr; class SelectionDAG; class TargetLibraryInfo; class TargetMachine; class TargetRegisterInfo; class VectorType; namespace ARMISD { // ARM Specific DAG Nodes enum NodeType : unsigned { // Start the numbering where the builtin ops and target ops leave off. FIRST_NUMBER = ISD::BUILTIN_OP_END, Wrapper, // Wrapper - A wrapper node for TargetConstantPool, // TargetExternalSymbol, and TargetGlobalAddress. WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in // PIC mode. WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable // Add pseudo op to model memcpy for struct byval. COPY_STRUCT_BYVAL, CALL, // Function call. CALL_PRED, // Function call that's predicable. CALL_NOLINK, // Function call with branch not branch-and-link. BRCOND, // Conditional branch. BR_JT, // Jumptable branch. BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump). RET_FLAG, // Return with a flag operand. INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand. PIC_ADD, // Add with a PC operand and a PIC label. CMP, // ARM compare instructions. CMN, // ARM CMN instructions. CMPZ, // ARM compare that sets only Z flag. CMPFP, // ARM VFP compare instruction, sets FPSCR. CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR. FMSTAT, // ARM fmstat instruction. CMOV, // ARM conditional move instructions. SSAT, // Signed saturation USAT, // Unsigned saturation BCC_i64, SRL_FLAG, // V,Flag = srl_flag X -> srl X, 1 + save carry out. SRA_FLAG, // V,Flag = sra_flag X -> sra X, 1 + save carry out. RRX, // V = RRX X, Flag -> srl X, 1 + shift in carry flag. ADDC, // Add with carry ADDE, // Add using carry SUBC, // Sub with carry SUBE, // Sub using carry VMOVRRD, // double to two gprs. VMOVDRR, // Two gprs to double. VMOVSR, // move gpr to single, used for f32 literal constructed in a gpr EH_SJLJ_SETJMP, // SjLj exception handling setjmp. EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch. TC_RETURN, // Tail call return pseudo. THREAD_POINTER, DYN_ALLOC, // Dynamic allocation on the stack. MEMBARRIER_MCR, // Memory barrier (MCR) PRELOAD, // Preload WIN__CHKSTK, // Windows' __chkstk call to do stack probing. WIN__DBZCHK, // Windows' divide by zero check VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. VCGE, // Vector compare greater than or equal. VCGEZ, // Vector compare greater than or equal to zero. VCLEZ, // Vector compare less than or equal to zero. VCGEU, // Vector compare unsigned greater than or equal. VCGT, // Vector compare greater than. VCGTZ, // Vector compare greater than zero. VCLTZ, // Vector compare less than zero. VCGTU, // Vector compare unsigned greater than. VTST, // Vector test bits. // Vector shift by immediate: VSHL, // ...left VSHRs, // ...right (signed) VSHRu, // ...right (unsigned) // Vector rounding shift by immediate: VRSHRs, // ...right (signed) VRSHRu, // ...right (unsigned) VRSHRN, // ...right narrow // Vector saturating shift by immediate: VQSHLs, // ...left (signed) VQSHLu, // ...left (unsigned) VQSHLsu, // ...left (signed to unsigned) VQSHRNs, // ...right narrow (signed) VQSHRNu, // ...right narrow (unsigned) VQSHRNsu, // ...right narrow (signed to unsigned) // Vector saturating rounding shift by immediate: VQRSHRNs, // ...right narrow (signed) VQRSHRNu, // ...right narrow (unsigned) VQRSHRNsu, // ...right narrow (signed to unsigned) // Vector shift and insert: VSLI, // ...left VSRI, // ...right // Vector get lane (VMOV scalar to ARM core register) // (These are used for 8- and 16-bit element types only.) VGETLANEu, // zero-extend vector extract element VGETLANEs, // sign-extend vector extract element // Vector move immediate and move negated immediate: VMOVIMM, VMVNIMM, // Vector move f32 immediate: VMOVFPIMM, // Move H <-> R, clearing top 16 bits VMOVrh, VMOVhr, // Vector duplicate: VDUP, VDUPLANE, // Vector shuffles: VEXT, // extract VREV64, // reverse elements within 64-bit doublewords VREV32, // reverse elements within 32-bit words VREV16, // reverse elements within 16-bit halfwords VZIP, // zip (interleave) VUZP, // unzip (deinterleave) VTRN, // transpose VTBL1, // 1-register shuffle with mask VTBL2, // 2-register shuffle with mask // Vector multiply long: VMULLs, // ...signed VMULLu, // ...unsigned SMULWB, // Signed multiply word by half word, bottom SMULWT, // Signed multiply word by half word, top UMLAL, // 64bit Unsigned Accumulate Multiply SMLAL, // 64bit Signed Accumulate Multiply UMAAL, // 64-bit Unsigned Accumulate Accumulate Multiply SMLALBB, // 64-bit signed accumulate multiply bottom, bottom 16 SMLALBT, // 64-bit signed accumulate multiply bottom, top 16 SMLALTB, // 64-bit signed accumulate multiply top, bottom 16 SMLALTT, // 64-bit signed accumulate multiply top, top 16 SMLALD, // Signed multiply accumulate long dual SMLALDX, // Signed multiply accumulate long dual exchange SMLSLD, // Signed multiply subtract long dual SMLSLDX, // Signed multiply subtract long dual exchange SMMLAR, // Signed multiply long, round and add SMMLSR, // Signed multiply long, subtract and round // Operands of the standard BUILD_VECTOR node are not legalized, which // is fine if BUILD_VECTORs are always lowered to shuffles or other // operations, but for ARM some BUILD_VECTORs are legal as-is and their // operands need to be legalized. Define an ARM-specific version of // BUILD_VECTOR for this purpose. BUILD_VECTOR, // Bit-field insert BFI, // Vector OR with immediate VORRIMM, // Vector AND with NOT of immediate VBICIMM, // Vector bitwise select VBSL, // Pseudo-instruction representing a memory copy using ldm/stm // instructions. MEMCPY, // Vector load N-element structure to all lanes: VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE, VLD2DUP, VLD3DUP, VLD4DUP, // NEON loads with post-increment base updates: VLD1_UPD, VLD2_UPD, VLD3_UPD, VLD4_UPD, VLD2LN_UPD, VLD3LN_UPD, VLD4LN_UPD, VLD1DUP_UPD, VLD2DUP_UPD, VLD3DUP_UPD, VLD4DUP_UPD, // NEON stores with post-increment base updates: VST1_UPD, VST2_UPD, VST3_UPD, VST4_UPD, VST2LN_UPD, VST3LN_UPD, VST4LN_UPD }; } // end namespace ARMISD /// Define some predicates that are used for node matching. namespace ARM { bool isBitFieldInvertedMask(unsigned v); } // end namespace ARM //===--------------------------------------------------------------------===// // ARMTargetLowering - ARM Implementation of the TargetLowering interface class ARMTargetLowering : public TargetLowering { public: explicit ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI); unsigned getJumpTableEncoding() const override; bool useSoftFloat() const override; SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; /// ReplaceNodeResults - Replace the results of node with an illegal result /// type with new values built out of custom code. void ReplaceNodeResults(SDNode *N, SmallVectorImpl&Results, SelectionDAG &DAG) const override; const char *getTargetNodeName(unsigned Opcode) const override; bool isSelectSupported(SelectSupportKind Kind) const override { // ARM does not support scalar condition selects on vectors. return (Kind != ScalarCondVectorVal); } bool isReadOnly(const GlobalValue *GV) const; /// getSetCCResultType - Return the value type to use for ISD::SETCC. EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override; void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override; SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override; /// allowsMisalignedMemoryAccesses - Returns true if the target allows /// unaligned memory accesses of the specified type. Returns whether it /// is "fast" by reference in the second argument. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, bool *Fast) const override; EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override; bool isTruncateFree(Type *SrcTy, Type *DstTy) const override; bool isTruncateFree(EVT SrcVT, EVT DstVT) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; bool isFNegFree(EVT VT) const override; bool isVectorLoadExtDesirable(SDValue ExtVal) const override; bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override; /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I = nullptr) const override; /// getScalingFactorCost - Return the cost of the scaling used in /// addressing mode represented by AM. /// If the AM is supported, the return value must be >= 0. /// If the AM is not supported, the return value must be negative. int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const; /// Returns true if the addresing mode representing by AM is legal /// for the Thumb1 target, for a load/store of the specified type. bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const; /// isLegalICmpImmediate - Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can /// compare a register against the immediate without having to materialize /// the immediate into a register. bool isLegalICmpImmediate(int64_t Imm) const override; /// isLegalAddImmediate - Return true if the specified immediate is legal /// add immediate, that is the target has add instructions which can /// add a register and the immediate without having to materialize /// the immediate into a register. bool isLegalAddImmediate(int64_t Imm) const override; /// getPreIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if the node's address /// can be legally represented as pre-indexed load / store address. bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; /// getPostIndexedAddressParts - returns true by value, base pointer and /// offset pointer and addressing mode by reference if this node can be /// combined with a load / store to form a post-indexed load / store. bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override; bool ExpandInlineAsm(CallInst *CI) const override; ConstraintType getConstraintType(StringRef Constraint) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. ConstraintWeight getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const override; std::pair getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override; const char *LowerXConstraint(EVT ConstraintVT) const override; /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. If hasMemory is /// true it means one of the asm constraint of the inline asm instruction /// being processed is 'm'. void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const override; unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override { if (ConstraintCode == "Q") return InlineAsm::Constraint_Q; else if (ConstraintCode == "o") return InlineAsm::Constraint_o; else if (ConstraintCode.size() == 2) { if (ConstraintCode[0] == 'U') { switch(ConstraintCode[1]) { default: break; case 'm': return InlineAsm::Constraint_Um; case 'n': return InlineAsm::Constraint_Un; case 'q': return InlineAsm::Constraint_Uq; case 's': return InlineAsm::Constraint_Us; case 't': return InlineAsm::Constraint_Ut; case 'v': return InlineAsm::Constraint_Uv; case 'y': return InlineAsm::Constraint_Uy; } } } return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } const ARMSubtarget* getSubtarget() const { return Subtarget; } /// getRegClassFor - Return the register class that should be used for the /// specified value type. const TargetRegisterClass *getRegClassFor(MVT VT) const override; /// Returns true if a cast between SrcAS and DestAS is a noop. bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Addrspacecasts are always noops. return true; } bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, unsigned &PrefAlign) const override; /// createFastISel - This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; Sched::Preference getSchedulingPreference(SDNode *N) const override; bool isShuffleMaskLegal(ArrayRef M, EVT VT) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override; /// Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; /// Return true if EXTRACT_SUBVECTOR is cheap for this result type /// with this index. bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override; /// Returns true if an argument of type Ty needs to be passed in a /// contiguous block of registers in calling convention CallConv. bool functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; /// If a physical register, this returns the register that receives the /// exception address on entry to an EH pad. unsigned getExceptionPointerRegister(const Constant *PersonalityFn) const override; /// If a physical register, this returns the register that receives the /// exception typeid on entry to a landing pad. unsigned getExceptionSelectorRegister(const Constant *PersonalityFn) const override; Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const override; Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, AtomicOrdering Ord) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } bool lowerInterleavedLoad(LoadInst *LI, ArrayRef Shuffles, ArrayRef Indices, unsigned Factor) const override; bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; bool shouldInsertFencesForAtomic(const Instruction *I) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; bool useLoadStackGuardNode() const override; bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override; bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, const SelectionDAG &DAG) const override { // Do not merge to larger than i32. return (MemVT.getSizeInBits() <= 32); } bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; bool convertSetCCLogicToBitwiseLogic(EVT VT) const override { return VT.isScalarInteger(); } bool supportSwiftError() const override { return true; } bool hasStandaloneRem(EVT VT) const override { return HasStandaloneRem; } CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const; CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const; /// Returns true if \p VecTy is a legal interleaved access type. This /// function checks the vector element type and the overall width of the /// vector. bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL) const; /// Returns the number of interleaved accesses that will be generated when /// lowering accesses of the given type. unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const; void finalizeLowering(MachineFunction &MF) const override; /// Return the correct alignment for the current calling convention. unsigned getABIAlignmentForCallingConv(Type *ArgTy, DataLayout DL) const override; + bool isDesirableToCommuteWithShift(const SDNode *N, + CombineLevel Level) const override; + protected: std::pair findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override; private: /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can /// make the right decision when generating code for different targets. const ARMSubtarget *Subtarget; const TargetRegisterInfo *RegInfo; const InstrItineraryData *Itins; /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created. unsigned ARMPCLabelIndex; // TODO: remove this, and have shouldInsertFencesForAtomic do the proper // check. bool InsertFencesForAtomic; bool HasStandaloneRem = true; void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); std::pair getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const; using RegsToPassVector = SmallVector, 8>; void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, RegsToPassVector &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, SDValue &StackPtr, SmallVectorImpl &MemOpChains, ISD::ArgFlagsTy Flags) const; SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA, SDValue &Root, SelectionDAG &DAG, const SDLoc &dl) const; CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC, bool isVarArg) const; CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const; SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg, const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, ISD::ArgFlagsTy Flags) const; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const; SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const; SDValue LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddressWindows(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUnsignedALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const; void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed, SmallVectorImpl &Results) const; SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed, SDValue &Chain) const; SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const; SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; unsigned getRegisterByName(const char* RegName, EVT VT, SelectionDAG &DAG) const override; /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be /// expanded to FMAs when this method returns true, otherwise fmuladd is /// expanded to fmul + fadd. /// /// ARM supports both fused and unfused multiply-add operations; we already /// lower a pair of fmul and fadd to the latter so it's not clear that there /// would be a gain or that the gain would be worthwhile enough to risk /// correctness bugs. bool isFMAFasterThanFMulAndFAdd(EVT VT) const override { return false; } SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const; bool supportSplitCSR(MachineFunction *MF) const override { return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getFunction().hasFnAttribute(Attribute::NoUnwind); } void initializeSplitCSR(MachineBasicBlock *Entry) const override; void insertCopiesSplitCSR( MachineBasicBlock *Entry, const SmallVectorImpl &Exits) const override; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, const Value *OrigArg, unsigned InRegsParamRecordIdx, int ArgOffset, unsigned ArgSize) const; void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl, SDValue &Chain, unsigned ArgOffset, unsigned TotalArgRegsSaveSize, bool ForceMutable = false) const; SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI, SmallVectorImpl &InVals) const override; /// HandleByVal - Target-specific cleanup for ByVal support. void HandleByVal(CCState *, unsigned &, unsigned) const override; /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. bool IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG& DAG) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override; bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal, SDValue ARMcc, SDValue CCR, SDValue Cmp, SelectionDAG &DAG) const; SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const; SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &dl, bool InvalidOnQNaN) const; SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const; SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const; void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const; void EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const; bool RemapAddSubWithFlags(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitStructByval(MachineInstr &MI, MachineBasicBlock *MBB) const; MachineBasicBlock *EmitLowered__chkstk(MachineInstr &MI, MachineBasicBlock *MBB) const; MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI, MachineBasicBlock *MBB) const; }; enum NEONModImmType { VMOVModImm, VMVNModImm, OtherModImm }; namespace ARM { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); } // end namespace ARM } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H