Index: head/contrib/llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- head/contrib/llvm/include/llvm/CodeGen/TargetLowering.h	(revision 344055)
+++ head/contrib/llvm/include/llvm/CodeGen/TargetLowering.h	(revision 344056)
@@ -1,3700 +1,3704 @@
 //===- llvm/CodeGen/TargetLowering.h - Target Lowering Info -----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 ///
 /// \file
 /// This file describes how to lower LLVM code to machine code.  This has
 /// three main components:
 ///
 ///  1. Which ValueTypes are natively supported by the target.
 ///  2. Which operations are supported for supported ValueTypes.
 ///  3. Cost thresholds for alternative implementations of certain operations.
 ///
 /// In addition it has a few other components, like information about FP
 /// immediates.
 ///
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_CODEGEN_TARGETLOWERING_H
 #define LLVM_CODEGEN_TARGETLOWERING_H
 
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetCallingConv.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MachineValueType.h"
 #include "llvm/Target/TargetMachine.h"
 #include <algorithm>
 #include <cassert>
 #include <climits>
 #include <cstdint>
 #include <iterator>
 #include <map>
 #include <string>
 #include <utility>
 #include <vector>
 
 namespace llvm {
 
 class BranchProbability;
 class CCState;
 class CCValAssign;
 class Constant;
 class FastISel;
 class FunctionLoweringInfo;
 class GlobalValue;
 class IntrinsicInst;
 struct KnownBits;
 class LLVMContext;
 class MachineBasicBlock;
 class MachineFunction;
 class MachineInstr;
 class MachineJumpTableInfo;
 class MachineLoop;
 class MachineRegisterInfo;
 class MCContext;
 class MCExpr;
 class Module;
 class TargetRegisterClass;
 class TargetLibraryInfo;
 class TargetRegisterInfo;
 class Value;
 
 /// Scheduling preferences. Sched::Preference is the type returned by the
 /// getSchedulingPreference() members of TargetLoweringBase.
 namespace Sched {
 
   enum Preference {
     None,             // No preference
     Source,           // Follow source order.
     RegPressure,      // Scheduling for lowest register pressure.
     Hybrid,           // Scheduling for both latency and register pressure.
     ILP,              // Scheduling for ILP in low register pressure mode.
     VLIW              // Scheduling for VLIW targets.
   };
 
 } // end namespace Sched
 
 /// This base class for TargetLowering contains the SelectionDAG-independent
 /// parts that can be used from the rest of CodeGen.
 class TargetLoweringBase {
 public:
   /// This enum indicates whether operations are valid for a target, and if not,
   /// what action should be used to make them valid.
   enum LegalizeAction : uint8_t {
     Legal,      // The target natively supports this operation.
     Promote,    // This operation should be executed in a larger type.
     Expand,     // Try to expand this to other ops, otherwise use a libcall.
     LibCall,    // Don't try to expand this to other ops, always use a libcall.
     Custom      // Use the LowerOperation hook to implement custom lowering.
   };
 
   /// This enum indicates whether a type is legal for a target, and if not,
   /// what action should be used to make it valid.
   enum LegalizeTypeAction : uint8_t {
     TypeLegal,           // The target natively supports this type.
     TypePromoteInteger,  // Replace this integer with a larger one.
     TypeExpandInteger,   // Split this integer into two of half the size.
     TypeSoftenFloat,     // Convert this float to a same size integer type,
                          // if an operation is not supported in target HW.
     TypeExpandFloat,     // Split this float into two of half the size.
     TypeScalarizeVector, // Replace this one-element vector with its element.
     TypeSplitVector,     // Split this vector into two of half the size.
     TypeWidenVector,     // This vector should be widened into a larger vector.
     TypePromoteFloat     // Replace this float with a larger one.
   };
 
   /// LegalizeKind holds the legalization kind that needs to happen to EVT
   /// in order to type-legalize it.
   using LegalizeKind = std::pair<LegalizeTypeAction, EVT>;
 
   /// Enum that describes how the target represents true/false values.
   enum BooleanContent {
     UndefinedBooleanContent,    // Only bit 0 counts, the rest can hold garbage.
     ZeroOrOneBooleanContent,        // All bits zero except for bit 0.
     ZeroOrNegativeOneBooleanContent // All bits equal to bit 0.
   };
 
   /// Enum that describes what type of support for selects the target has.
   enum SelectSupportKind {
     ScalarValSelect,      // The target supports scalar selects (ex: cmov).
     ScalarCondVectorVal,  // The target supports selects with a scalar condition
                           // and vector values (ex: cmov).
     VectorMaskSelect      // The target supports vector selects with a vector
                           // mask (ex: x86 blends).
   };
 
   /// Enum that specifies what an atomic load/AtomicRMWInst is expanded
   /// to, if at all. Exists because different targets have different levels of
   /// support for these atomic instructions, and also have different options
   /// w.r.t. what they should expand to.
   enum class AtomicExpansionKind {
     None,    // Don't expand the instruction.
     LLSC,    // Expand the instruction into loadlinked/storeconditional; used
              // by ARM/AArch64.
     LLOnly,  // Expand the (load) instruction into just a load-linked, which has
              // greater atomic guarantees than a normal load.
     CmpXChg, // Expand the instruction into cmpxchg; used by at least X86.
   };
 
   /// Enum that specifies when a multiplication should be expanded.
   enum class MulExpansionKind {
     Always,            // Always expand the instruction.
     OnlyLegalOrCustom, // Only expand when the resulting instructions are legal
                        // or custom.
   };
 
   /// One element of an ArgListTy. Holds an IR value, an SDValue and an IR
   /// type, a set of single-bit flags, and an alignment field.
   class ArgListEntry {
   public:
     Value *Val = nullptr;      // Defaults to null.
     SDValue Node = SDValue();  // Defaults to a default-constructed SDValue.
     Type *Ty = nullptr;        // Defaults to null.
     // Single-bit flags. All of them are cleared by the default constructor
     // below.
     // NOTE(review): presumably each mirrors the like-named parameter
     // attribute (sext, zext, inreg, sret, ...) -- confirm against
     // setAttributes().
     bool IsSExt : 1;
     bool IsZExt : 1;
     bool IsInReg : 1;
     bool IsSRet : 1;
     bool IsNest : 1;
     bool IsByVal : 1;
     bool IsInAlloca : 1;
     bool IsReturned : 1;
     bool IsSwiftSelf : 1;
     bool IsSwiftError : 1;
     // Defaults to 0.
     // NOTE(review): units and the meaning of 0 are not visible here -- confirm.
     uint16_t Alignment = 0;
 
     /// Construct an entry with every flag set to false.
     ArgListEntry()
         : IsSExt(false), IsZExt(false), IsInReg(false), IsSRet(false),
           IsNest(false), IsByVal(false), IsInAlloca(false), IsReturned(false),
           IsSwiftSelf(false), IsSwiftError(false) {}
 
     /// Declared here, defined out of line.
     /// NOTE(review): presumably fills in this entry from the attributes of
     /// argument \p ArgIdx of call site \p CS -- confirm against the definition.
     void setAttributes(ImmutableCallSite *CS, unsigned ArgIdx);
   };
   using ArgListTy = std::vector<ArgListEntry>;
 
   /// Virtual hook with an empty default implementation: \p Args is left
   /// untouched unless a subclass overrides this.
   virtual void markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                      ArgListTy &Args) const {}
 
   /// Map a BooleanContent kind to the ISD extension opcode that produces a
   /// wider value obeying that convention.
   static ISD::NodeType getExtendForContent(BooleanContent Content) {
     // The high bits may hold rubbish, so any extension is acceptable.
     if (Content == UndefinedBooleanContent)
       return ISD::ANY_EXTEND;
     // The high bits must be zero.
     if (Content == ZeroOrOneBooleanContent)
       return ISD::ZERO_EXTEND;
     // The high bits must replicate bit 0, i.e. the sign bit is copied.
     if (Content == ZeroOrNegativeOneBooleanContent)
       return ISD::SIGN_EXTEND;
     llvm_unreachable("Invalid content kind");
   }
 
   /// NOTE: The TargetMachine owns TLOF.
   explicit TargetLoweringBase(const TargetMachine &TM);
   TargetLoweringBase(const TargetLoweringBase &) = delete;
   TargetLoweringBase &operator=(const TargetLoweringBase &) = delete;
   virtual ~TargetLoweringBase() = default;
 
 protected:
   /// Initialize all of the actions to default values.
   void initActions();
 
 public:
   const TargetMachine &getTargetMachine() const { return TM; }
 
   virtual bool useSoftFloat() const { return false; }
 
   /// Return the integer MVT whose width is the pointer size, in bits, that
   /// \p DL reports for address space \p AS (address space 0 when omitted).
   /// FIXME: The default needs to be removed once all the code is updated.
   MVT getPointerTy(const DataLayout &DL, uint32_t AS = 0) const {
     const unsigned PointerBits = DL.getPointerSizeInBits(AS);
     return MVT::getIntegerVT(PointerBits);
   }
 
   /// Return the type used for frame indices: the pointer type of the alloca
   /// address space recorded in \p DL.
   MVT getFrameIndexTy(const DataLayout &DL) const {
     const unsigned AllocaAS = DL.getAllocaAddrSpace();
     return getPointerTy(DL, AllocaAS);
   }
 
   /// Return the type for operands of fence.
   /// TODO: Let fence operands be of i32 type and remove this.
   virtual MVT getFenceOperandTy(const DataLayout &DL) const {
     return getPointerTy(DL);
   }
 
   /// EVT is not used in-tree, but is used by out-of-tree target.
   /// A documentation for this function would be nice...
   virtual MVT getScalarShiftAmountTy(const DataLayout &, EVT) const;
 
   EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL,
                        bool LegalTypes = true) const;
 
   /// Returns the type to be used for the index operand of:
   /// ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
   /// ISD::INSERT_SUBVECTOR, and ISD::EXTRACT_SUBVECTOR
   virtual MVT getVectorIdxTy(const DataLayout &DL) const {
     return getPointerTy(DL);
   }
 
   virtual bool isSelectSupported(SelectSupportKind /*kind*/) const {
     return true;
   }
 
   /// Return true if multiple condition registers are available.
   bool hasMultipleConditionRegisters() const {
     return HasMultipleConditionRegisters;
   }
 
   /// Return true if the target has BitExtract instructions.
   bool hasExtractBitsInsn() const { return HasExtractBitsInsn; }
 
   /// Return the preferred vector type legalization action. By default a
   /// one-element vector is scalarized and every other vector is promoted.
   virtual TargetLoweringBase::LegalizeTypeAction
   getPreferredVectorAction(EVT VT) const {
     return VT.getVectorNumElements() == 1 ? TypeScalarizeVector
                                           : TypePromoteInteger;
   }
 
   // A BUILD_VECTOR node can be expanded in one of two general ways:
   //  1. Apply SCALAR_TO_VECTOR to each defined scalar value and shuffle the
   //     results together.
   //  2. Assemble the vector on the stack and load it back.
   // Returning true selects method (1), provided every shuffle it needs is
   // legal according to isShuffleMaskLegal; returning false always selects
   // method (2). The vector type and the number of defined values are passed
   // in. The default picks method (1) only for fewer than three defined values.
   virtual bool
   shouldExpandBuildVectorWithShuffles(EVT /* VT */,
                                       unsigned DefinedValues) const {
     return DefinedValues <= 2;
   }
 
   /// Return true if integer divide is usually cheaper than a sequence of
   /// several shifts, adds, and multiplies for this target.
   /// The definition of "cheaper" may depend on whether we're optimizing
   /// for speed or for size.
   virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const { return false; }
 
   /// Return true if the target can handle a standalone remainder operation.
   virtual bool hasStandaloneRem(EVT VT) const {
     return true;
   }
 
   /// Return true if SQRT(X) shouldn't be replaced with X*RSQRT(X).
   virtual bool isFsqrtCheap(SDValue X, SelectionDAG &DAG) const {
     // Default behavior is to replace SQRT(X) with X*RSQRT(X).
     return false;
   }
 
   /// Reciprocal estimate status values used by the functions below.
   enum ReciprocalEstimate : int {
     Unspecified = -1,
     Disabled = 0,
     Enabled = 1
   };
 
   /// Return a ReciprocalEstimate enum value for a square root of the given type
   /// based on the function's attributes. If the operation is not overridden by
   /// the function's attributes, "Unspecified" is returned and target defaults
   /// are expected to be used for instruction selection.
   int getRecipEstimateSqrtEnabled(EVT VT, MachineFunction &MF) const;
 
   /// Return a ReciprocalEstimate enum value for a division of the given type
   /// based on the function's attributes. If the operation is not overridden by
   /// the function's attributes, "Unspecified" is returned and target defaults
   /// are expected to be used for instruction selection.
   int getRecipEstimateDivEnabled(EVT VT, MachineFunction &MF) const;
 
   /// Return the refinement step count for a square root of the given type based
   /// on the function's attributes. If the operation is not overridden by
   /// the function's attributes, "Unspecified" is returned and target defaults
   /// are expected to be used for instruction selection.
   int getSqrtRefinementSteps(EVT VT, MachineFunction &MF) const;
 
   /// Return the refinement step count for a division of the given type based
   /// on the function's attributes. If the operation is not overridden by
   /// the function's attributes, "Unspecified" is returned and target defaults
   /// are expected to be used for instruction selection.
   int getDivRefinementSteps(EVT VT, MachineFunction &MF) const;
 
   /// Returns true if target has indicated at least one type should be bypassed.
   bool isSlowDivBypassed() const { return !BypassSlowDivWidths.empty(); }
 
   /// Returns map of slow types for division or remainder with corresponding
   /// fast types
   const DenseMap<unsigned int, unsigned int> &getBypassSlowDivWidths() const {
     return BypassSlowDivWidths;
   }
 
   /// Return true if Flow Control is an expensive operation that should be
   /// avoided.
   bool isJumpExpensive() const { return JumpIsExpensive; }
 
   /// Return true if selects are only cheaper than branches if the branch is
   /// unlikely to be predicted right.
   bool isPredictableSelectExpensive() const {
     return PredictableSelectIsExpensive;
   }
 
   /// If a branch or a select condition is skewed in one direction by more than
   /// this factor, it is very likely to be predicted correctly.
   virtual BranchProbability getPredictableBranchThreshold() const;
 
   /// Return true if the following transform is beneficial:
   /// fold (conv (load x)) -> (load (conv*)x)
   /// On architectures that don't natively support some vector loads
   /// efficiently, casting the load to a smaller vector of larger types and
   /// loading is more efficient, however, this can be undone by optimizations in
   /// dag combiner.
   ///
   /// The default answers true unless both types are simple and a LOAD of
   /// \p LoadVT is already marked Promote with \p BitcastVT as its promotion
   /// target.
   virtual bool isLoadBitCastBeneficial(EVT LoadVT,
                                        EVT BitcastVT) const {
     if (LoadVT.isSimple() && BitcastVT.isSimple()) {
       const MVT SimpleLoadVT = LoadVT.getSimpleVT();
       // If the load is going to be promoted to the bitcast type later anyway,
       // folding now gains nothing and might interfere with other combines.
       // The promotion target is only queried when the action is Promote.
       const bool PromotedLater =
           getOperationAction(ISD::LOAD, SimpleLoadVT) == Promote &&
           getTypeToPromoteTo(ISD::LOAD, SimpleLoadVT) ==
               BitcastVT.getSimpleVT();
       return !PromotedLater;
     }
     // At least one of the types is not a simple type: report the fold as
     // beneficial.
     return true;
   }
 
   /// Return true if the following transform is beneficial:
   /// (store (y (conv x)), y*)) -> (store x, (x*))
   virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT) const {
     // Default to the same logic as loads.
     return isLoadBitCastBeneficial(StoreVT, BitcastVT);
   }
 
   /// Return true if it is expected to be cheaper to do a store of a non-zero
   /// vector constant with the given size and type for the address space than to
   /// store the individual scalar element constants.
   virtual bool storeOfVectorConstantIsCheap(EVT MemVT,
                                             unsigned NumElem,
                                             unsigned AddrSpace) const {
     return false;
   }
 
   /// Allow store merging after legalization in addition to before legalization.
   /// This may catch stores that do not exist earlier (eg, stores created from
   /// intrinsics).
   virtual bool mergeStoresAfterLegalization() const { return true; }
 
   /// Returns if it's reasonable to merge stores to MemVT size.
   virtual bool canMergeStoresTo(unsigned AS, EVT MemVT,
                                 const SelectionDAG &DAG) const {
     return true;
   }
 
   /// Return true if it is cheap to speculate a call to intrinsic cttz.
   virtual bool isCheapToSpeculateCttz() const {
     return false;
   }
 
   /// Return true if it is cheap to speculate a call to intrinsic ctlz.
   virtual bool isCheapToSpeculateCtlz() const {
     return false;
   }
 
   /// Return true if ctlz instruction is fast.
   virtual bool isCtlzFast() const {
     return false;
   }
 
   /// Return true if it is safe to transform an integer-domain bitwise operation
   /// into the equivalent floating-point operation. This should be set to true
   /// if the target has IEEE-754-compliant fabs/fneg operations for the input
   /// type.
   virtual bool hasBitPreservingFPLogic(EVT VT) const {
     return false;
   }
 
   /// Return true if it is cheaper to split the store of a merged int val
   /// from a pair of smaller values into multiple stores.
   virtual bool isMultiStoresCheaperThanBitsMerge(EVT LTy, EVT HTy) const {
     return false;
   }
 
   /// Return if the target supports combining a
   /// chain like:
   /// \code
   ///   %andResult = and %val1, #mask
   ///   %icmpResult = icmp %andResult, 0
   /// \endcode
   /// into a single machine instruction of a form like:
   /// \code
   ///   cc = test %register, #mask
   /// \endcode
   virtual bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
     return false;
   }
 
   /// Use bitwise logic to make pairs of compares more efficient. For example:
   /// and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
   /// This should be true when it takes more than one instruction to lower
   /// setcc (cmp+set on x86 scalar), when bitwise ops are faster than logic on
   /// condition bits (crand on PowerPC), and/or when reducing cmp+br is a win.
   virtual bool convertSetCCLogicToBitwiseLogic(EVT VT) const {
     return false;
   }
 
   /// Return the preferred operand type if the target can quickly compare
   /// integer values of \p NumBits bits. Any legal integer type is assumed to
   /// compare efficiently; for an illegal one the default returns
   /// MVT::INVALID_SIMPLE_VALUE_TYPE. Targets may override this to return a
   /// vector type for illegal wide integers they can still compare.
   virtual MVT hasFastEqualityCompare(unsigned NumBits) const {
     const MVT IntVT = MVT::getIntegerVT(NumBits);
     if (!isTypeLegal(IntVT))
       return MVT::INVALID_SIMPLE_VALUE_TYPE;
     return IntVT;
   }
 
   /// Return true if the target should transform:
   /// (X & Y) == Y ---> (~X & Y) == 0
   /// (X & Y) != Y ---> (~X & Y) != 0
   ///
   /// This may be profitable if the target has a bitwise and-not operation that
   /// sets comparison flags. A target may want to limit the transformation based
   /// on the type of Y or if Y is a constant.
   ///
   /// Note that the transform will not occur if Y is known to be a power-of-2
   /// because a mask and compare of a single bit can be handled by inverting the
   /// predicate, for example:
   /// (X & 8) == 8 ---> (X & 8) != 0
   virtual bool hasAndNotCompare(SDValue Y) const {
     return false;
   }
 
   /// Return true if the target has a bitwise and-not operation:
   /// X = ~A & B
   /// This can be used to simplify select or other instructions.
   virtual bool hasAndNot(SDValue X) const {
     // If the target has the more complex version of this operation, assume that
     // it has this operation too.
     return hasAndNotCompare(X);
   }
 
   /// There are two ways to clear extreme bits (either low or high):
   /// Mask:    x &  (-1 << y)  (the instcombine canonical form)
   /// Shifts:  x >> y << y
   /// Return true if the variant with 2 shifts is preferred.
   /// Return false if there is no preference.
   virtual bool preferShiftsToClearExtremeBits(SDValue X) const {
     // By default, let's assume that no one prefers shifts.
     return false;
   }
 
   /// Should we transform the IR-optimal check for whether a given truncation
   /// down into KeptBits would be truncating or not:
   ///   (add %x, (1 << (KeptBits-1))) srccond (1 << KeptBits)
   /// Into its more traditional form:
   ///   ((%x << C) a>> C) dstcond %x
   /// Return true if we should transform.
   /// Return false if there is no preference.
   virtual bool shouldTransformSignedTruncationCheck(EVT XVT,
                                                     unsigned KeptBits) const {
     // By default, assume no target prefers the shift-based form.
     return false;
   }
 
   /// Return true if the target wants to use the optimization that
   /// turns ext(promotableInst1(...(promotableInstN(load)))) into
   /// promotedInst1(...(promotedInstN(ext(load)))).
   bool enableExtLdPromotion() const { return EnableExtLdPromotion; }
 
   /// Return true if the target can combine store(extractelement VectorTy,
   /// Idx).
   /// \p Cost[out] gives the cost of that transformation when this is true.
   virtual bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                          unsigned &Cost) const {
     return false;
   }
 
   /// Return true if target supports floating point exceptions.
   bool hasFloatingPointExceptions() const {
     return HasFloatingPointExceptions;
   }
 
   /// Return true if the target always benefits from combining into FMA for a
   /// given value type. This must typically return false on targets where FMA
   /// takes more cycles to execute than FADD. The default returns false.
   virtual bool enableAggressiveFMAFusion(EVT VT) const {
     return false;
   }
 
   /// Return the ValueType of the result of SETCC operations.
   virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                                  EVT VT) const;
 
   /// Return the ValueType for comparison libcalls. Comparison libcalls include
   /// floating point comparison calls, and Ordered/Unordered check calls on
   /// floating point numbers.
   virtual
   MVT::SimpleValueType getCmpLibcallReturnType() const;
 
   /// For targets without i1 registers, describe what the high bits of a
   /// boolean value look like when it is held in a type wider than i1.
   ///
   /// "Boolean values" are the special true/false values produced by nodes
   /// such as SETCC and consumed (as the condition) by nodes such as SELECT
   /// and BRCOND; they are not the same thing as general values promoted from
   /// i1.  Some cpus distinguish vectors of booleans from scalar booleans, and
   /// \p isVec picks between the two.  On X86, for instance, a scalar boolean
   /// should be zero extended from i1 whereas the elements of a boolean vector
   /// should be sign extended from i1.
   ///
   /// Some cpus also treat floating point types like vectors rather than like
   /// scalars; \p isFloat selects the floating point setting and is only
   /// consulted when \p isVec is false.
   BooleanContent getBooleanContents(bool isVec, bool isFloat) const {
     if (!isVec)
       return isFloat ? BooleanFloatContents : BooleanContents;
     return BooleanVectorContents;
   }
 
   /// Convenience overload: derive the vector / floating point flags from
   /// \p Type and forward to getBooleanContents(bool, bool).
   BooleanContent getBooleanContents(EVT Type) const {
     const bool IsVectorTy = Type.isVector();
     const bool IsFloatTy = Type.isFloatingPoint();
     return getBooleanContents(IsVectorTy, IsFloatTy);
   }
 
   /// Return target scheduling preference.
   Sched::Preference getSchedulingPreference() const {
     return SchedPreferenceInfo;
   }
 
   /// Some scheduler, e.g. hybrid, can switch to different scheduling heuristics
   /// for different nodes. This function returns the preference (or none) for
   /// the given node.
   virtual Sched::Preference getSchedulingPreference(SDNode *) const {
     return Sched::None;
   }
 
   /// Return the register class to use for values of type \p VT. Asserts that
   /// a register class has been recorded for the type.
   virtual const TargetRegisterClass *getRegClassFor(MVT VT) const {
     const TargetRegisterClass *const Class = RegClassForVT[VT.SimpleTy];
     assert(Class && "This value type is not natively supported!");
     return Class;
   }
 
   /// Return the 'representative' register class for \p VT.
   ///
   /// That is the largest legal super-reg register class of the value type's
   /// own register class.  On i386, for example, i8, i16 and i32 all have GR32
   /// as their representative class, while on x86_64 it is GR64.
   virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const {
     return RepRegClassForVT[VT.SimpleTy];
   }
 
   /// Return the cost of the 'representative' register class for the specified
   /// value type.
   virtual uint8_t getRepRegClassCostFor(MVT VT) const {
     return RepRegClassCostForVT[VT.SimpleTy];
   }
 
   /// Return true if the target natively supports \p VT, meaning it has a
   /// register that holds the type directly, with no promotion or expansion.
   /// Types that are not simple are never legal.
   bool isTypeLegal(EVT VT) const {
     if (!VT.isSimple())
       return false;
     const unsigned Index = (unsigned)VT.getSimpleVT().SimpleTy;
     assert(Index < array_lengthof(RegClassForVT));
     return RegClassForVT[Index] != nullptr;
   }
 
   class ValueTypeActionImpl {
     /// ValueTypeActions - For each value type, keep a LegalizeTypeAction enum
     /// that indicates how instruction selection should deal with the type.
     LegalizeTypeAction ValueTypeActions[MVT::LAST_VALUETYPE];
 
   public:
     ValueTypeActionImpl() {
       std::fill(std::begin(ValueTypeActions), std::end(ValueTypeActions),
                 TypeLegal);
     }
 
     LegalizeTypeAction getTypeAction(MVT VT) const {
       return ValueTypeActions[VT.SimpleTy];
     }
 
     void setTypeAction(MVT VT, LegalizeTypeAction Action) {
       ValueTypeActions[VT.SimpleTy] = Action;
     }
   };
 
   const ValueTypeActionImpl &getValueTypeActions() const {
     return ValueTypeActions;
   }
 
   /// Return how we should legalize values of this type, as a
   /// LegalizeTypeAction: either it is already legal (TypeLegal) or it needs one
   /// of the other type actions (promotion, expansion, softening, vector
   /// splitting/widening/scalarizing).  The result is the first element of the
   /// pair returned by getTypeConversion().
   LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const {
     return getTypeConversion(Context, VT).first;
   }
   LegalizeTypeAction getTypeAction(MVT VT) const {
     return ValueTypeActions.getTypeAction(VT);
   }
 
   /// For types supported by the target, this is an identity function.  For
   /// types that must be promoted to larger types, this returns the larger type
   /// to promote to.  For integer types that are larger than the largest integer
   /// register, this contains one step in the expansion to get to the smaller
   /// register. For illegal floating point types, this returns the integer type
   /// to transform to.
   EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const {
     return getTypeConversion(Context, VT).second;
   }
 
   /// For types supported by the target, this is an identity function.  For
   /// types that must be expanded, repeatedly apply getTypeToTransformTo()
   /// until a legal type is reached and return that type.
   ///
   /// \p VT must not be a vector, and every step along the way must be either
   /// TypeLegal or TypeExpandInteger; any other action is unreachable.
   EVT getTypeToExpandTo(LLVMContext &Context, EVT VT) const {
     assert(!VT.isVector());
     for (;;) {
       const LegalizeTypeAction Action = getTypeAction(Context, VT);
       if (Action == TypeLegal)
         return VT;
       if (Action != TypeExpandInteger)
         llvm_unreachable("Type is not legal nor is it to be expanded!");
       // One expansion step; loop again in case the result is still illegal.
       VT = getTypeToTransformTo(Context, VT);
     }
   }
 
   /// Vector types are broken down into some number of legal first class types.
   /// For example, EVT::v8f32 maps to 2 EVT::v4f32 with Altivec or SSE1, or 8
   /// promoted EVT::f64 values with the X86 FP stack.  Similarly, EVT::v2i64
   /// turns into 4 EVT::i32 values with both PPC and X86.
   ///
   /// This method returns the number of registers needed, and the VT for each
   /// register.  It also returns the VT and quantity of the intermediate values
   /// before they are promoted/expanded.
   unsigned getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
                                   EVT &IntermediateVT,
                                   unsigned &NumIntermediates,
                                   MVT &RegisterVT) const;
 
   /// Certain targets such as MIPS require that some types such as vectors are
   /// always broken down into scalars in some contexts. This occurs even if the
   /// vector type is legal.
   virtual unsigned getVectorTypeBreakdownForCallingConv(
       LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
       unsigned &NumIntermediates, MVT &RegisterVT) const {
     return getVectorTypeBreakdown(Context, VT, IntermediateVT, NumIntermediates,
                                   RegisterVT);
   }
 
   /// Describes the memory access performed by a target intrinsic. Filled in
   /// by getTgtMemIntrinsic() for intrinsics that touch memory. Every field
   /// has a default, so a default-constructed object is fully initialized.
   struct IntrinsicInfo {
     unsigned     opc = 0;          // target opcode
     EVT          memVT;            // memory VT
 
     // value representing memory location
     PointerUnion<const Value *, const PseudoSourceValue *> ptrVal;
 
     int          offset = 0;       // offset off of ptrVal
     unsigned     size = 0;         // the size of the memory location
                                    // (taken from memVT if zero)
     unsigned     align = 1;        // alignment
 
     // Memory operand flags; defaults to MONone.
     MachineMemOperand::Flags flags = MachineMemOperand::MONone;
     IntrinsicInfo() = default;
   };
 
   /// Given an intrinsic, checks if on the target the intrinsic will need to map
   /// to a MemIntrinsicNode (touches memory). If this is the case, it returns
   /// true and store the intrinsic information into the IntrinsicInfo that was
   /// passed to the function.
   virtual bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
                                   MachineFunction &,
                                   unsigned /*Intrinsic*/) const {
     return false;
   }
 
   /// Returns true if the target can instruction select the specified FP
   /// immediate natively. If false, the legalizer will materialize the FP
   /// immediate as a load from a constant pool.
   virtual bool isFPImmLegal(const APFloat &/*Imm*/, EVT /*VT*/) const {
     return false;
   }
 
   /// Targets can use this to indicate that they only support *some*
   /// VECTOR_SHUFFLE operations, those with specific masks.  By default, if a
   /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to be
   /// legal.
   virtual bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const {
     return true;
   }
 
   /// Returns true if the operation can trap for the value type.
   ///
   /// VT must be a legal type. By default, we optimistically assume most
   /// operations don't trap except for integer divide and remainder.
   ///
   /// \param Op the operation being queried.
   /// \param VT the (legal) value type the operation is performed on.
   ///
   /// Declared here only; the default implementation lives out of line.
   virtual bool canOpTrap(unsigned Op, EVT VT) const;
 
   /// Companion to isShuffleMaskLegal. Targets use this to say whether a
   /// suitable VECTOR_SHUFFLE exists that can replace a VAND with a constant
   /// pool entry. The default implementation answers false for every mask.
   virtual bool isVectorClearMaskLegal(ArrayRef<int> /*Mask*/,
                                       EVT /*VT*/) const {
     return false;
   }
 
   /// Look up how operation \p Op on type \p VT is to be handled: it is legal,
   /// must be promoted to a larger size, must be expanded into some other code
   /// sequence, or the target supplies a custom expander for it.
   LegalizeAction getOperationAction(unsigned Op, EVT VT) const {
     // Extended types are not looked up in the action table; they always
     // expand.
     if (VT.isExtended())
       return Expand;
     // An opcode beyond the table is a target-specific SDNode: if it requires
     // legalization, the target has to provide custom legalization for it.
     if (Op >= array_lengthof(OpActions[0]))
       return Custom;
     const unsigned TypeIdx = (unsigned)VT.getSimpleVT().SimpleTy;
     return OpActions[TypeIdx][Op];
   }
 
   /// Translate a STRICT_* floating-point pseudo-opcode into its non-strict
   /// equivalent and report how that equivalent is legalized for \p VT.
   /// Only Legal is passed through unchanged; any other action is reported as
   /// Expand. Passing an opcode that is not listed below is a programming
   /// error.
   LegalizeAction getStrictFPOperationAction(unsigned Op, EVT VT) const {
     unsigned EqOpc;
     switch (Op) {
     default:
       llvm_unreachable("Unexpected FP pseudo-opcode");
     case ISD::STRICT_FADD:       EqOpc = ISD::FADD;       break;
     case ISD::STRICT_FSUB:       EqOpc = ISD::FSUB;       break;
     case ISD::STRICT_FMUL:       EqOpc = ISD::FMUL;       break;
     case ISD::STRICT_FDIV:       EqOpc = ISD::FDIV;       break;
     case ISD::STRICT_FSQRT:      EqOpc = ISD::FSQRT;      break;
     case ISD::STRICT_FPOW:       EqOpc = ISD::FPOW;       break;
     case ISD::STRICT_FPOWI:      EqOpc = ISD::FPOWI;      break;
     case ISD::STRICT_FMA:        EqOpc = ISD::FMA;        break;
     case ISD::STRICT_FSIN:       EqOpc = ISD::FSIN;       break;
     case ISD::STRICT_FCOS:       EqOpc = ISD::FCOS;       break;
     case ISD::STRICT_FEXP:       EqOpc = ISD::FEXP;       break;
     case ISD::STRICT_FEXP2:      EqOpc = ISD::FEXP2;      break;
     case ISD::STRICT_FLOG:       EqOpc = ISD::FLOG;       break;
     case ISD::STRICT_FLOG10:     EqOpc = ISD::FLOG10;     break;
     case ISD::STRICT_FLOG2:      EqOpc = ISD::FLOG2;      break;
     case ISD::STRICT_FRINT:      EqOpc = ISD::FRINT;      break;
     case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break;
     }
     // Custom and Promote are not currently handled for strict FP pseudo-ops,
     // so for now everything that is not Legal is expanded.
     return getOperationAction(EqOpc, VT) == Legal ? Legal : Expand;
   }
 
   /// Return true if the specified operation is legal on this target or can be
   /// made legal with custom lowering. This is used to help guide high-level
   /// lowering decisions.
   bool isOperationLegalOrCustom(unsigned Op, EVT VT) const {
     return (VT == MVT::Other || isTypeLegal(VT)) &&
       (getOperationAction(Op, VT) == Legal ||
        getOperationAction(Op, VT) == Custom);
   }
 
   /// Return true if the specified operation is legal on this target or can be
   /// made legal using promotion. This is used to help guide high-level lowering
   /// decisions.
   bool isOperationLegalOrPromote(unsigned Op, EVT VT) const {
     return (VT == MVT::Other || isTypeLegal(VT)) &&
       (getOperationAction(Op, VT) == Legal ||
        getOperationAction(Op, VT) == Promote);
   }
 
   /// Answer whether \p Op on \p VT is legal on this target or can be made
   /// legal with custom lowering or using promotion. Used to help guide
   /// high-level lowering decisions. The type must be MVT::Other or a legal
   /// type.
   bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT) const {
     const bool TypeOK = VT == MVT::Other || isTypeLegal(VT);
     if (!TypeOK)
       return false;
     switch (getOperationAction(Op, VT)) {
     case Legal:
     case Custom:
     case Promote:
       return true;
     default:
       return false;
     }
   }
 
   /// Return true if the operation uses custom lowering, regardless of whether
   /// the type is legal or not.
   bool isOperationCustom(unsigned Op, EVT VT) const {
     return getOperationAction(Op, VT) == Custom;
   }
 
   /// Decide whether \p Fn may be lowered using jump tables. A function whose
   /// "no-jump-tables" attribute has the string value "true" opts out;
   /// otherwise jump tables are allowed when either BR_JT or BRIND is legal or
   /// custom for MVT::Other.
   virtual bool areJTsAllowed(const Function *Fn) const {
     const bool DisabledByAttr =
         Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true";
     if (DisabledByAttr)
       return false;
     if (isOperationLegalOrCustom(ISD::BR_JT, MVT::Other))
       return true;
     return isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
   }
 
   /// Test whether the inclusive range [Low,High] contains no more values than
   /// a machine word has bits. The word width is taken from the index size of
   /// address space 0 in \p DL.
   bool rangeFitsInWord(const APInt &Low, const APInt &High,
                        const DataLayout &DL) const {
     // FIXME: Using the pointer type doesn't seem ideal.
     const uint64_t WordBits = DL.getIndexSizeInBits(0u);
     // Saturate one below UINT64_MAX so that adding 1 cannot wrap to zero.
     const uint64_t NumValues =
         (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;
     return NumValues <= WordBits;
   }
 
   /// Return true if lowering to a jump table is suitable for a set of case
   /// clusters which may contain \p NumCases cases, \p Range range of values.
   /// FIXME: This function checks the maximum table size and density, but the
   /// minimum size is not checked. It would be nice if the minimum size were
   /// also handled within this function. Currently, the minimum size check is
   /// performed in findJumpTable() in SelectionDAGBuilder and
   /// getEstimatedNumberOfCaseClusters() in BasicTTIImpl.
   virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases,
                                       uint64_t Range) const {
     const bool OptForSize = SI->getParent()->getParent()->optForSize();
     const unsigned MinDensity = getMinimumJumpTableDensity(OptForSize);
     // When optimizing for size, or when the configured maximum is zero (no
     // limit), the table size is effectively unbounded.
     unsigned MaxJumpTableSize = UINT_MAX;
     if (!OptForSize && getMaximumJumpTableSize() != 0)
       MaxJumpTableSize = getMaximumJumpTableSize();
     if (Range > MaxJumpTableSize)
       return false;
     // Check whether the range of clusters is dense enough for a jump table;
     // MinDensity is a percentage.
     return NumCases * 100 >= Range * MinDensity;
   }
 
   /// Decide whether a set of case clusters should be lowered to bit tests.
   /// \p NumDests is the number of unique destinations, \p Low and \p High are
   /// the lowest and highest case values, and \p NumCmps is the expected
   /// number of case value comparisons. The number of destinations, the
   /// comparison metric and the range must all be suitable.
   bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
                              const APInt &Low, const APInt &High,
                              const DataLayout &DL) const {
     // FIXME: I don't think NumCmps is the correct metric: a single case and a
     // range of cases both require only one branch to lower. Just looking at the
     // number of clusters and destinations should be enough to decide whether to
     // build bit tests.
     //
     // A range can only be lowered with bit tests if it fits the bitwidth of a
     // machine word.
     if (!rangeFitsInWord(Low, High, DL))
       return false;
     // Profitability: each destination requires a bit test and branch, and
     // there is an overall range check branch. For a small number of clusters,
     // separate comparisons might be cheaper, and for many destinations,
     // splitting the range might be better.
     switch (NumDests) {
     case 1:
       return NumCmps >= 3;
     case 2:
       return NumCmps >= 5;
     case 3:
       return NumCmps >= 6;
     default:
       return false;
     }
   }
 
   /// Answer whether \p Op on \p VT is illegal on this target or unlikely to
   /// be made legal with custom lowering: either the type itself is not legal
   /// or the operation's action is Expand. Used to help guide high-level
   /// lowering decisions.
   bool isOperationExpand(unsigned Op, EVT VT) const {
     if (!isTypeLegal(VT))
       return true;
     return getOperationAction(Op, VT) == Expand;
   }
 
   /// Answer whether \p Op on \p VT is legal on this target. The type must be
   /// MVT::Other or a legal type, and the operation's action must be Legal.
   bool isOperationLegal(unsigned Op, EVT VT) const {
     const bool TypeOK = VT == MVT::Other || isTypeLegal(VT);
     if (!TypeOK)
       return false;
     return getOperationAction(Op, VT) == Legal;
   }
 
   /// Look up how a load of \p MemVT extended to \p ValVT with extension kind
   /// \p ExtType is to be handled: it is legal, must be promoted to a larger
   /// size, must be expanded into some other code sequence, or the target has
   /// a custom expander for it. Extended types always yield Expand.
   LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT,
                                   EVT MemVT) const {
     if (ValVT.isExtended() || MemVT.isExtended())
       return Expand;
     const unsigned ValIdx = (unsigned)ValVT.getSimpleVT().SimpleTy;
     const unsigned MemIdx = (unsigned)MemVT.getSimpleVT().SimpleTy;
     assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValIdx < MVT::LAST_VALUETYPE &&
            MemIdx < MVT::LAST_VALUETYPE && "Table isn't big enough!");
     // Each extension kind occupies its own 4-bit field of the table entry.
     const unsigned FieldShift = 4 * ExtType;
     return (LegalizeAction)((LoadExtActions[ValIdx][MemIdx] >> FieldShift) &
                             0xf);
   }
 
   /// Return true if the specified load with extension is legal on this target.
   bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const {
     return getLoadExtAction(ExtType, ValVT, MemVT) == Legal;
   }
 
   /// Return true if the specified load with extension is legal or custom
   /// on this target.
   bool isLoadExtLegalOrCustom(unsigned ExtType, EVT ValVT, EVT MemVT) const {
     return getLoadExtAction(ExtType, ValVT, MemVT) == Legal ||
            getLoadExtAction(ExtType, ValVT, MemVT) == Custom;
   }
 
   /// Look up how a store of \p ValVT truncated to \p MemVT is to be handled:
   /// it is legal, must be promoted to a larger size, must be expanded into
   /// some other code sequence, or the target has a custom expander for it.
   /// Extended types always yield Expand.
   LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const {
     if (ValVT.isExtended() || MemVT.isExtended())
       return Expand;
     const unsigned ValIdx = (unsigned)ValVT.getSimpleVT().SimpleTy;
     const unsigned MemIdx = (unsigned)MemVT.getSimpleVT().SimpleTy;
     assert(ValIdx < MVT::LAST_VALUETYPE && MemIdx < MVT::LAST_VALUETYPE &&
            "Table isn't big enough!");
     return TruncStoreActions[ValIdx][MemIdx];
   }
 
   /// Answer whether the specified store with truncation is legal on this
   /// target: the value type must be legal and the truncating-store action
   /// must be Legal.
   bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const {
     if (!isTypeLegal(ValVT))
       return false;
     return getTruncStoreAction(ValVT, MemVT) == Legal;
   }
 
   /// Answer whether the target has a solution for the specified store with
   /// truncation: the value type must be legal and the truncating-store
   /// action must be Legal or Custom.
   bool isTruncStoreLegalOrCustom(EVT ValVT, EVT MemVT) const {
     if (!isTypeLegal(ValVT))
       return false;
     const LegalizeAction Action = getTruncStoreAction(ValVT, MemVT);
     return Action == Legal || Action == Custom;
   }
 
   /// Look up how an indexed load with mode \p IdxMode on \p VT is to be
   /// handled: it is legal, must be promoted to a larger size, must be
   /// expanded into some other code sequence, or the target has a custom
   /// expander for it.
   LegalizeAction
   getIndexedLoadAction(unsigned IdxMode, MVT VT) const {
     assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
            "Table isn't big enough!");
     const unsigned TypeIdx = (unsigned)VT.SimpleTy;
     // The load action is kept in the upper four bits of the table entry.
     return (LegalizeAction)((IndexedModeActions[TypeIdx][IdxMode] >> 4) & 0xf);
   }
 
   /// Return true if the specified indexed load is legal on this target.
   bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const {
     return VT.isSimple() &&
       (getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Legal ||
        getIndexedLoadAction(IdxMode, VT.getSimpleVT()) == Custom);
   }
 
   /// Look up how an indexed store with mode \p IdxMode on \p VT is to be
   /// handled: it is legal, must be promoted to a larger size, must be
   /// expanded into some other code sequence, or the target has a custom
   /// expander for it.
   LegalizeAction
   getIndexedStoreAction(unsigned IdxMode, MVT VT) const {
     assert(IdxMode < ISD::LAST_INDEXED_MODE && VT.isValid() &&
            "Table isn't big enough!");
     const unsigned TypeIdx = (unsigned)VT.SimpleTy;
     // The store action is kept in the lower four bits of the table entry.
     const unsigned StoreBits = IndexedModeActions[TypeIdx][IdxMode] & 0x0f;
     return (LegalizeAction)StoreBits;
   }
 
   /// Return true if the specified indexed load is legal on this target.
   bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const {
     return VT.isSimple() &&
       (getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Legal ||
        getIndexedStoreAction(IdxMode, VT.getSimpleVT()) == Custom);
   }
 
   /// Look up how condition code \p CC on \p VT is to be handled: it is legal,
   /// must be expanded into some other code sequence, or the target has a
   /// custom expander for it. Promote is never a valid answer here.
   LegalizeAction
   getCondCodeAction(ISD::CondCode CC, MVT VT) const {
     assert((unsigned)CC < array_lengthof(CondCodeActions) &&
            ((unsigned)VT.SimpleTy >> 3) < array_lengthof(CondCodeActions[0]) &&
            "Table isn't big enough!");
     // See setCondCodeAction for how this is encoded: each 32-bit word holds
     // the 4-bit actions of eight consecutive value types.
     const uint32_t PackedWord = CondCodeActions[CC][VT.SimpleTy >> 3];
     const uint32_t NibbleShift = 4 * (VT.SimpleTy & 0x7);
     const LegalizeAction Result =
         (LegalizeAction)((PackedWord >> NibbleShift) & 0xF);
     assert(Result != Promote && "Can't promote condition code!");
     return Result;
   }
 
   /// Answer whether the specified condition code is legal on this target,
   /// i.e. its action is exactly Legal.
   bool isCondCodeLegal(ISD::CondCode CC, MVT VT) const {
     const LegalizeAction Action = getCondCodeAction(CC, VT);
     return Action == Legal;
   }
 
   /// Answer whether the specified condition code is legal or custom on this
   /// target.
   bool isCondCodeLegalOrCustom(ISD::CondCode CC, MVT VT) const {
     const LegalizeAction Action = getCondCodeAction(CC, VT);
     return Action == Legal || Action == Custom;
   }
 
   /// For an operation whose action is Promote, return the value type it is
   /// promoted to. An explicit entry in PromoteToType takes precedence;
   /// otherwise the following simple value types are scanned for the first
   /// one that is legal and for which the operation is not itself promoted.
   MVT getTypeToPromoteTo(unsigned Op, MVT VT) const {
     assert(getOperationAction(Op, VT) == Promote &&
            "This operation isn't promoted!");
     // See if this has an explicit type specified.
     auto Explicit = PromoteToType.find(std::make_pair(Op, VT.SimpleTy));
     if (Explicit != PromoteToType.end())
       return Explicit->second;
     assert((VT.isInteger() || VT.isFloatingPoint()) &&
            "Cannot autopromote this type, add it with AddPromotedToType.");
     // Step through successive simple value types until a usable one turns up.
     MVT Candidate = VT;
     while (true) {
       Candidate = (MVT::SimpleValueType)(Candidate.SimpleTy+1);
       assert(Candidate.isInteger() == VT.isInteger() &&
              Candidate != MVT::isVoid && "Didn't find type to promote to!");
       if (isTypeLegal(Candidate) &&
           getOperationAction(Op, Candidate) != Promote)
         return Candidate;
     }
   }
 
   /// Return the EVT corresponding to this LLVM type.  This is fixed by the
   /// LLVM operations except for the pointer size.  If AllowUnknown is true,
   /// this will return MVT::Other for types with no EVT counterpart (e.g.
   /// structs), otherwise it will assert.
   EVT getValueType(const DataLayout &DL, Type *Ty,
                    bool AllowUnknown = false) const {
     // Scalar pointers lower to the native pointer type of their address
     // space.
     if (auto *ScalarPtrTy = dyn_cast<PointerType>(Ty))
       return getPointerTy(DL, ScalarPtrTy->getAddressSpace());
     // Everything that is not a vector is handled generically.
     if (!Ty->isVectorTy())
       return EVT::getEVT(Ty, AllowUnknown);
     auto *VecTy = cast<VectorType>(Ty);
     Type *EltTy = VecTy->getElementType();
     // Vectors of pointers lower to vectors of the native pointer type.
     if (auto *EltPtrTy = dyn_cast<PointerType>(EltTy)) {
       EVT PtrVT(getPointerTy(DL, EltPtrTy->getAddressSpace()));
       EltTy = PtrVT.getTypeForEVT(Ty->getContext());
     }
     return EVT::getVectorVT(Ty->getContext(), EVT::getEVT(EltTy, false),
                             VecTy->getNumElements());
   }
 
   /// Return the MVT corresponding to this LLVM type: the result of
   /// getValueType converted with getSimpleVT(). See getValueType.
   MVT getSimpleValueType(const DataLayout &DL, Type *Ty,
                          bool AllowUnknown = false) const {
     const EVT VT = getValueType(DL, Ty, AllowUnknown);
     return VT.getSimpleVT();
   }
 
   /// Return the desired alignment for ByVal or InAlloca aggregate function
   /// arguments in the caller parameter area.  This is the actual alignment, not
   /// its logarithm.
   ///
   /// \param Ty the type of the aggregate argument.
   /// \param DL the data layout to consult.
   ///
   /// Declared here only; the default implementation lives out of line.
   virtual unsigned getByValTypeAlignment(Type *Ty, const DataLayout &DL) const;
 
   /// Return the register type that the simple value type \p VT will
   /// eventually require, as recorded in RegisterTypeForVT.
   MVT getRegisterType(MVT VT) const {
     const MVT::SimpleValueType SimpleTy = VT.SimpleTy;
     assert((unsigned)SimpleTy < array_lengthof(RegisterTypeForVT));
     return RegisterTypeForVT[SimpleTy];
   }
 
   /// Return the register type that \p VT will eventually require. Simple
   /// types are looked up directly, extended vectors are answered by
   /// getVectorTypeBreakdown, and extended integers by recursing on the type
   /// they are transformed to. Any other extended type is unsupported.
   MVT getRegisterType(LLVMContext &Context, EVT VT) const {
     // Simple types: same bounds check and table lookup as the MVT overload.
     if (VT.isSimple())
       return getRegisterType(VT.getSimpleVT());
     if (VT.isVector()) {
       // Only the register type output is wanted; the rest is discarded.
       EVT IntermediateVT;
       MVT RegVT;
       unsigned NumIntermediates;
       (void)getVectorTypeBreakdown(Context, VT, IntermediateVT,
                                    NumIntermediates, RegVT);
       return RegVT;
     }
     if (VT.isInteger())
       return getRegisterType(Context, getTypeToTransformTo(Context, VT));
     llvm_unreachable("Unsupported extended type!");
   }
 
   /// Return the number of registers that this ValueType will eventually
   /// require.
   ///
   /// This is one for any types promoted to live in larger registers, but may
   /// be more than one for types (like i64) that are split into pieces.  For
   /// types like i140, which are first promoted then expanded, it is the
   /// number of registers needed to hold all the bits of the original type.
   /// For an i140 on a 32 bit machine this means 5 registers.
   unsigned getNumRegisters(LLVMContext &Context, EVT VT) const {
     if (VT.isSimple()) {
       const MVT SimpleVT = VT.getSimpleVT();
       assert((unsigned)SimpleVT.SimpleTy <
                 array_lengthof(NumRegistersForVT));
       return NumRegistersForVT[SimpleVT.SimpleTy];
     }
     if (VT.isVector()) {
       // The breakdown's return value is passed on; its outputs are unused.
       EVT IntermediateVT;
       MVT RegVT;
       unsigned NumIntermediates;
       return getVectorTypeBreakdown(Context, VT, IntermediateVT,
                                     NumIntermediates, RegVT);
     }
     if (VT.isInteger()) {
       const unsigned ValueBits = VT.getSizeInBits();
       const unsigned RegBits = getRegisterType(Context, VT).getSizeInBits();
       // Ceiling division: registers needed to hold every bit of the value.
       return (ValueBits + RegBits - 1) / RegBits;
     }
     llvm_unreachable("Unsupported extended type!");
   }
 
   /// Certain combinations of ABIs, targets and features require that types
   /// are legal for some operations and not for other operations. For MIPS,
   /// all vector types must be passed through the integer register set.
   /// The default ignores the calling convention and defers to
   /// getRegisterType.
   virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context,
                                             CallingConv::ID CC, EVT VT) const {
     const MVT RegVT = getRegisterType(Context, VT);
     return RegVT;
   }
 
   /// Certain targets require unusual breakdowns of certain types. For MIPS,
   /// this occurs when a vector type is used, as vectors are passed through
   /// the integer register set. The default ignores the calling convention
   /// and defers to getNumRegisters.
   virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context,
                                                  CallingConv::ID CC,
                                                  EVT VT) const {
     const unsigned NumRegs = getNumRegisters(Context, VT);
     return NumRegs;
   }
 
   /// Certain targets have context-sensitive alignment requirements, where
   /// one type has the alignment requirement of another type. The default
   /// simply returns the ABI alignment that \p DL reports for \p ArgTy.
   /// Note that \p DL is received by value.
   virtual unsigned getABIAlignmentForCallingConv(Type *ArgTy,
                                                  DataLayout DL) const {
     const unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
     return ABIAlign;
   }
 
   /// When this answers true, instruction selection should seek to shrink an
   /// FP constant of the specified type to a smaller type in order to save
   /// space and / or reduce runtime. The default answers true for every type.
   virtual bool ShouldShrinkFPConstant(EVT /*VT*/) const {
     return true;
   }
 
   /// Return true if it is profitable to reduce the given load node to a
   /// smaller type.
   ///
   /// e.g. (i16 (trunc (i32 (load x)))) -> (i16 (load x)) should be performed.
   /// The default considers every such reduction profitable.
   virtual bool shouldReduceLoadWidth(SDNode *Load,
                                      ISD::LoadExtType ExtTy,
                                      EVT NewVT) const {
     return true;
   }
 
   /// When a value of the specified type is split into parts, say whether the
   /// Hi part comes first. This usually follows the endianness, except for
   /// ppcf128, where the Hi part always comes first.
   bool hasBigEndianPartOrdering(EVT VT, const DataLayout &DL) const {
     if (VT == MVT::ppcf128)
       return true;
     return DL.isBigEndian();
   }
 
   /// Answer whether the target has custom DAG combine transformations that
   /// it can perform for node type \p NT. TargetDAGCombineArray is read as a
   /// bit set with one bit per node type, eight node types per element.
   bool hasTargetDAGCombine(ISD::NodeType NT) const {
     assert(unsigned(NT >> 3) < array_lengthof(TargetDAGCombineArray));
     const auto Bits = TargetDAGCombineArray[NT >> 3];
     const int BitMask = 1 << (NT&7);
     return (Bits & BitMask) != 0;
   }
 
   /// Return the target's GatherAllAliasesMaxDepth setting.
   /// NOTE(review): presumably a depth limit for an alias-gathering search;
   /// confirm against the member's documentation.
   unsigned getGatherAllAliasesMaxDepth() const { return GatherAllAliasesMaxDepth; }
 
   /// Returns the size of the platform's va_list object.
   virtual unsigned getVaListSizeInBits(const DataLayout &DL) const {
     return getPointerTy(DL).getSizeInBits();
   }
 
   /// Get maximum # of store operations permitted for llvm.memset
   ///
   /// Returns the maximum number of store operations that may replace a call
   /// to llvm.memset. The target sets the value at the performance threshold
   /// for such a replacement. If OptSize is true, the limit for functions that
   /// have the OptSize attribute is returned instead.
   unsigned getMaxStoresPerMemset(bool OptSize) const {
     if (OptSize)
       return MaxStoresPerMemsetOptSize;
     return MaxStoresPerMemset;
   }
 
   /// Get maximum # of store operations permitted for llvm.memcpy
   ///
   /// Returns the maximum number of store operations that may replace a call
   /// to llvm.memcpy. The target sets the value at the performance threshold
   /// for such a replacement. If OptSize is true, the limit for functions that
   /// have the OptSize attribute is returned instead.
   unsigned getMaxStoresPerMemcpy(bool OptSize) const {
     if (OptSize)
       return MaxStoresPerMemcpyOptSize;
     return MaxStoresPerMemcpy;
   }
 
   /// Get maximum # of store operations to be glued together
   ///
   /// Returns the maximum number of store operations that may be glued
   /// together during lowering of llvm.memcpy. The target sets the value at
   /// the performance threshold for such a replacement.
   virtual unsigned getMaxGluedStoresPerMemcpy() const { return MaxGluedStoresPerMemcpy; }
 
   /// Get maximum # of load operations permitted for memcmp
   ///
   /// Returns the maximum number of load operations that may replace a call
   /// to memcmp. The target sets the value at the performance threshold for
   /// such a replacement. If OptSize is true, the limit for functions that
   /// have the OptSize attribute is returned instead.
   unsigned getMaxExpandSizeMemcmp(bool OptSize) const {
     if (OptSize)
       return MaxLoadsPerMemcmpOptSize;
     return MaxLoadsPerMemcmp;
   }
 
   /// For memcmp expansion when the memcmp result is only compared equal or
   /// not-equal to 0, allow up to this number of load pairs per block. As an
   /// example, a larger value may allow 'memcmp(a, b, 3) == 0' in a single
   /// block:
   ///   a0 = load2bytes &a[0]
   ///   b0 = load2bytes &b[0]
   ///   a2 = load1byte  &a[2]
   ///   b2 = load1byte  &b[2]
   ///   r  = cmp eq (a0 ^ b0 | a2 ^ b2), 0
   /// The default allows one load pair per block.
   virtual unsigned getMemcmpEqZeroLoadsPerBlock() const { return 1; }
 
   /// Get maximum # of store operations permitted for llvm.memmove
   ///
   /// Returns the maximum number of store operations that may replace a call
   /// to llvm.memmove. The target sets the value at the performance threshold
   /// for such a replacement. If OptSize is true, the limit for functions that
   /// have the OptSize attribute is returned instead.
   unsigned getMaxStoresPerMemmove(bool OptSize) const {
     if (OptSize)
       return MaxStoresPerMemmoveOptSize;
     return MaxStoresPerMemmove;
   }
 
   /// Determine if the target supports unaligned memory accesses.
   ///
   /// An override returns true if the target allows unaligned memory accesses
   /// of the specified type in the given address space. If true, it also
   /// reports through the last (pointer) argument whether the unaligned
   /// access is "fast". This is used, for example, where an array
   /// copy/move/set is converted to a sequence of store operations, to make
   /// sure such replacements don't generate code that causes an alignment
   /// error (trap) on the target machine. The default allows no misaligned
   /// access.
   virtual bool allowsMisalignedMemoryAccesses(EVT /*VT*/,
                                               unsigned AddrSpace = 0,
                                               unsigned Align = 1,
                                               bool * /*Fast*/ = nullptr) const {
     return false;
   }
 
   /// Return true if the target supports a memory access of this type for the
   /// given address space and alignment. If the access is allowed, the optional
   /// final parameter returns if the access is also fast (as defined by the
   /// target).
   ///
   /// \param VT the type of the memory access.
   /// \param AddrSpace the address space of the access (defaults to 0).
   /// \param Alignment the alignment of the access (defaults to 1).
   /// \param Fast optional out-parameter; may be left as nullptr.
   ///
   /// Declared here only; the implementation lives out of line.
   bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
                           unsigned AddrSpace = 0, unsigned Alignment = 1,
                           bool *Fast = nullptr) const;
 
   /// Returns the target specific optimal type for load and store operations
   /// as a result of memset, memcpy, and memmove lowering.
   ///
   /// If DstAlign is zero, the destination alignment can satisfy any
   /// constraint. Similarly, if SrcAlign is zero there is no need to check it
   /// against an alignment requirement, probably because the source does not
   /// need to be loaded. If 'IsMemset' is true, a memset is being expanded.
   /// If 'ZeroMemset' is true, it is a memset of zero. 'MemcpyStrSrc'
   /// indicates whether the memcpy source is constant so it does not need to
   /// be loaded. MVT::Other is returned if the type should be determined
   /// using generic target-independent logic, which is what the default does.
   virtual EVT getOptimalMemOpType(uint64_t /*Size*/,
                                   unsigned /*DstAlign*/,
                                   unsigned /*SrcAlign*/,
                                   bool /*IsMemset*/,
                                   bool /*ZeroMemset*/,
                                   bool /*MemcpyStrSrc*/,
                                   MachineFunction & /*MF*/) const {
     return MVT::Other;
   }
 
   /// Returns true if it's safe to use load / store of the specified type to
   /// expand memcpy / memset inline.
   ///
   /// This holds for most types, with some special cases. For example, on X86
   /// targets without SSE2, f64 load / store are done with fldl / fstpl, which
   /// also perform type conversion. The specified type doesn't have to be
   /// legal, as the hook is used before type legalization. The default
   /// considers every type safe.
   virtual bool isSafeMemOpType(MVT /*VT*/) const {
     return true;
   }
 
   /// Answer whether llvm.setjmp should be implemented with _setjmp (true)
   /// rather than setjmp (false).
   bool usesUnderscoreSetJmp() const { return UseUnderscoreSetJmp; }
 
   /// Answer whether llvm.longjmp should be implemented with _longjmp (true)
   /// rather than longjmp (false).
   bool usesUnderscoreLongJmp() const { return UseUnderscoreLongJmp; }
 
   /// Return lower limit for number of blocks in a jump table.
   /// Virtual so that targets can override it; the default implementation is
   /// defined out of line.
   virtual unsigned getMinimumJumpTableEntries() const;
 
   /// Return lower limit of the density in a jump table.
   /// isSuitableForJumpTable compares the result as a percentage
   /// (NumCases * 100 >= Range * density).
   ///
   /// \param OptForSize whether the query is made for a function that is
   /// optimized for size.
   unsigned getMinimumJumpTableDensity(bool OptForSize) const;
 
   /// Return upper limit for number of entries in a jump table.
   /// Zero if no limit; isSuitableForJumpTable treats a zero result as
   /// UINT_MAX.
   unsigned getMaximumJumpTableSize() const;
 
   /// Answer whether jump tables are "relative" on this target. The default
   /// mirrors whether the target machine is position independent.
   /// NOTE(review): presumably "relative" means table entries are stored as
   /// offsets rather than absolute addresses; confirm with the users of this
   /// hook.
   virtual bool isJumpTableRelative() const {
     const bool IsPIC = TM.isPositionIndependent();
     return IsPIC;
   }
 
   /// If the result is a physical register, it is the register that
   /// llvm.savestack/llvm.restorestack should save and restore.
   unsigned getStackPointerRegisterToSaveRestore() const { return StackPointerRegisterToSaveRestore; }
 
   /// If the result is a physical register, it is the register that receives
   /// the exception address on entry to an EH pad. The default reports no
   /// register.
   virtual unsigned
   getExceptionPointerRegister(const Constant *PersonalityFn) const {
     // 0 is guaranteed to be the NoRegister value on all targets
     const unsigned NoRegister = 0;
     return NoRegister;
   }
 
   /// If the result is a physical register, it is the register that receives
   /// the exception typeid on entry to a landing pad. The default reports no
   /// register.
   virtual unsigned
   getExceptionSelectorRegister(const Constant *PersonalityFn) const {
     // 0 is guaranteed to be the NoRegister value on all targets
     const unsigned NoRegister = 0;
     return NoRegister;
   }
 
   /// Funclet EH hook. The default implementation does not return a value: it
   /// reports a fatal error saying that funclet EH is not implemented for this
   /// target.
   /// NOTE(review): presumably targets that support funclet-based EH override
   /// this to say whether catch objects need fixed stack slots; confirm.
   virtual bool needsFixedCatchObjects() const {
     report_fatal_error("Funclet EH is not implemented for this target");
   }
 
   /// Returns the size in bytes of the target's jmp_buf (if never set, the
   /// default is 200).
   unsigned getJumpBufSize() const { return JumpBufSize; }
 
   /// Returns the alignment in bytes of the target's jmp_buf (if never set,
   /// the default is 0).
   unsigned getJumpBufAlignment() const { return JumpBufAlignment; }
 
   /// Return the minimum alignment that an argument has on the stack.
   unsigned getMinStackArgumentAlignment() const { return MinStackArgumentAlignment; }
 
   /// Return the minimum alignment of a function on this target.
   unsigned getMinFunctionAlignment() const { return MinFunctionAlignment; }
 
   /// Return the preferred alignment of a function on this target.
   unsigned getPrefFunctionAlignment() const { return PrefFunctionAlignment; }
 
   /// Return the preferred alignment of a loop. The default ignores \p ML and
   /// returns the target-wide PrefLoopAlignment setting.
   virtual unsigned getPrefLoopAlignment(MachineLoop *ML = nullptr) const { return PrefLoopAlignment; }
 
   /// If the target has a standard location for the stack protector guard,
   /// returns the address of that location. Otherwise, returns nullptr.
   /// DEPRECATED: please override useLoadStackGuardNode and customize
   ///             LOAD_STACK_GUARD, or customize \@llvm.stackguard().
   ///
   /// \param IRB IR builder handed to the hook.
   /// NOTE(review): presumably used to emit any IR needed to form the
   /// address; confirm against the out-of-line definition.
   virtual Value *getIRStackGuard(IRBuilder<> &IRB) const;
 
   /// Inserts necessary declarations for SSP (stack protection) purpose.
   /// Should be used only when getIRStackGuard returns nullptr.
   ///
   /// \param M the module that receives the declarations.
   virtual void insertSSPDeclarations(Module &M) const;
 
   /// Return the variable that was previously inserted by
   /// insertSSPDeclarations, if any, otherwise return nullptr. Should be used
   /// only when getIRStackGuard returns nullptr.
   ///
   /// \param M the module to look the variable up in.
   virtual Value *getSDagStackGuard(const Module &M) const;
 
   /// When this answers true, stack protection checks should XOR the frame
   /// pointer (or whichever pointer is used to address locals) into the stack
   /// guard value before checking it. getIRStackGuard must return nullptr if
   /// this answers true. The default answers false.
   virtual bool useStackGuardXorFP() const {
     return false;
   }
 
   /// If the target has a standard stack protection check function that
   /// performs validation and error handling, returns the function. Otherwise,
   /// returns nullptr. Must be previously inserted by insertSSPDeclarations.
   /// Should be used only when getIRStackGuard returns nullptr.
   ///
   /// \param M the module to look the check function up in.
   virtual Value *getSSPStackGuardCheck(const Module &M) const;
 
 protected:
   /// Protected helper, defined out of line.
   /// NOTE(review): presumably returns the default location of the unsafe
   /// stack pointer for use by getSafeStackPointerLocation implementations,
   /// with \p UseTLS selecting a thread-local location; confirm against the
   /// definition.
   Value *getDefaultSafeStackPointerLocation(IRBuilder<> &IRB,
                                             bool UseTLS) const;
 
 public:
   /// Returns the target-specific address of the unsafe stack pointer.
   ///
   /// \param IRB IR builder handed to the hook.
   /// NOTE(review): presumably used to emit the IR that computes the address;
   /// confirm against the out-of-line definition.
   virtual Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const;
 
   /// Returns the name of the symbol used to emit stack probes, or the empty
   /// string if not applicable. The default returns the empty string.
   virtual StringRef getStackProbeSymbolName(MachineFunction &MF) const { return ""; }
 
   /// Answer whether a cast between address spaces \p SrcAS and \p DestAS is
   /// a no-op. The default treats no address space cast as a no-op.
   virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { return false; }
 
   /// Answer whether a cast from \p SrcAS to \p DestAS is "cheap", such that
   /// e.g. we are happy to sink it into basic blocks. By default a cast is
   /// cheap exactly when isNoopAddrSpaceCast says it is a no-op.
   virtual bool isCheapAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
     const bool IsNoop = isNoopAddrSpaceCast(SrcAS, DestAS);
     return IsNoop;
   }
 
   /// Answer whether the pointer arguments to CI should be aligned by aligning
   /// the object whose address is being passed. An override that answers true
   /// sets MinSize to the minimum size the object must have to be aligned and
   /// PrefAlign to the preferred alignment. The default answers false and
   /// leaves both outputs untouched.
   virtual bool shouldAlignPointerArgs(CallInst * /*CI*/,
                                       unsigned & /*MinSize*/,
                                       unsigned & /*PrefAlign*/) const {
     return false;
   }
 
   //===--------------------------------------------------------------------===//
   /// \name Helpers for TargetTransformInfo implementations
   /// @{
 
   /// Get the ISD node that corresponds to the Instruction class opcode.
   ///
   /// \param Opcode an opcode of the IR Instruction class.
   /// \returns the corresponding ISD opcode, as an int.
   int InstructionOpcodeToISD(unsigned Opcode) const;
 
   /// Estimate the cost of type-legalization and the legalized type.
   ///
   /// \param DL the data layout to use.
   /// \param Ty the IR type to legalize.
   /// \returns a pair holding the estimated cost and the legalized MVT.
   std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL,
                                               Type *Ty) const;
 
   /// @}
 
   //===--------------------------------------------------------------------===//
   /// \name Helpers for atomic expansion.
   /// @{
 
   /// Returns the maximum atomic operation size (in bits) supported by
   /// the backend. Atomic operations greater than this size (as well
   /// as ones that are not naturally aligned), will be expanded by
   /// AtomicExpandPass into an __atomic_* library call.
   ///
   /// The value is configured with setMaxAtomicSizeInBitsSupported.
   unsigned getMaxAtomicSizeInBitsSupported() const {
     return MaxAtomicSizeInBitsSupported;
   }
 
   /// Returns the size (in bits) of the smallest cmpxchg or ll/sc instruction
   /// the backend supports.  Any smaller operations are widened in
   /// AtomicExpandPass.
   ///
   /// Note that *unlike* operations above the maximum size, atomic ops
   /// are still natively supported below the minimum; they just
   /// require a more complex expansion.
   ///
   /// The value is configured with setMinCmpXchgSizeInBits.
   unsigned getMinCmpXchgSizeInBits() const { return MinCmpXchgSizeInBits; }
 
   /// Whether the target supports unaligned atomic operations. The value is
   /// configured with setSupportsUnalignedAtomics.
   bool supportsUnalignedAtomics() const { return SupportsUnalignedAtomics; }
 
   /// Whether AtomicExpandPass should automatically insert fences and reduce
   /// ordering for this atomic. This should be true for most architectures with
   /// weak memory ordering. Defaults to false.
   ///
   /// \param I the atomic instruction being queried.
   virtual bool shouldInsertFencesForAtomic(const Instruction *I) const {
     return false;
   }
 
   /// Perform a load-linked operation on Addr, returning a "Value *" with the
   /// corresponding pointee type. This may entail some non-trivial operations to
   /// truncate or reconstruct types that will be illegal in the backend. See
   /// ARMISelLowering for an example implementation.
   ///
   /// The default implementation is llvm_unreachable, so a target must
   /// override this hook before it can be called.
   virtual Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                 AtomicOrdering Ord) const {
     llvm_unreachable("Load linked unimplemented on this target");
   }
 
   /// Perform a store-conditional operation to Addr. Return the status of the
   /// store. This should be 0 if the store succeeded, non-zero otherwise.
   ///
   /// The default implementation is llvm_unreachable, so a target must
   /// override this hook before it can be called.
   virtual Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                       Value *Addr, AtomicOrdering Ord) const {
     llvm_unreachable("Store conditional unimplemented on this target");
   }
 
   /// Inserts in the IR a target-specific intrinsic specifying a fence.
   /// AtomicExpandPass calls this before expanding an
   ///   AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad
   /// whenever shouldInsertFencesForAtomic returns true.
   ///
   /// Inst is the original atomic instruction, prior to other expansions that
   /// may be performed.
   ///
   /// The result is either nullptr or a pointer to an IR-level Instruction*.
   /// Even complex fence sequences can be represented by a single Instruction*
   /// through an intrinsic to be lowered later. Backends should override this
   /// method to produce target-specific intrinsics for their fences.
   ///
   /// FIXME: Please note that the default implementation here in terms of
   ///   IR-level fences exists for historical/compatibility reasons and is
   ///   *unsound* ! Fences cannot, in general, be used to restore sequential
   ///   consistency. For example, consider the following:
   /// atomic<int> x = y = 0;
   /// int r1, r2, r3, r4;
   /// Thread 0:
   ///   x.store(1);
   /// Thread 1:
   ///   y.store(1);
   /// Thread 2:
   ///   r1 = x.load();
   ///   r2 = y.load();
   /// Thread 3:
   ///   r3 = y.load();
   ///   r4 = x.load();
   ///  r1 = r3 = 1 and r2 = r4 = 0 is impossible as long as the accesses are all
   ///  seq_cst. But if they are lowered to monotonic accesses, no amount of
   ///  IR-level fences can prevent it.
   /// @{
   virtual Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
                                         AtomicOrdering Ord) const {
     // No leading fence unless the ordering is release-or-stronger and the
     // instruction has an atomic store.
     if (!isReleaseOrStronger(Ord) || !Inst->hasAtomicStore())
       return nullptr;
     return Builder.CreateFence(Ord);
   }
 
   /// Counterpart of emitLeadingFence for the fence placed after the atomic
   /// instruction. The default creates an IR-level fence with ordering \p Ord
   /// when \p Ord is acquire-or-stronger and returns nullptr otherwise.
   virtual Instruction *emitTrailingFence(IRBuilder<> &Builder,
                                          Instruction *Inst,
                                          AtomicOrdering Ord) const {
     if (!isAcquireOrStronger(Ord))
       return nullptr;
     return Builder.CreateFence(Ord);
   }
   /// @}
 
   /// Emits code that executes when the comparison result in the ll/sc
   /// expansion of a cmpxchg instruction is such that the store-conditional will
   /// not execute.  This makes it possible to balance out the load-linked with
   /// a dedicated instruction, if desired.
   /// E.g., on ARM, if ldrex isn't followed by strex, the exclusive monitor would
   /// be unnecessarily held, except if clrex, inserted by this hook, is executed.
   ///
   /// The default implementation emits nothing.
   virtual void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const {}
 
   /// Returns true if the given (atomic) store should be expanded by the
   /// IR-level AtomicExpand pass into an "atomic xchg" which ignores its input.
   /// The default implementation returns false.
   virtual bool shouldExpandAtomicStoreInIR(StoreInst *SI) const {
     return false;
   }
 
   /// Returns true if arguments should be sign-extended in lib calls.
   ///
   /// The default implementation ignores \p Type and simply returns
   /// \p IsSigned.
   virtual bool shouldSignExtendTypeInLibCall(EVT Type, bool IsSigned) const {
     return IsSigned;
   }
 
   /// Returns how the given (atomic) load should be expanded by the
   /// IR-level AtomicExpand pass. The default implementation returns
   /// AtomicExpansionKind::None.
   virtual AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const {
     return AtomicExpansionKind::None;
   }
 
   /// Returns true if the given atomic cmpxchg should be expanded by the
   /// IR-level AtomicExpand pass into a load-linked/store-conditional sequence
   /// (through emitLoadLinked() and emitStoreConditional()). The default
   /// implementation returns false.
   virtual bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const {
     return false;
   }
 
   /// Returns how the IR-level AtomicExpand pass should expand the given
   /// AtomicRMW, if at all. Default is to never expand, i.e. to return
   /// AtomicExpansionKind::None.
   virtual AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const {
     return AtomicExpansionKind::None;
   }
 
   /// On some platforms, an AtomicRMW that never actually modifies the value
   /// (such as fetch_add of 0) can be turned into a fence followed by an
   /// atomic load. This may sound useless, but it makes it possible for the
   /// processor to keep the cacheline shared, dramatically improving
   /// performance. And such idempotent RMWs are useful for implementing some
   /// kinds of locks, see for example (justification + benchmarks):
   /// http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf
   /// This method tries doing that transformation, returning the atomic load if
   /// it succeeds, and nullptr otherwise.
   /// If shouldExpandAtomicLoadInIR requests an expansion for that load, it
   /// will undergo another round of expansion.
   ///
   /// The default implementation performs no transformation and returns
   /// nullptr.
   virtual LoadInst *
   lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *RMWI) const {
     return nullptr;
   }
 
   /// Returns how the platform's atomic operations are extended (ZERO_EXTEND,
   /// SIGN_EXTEND, or ANY_EXTEND). The default implementation returns
   /// ISD::ZERO_EXTEND.
   virtual ISD::NodeType getExtendForAtomicOps() const {
     return ISD::ZERO_EXTEND;
   }
 
   /// @}
 
   /// Returns true if we should normalize
   /// select(N0&N1, X, Y) => select(N0, select(N1, X, Y), Y) and
   /// select(N0|N1, X, Y) => select(N0, X, select(N1, X, Y)) if it is likely
   /// that it saves us from materializing N0 and N1 in an integer register.
   /// Targets that are able to perform and/or on flags should return false here.
   virtual bool shouldNormalizeToSelectSequence(LLVMContext &Context,
                                                EVT VT) const {
     // A target with multiple condition registers likely has logical
     // operations on those registers, so keep the and/or form.
     if (hasMultipleConditionRegisters())
       return false;
     // The transform is only worthwhile when the value stays in a single
     // register, i.e. the type is neither expanded nor split.
     switch (getTypeAction(Context, VT)) {
     case TypeExpandInteger:
     case TypeExpandFloat:
     case TypeSplitVector:
       return false;
     default:
       return true;
     }
   }
 
   /// Return true if a select of constants (select Cond, C1, C2) should be
   /// transformed into simple math ops with the condition value. For example:
   /// select Cond, C1, C1-1 --> add (zext Cond), C1-1
   ///
   /// The default implementation returns false for every \p VT.
   virtual bool convertSelectOfConstantsToMath(EVT VT) const {
     return false;
   }
 
   //===--------------------------------------------------------------------===//
   // TargetLowering Configuration Methods - These methods should be invoked by
   // the derived class constructor to configure this object for the target.
   //
 protected:
   /// Specify how the target extends the result of integer and floating point
   /// boolean values from i1 to a wider type.  See getBooleanContents.
   ///
   /// This single-argument form applies \p Ty to both the integer and the
   /// floating point setting.
   void setBooleanContents(BooleanContent Ty) {
     setBooleanContents(Ty, Ty);
   }
 
   /// Specify how the target extends the result of integer and floating point
   /// boolean values from i1 to a wider type.  See getBooleanContents.
   ///
   /// \param IntTy the setting stored in BooleanContents.
   /// \param FloatTy the setting stored in BooleanFloatContents.
   void setBooleanContents(BooleanContent IntTy, BooleanContent FloatTy) {
     BooleanContents = IntTy;
     BooleanFloatContents = FloatTy;
   }
 
   /// Specify how the target extends the result of a vector boolean value from a
   /// vector of i1 to a wider type.  See getBooleanContents.
   ///
   /// \param Ty the setting stored in BooleanVectorContents.
   void setBooleanVectorContents(BooleanContent Ty) {
     BooleanVectorContents = Ty;
   }
 
   /// Specify the target scheduling preference.
   ///
   /// \param Pref the preference stored in SchedPreferenceInfo.
   void setSchedulingPreference(Sched::Preference Pref) {
     SchedPreferenceInfo = Pref;
   }
 
   /// Indicate whether this target prefers to use _setjmp to implement
   /// llvm.setjmp or the version without _.  Defaults to false.
   ///
   /// \param Val true to prefer the underscore-prefixed _setjmp.
   void setUseUnderscoreSetJmp(bool Val) {
     UseUnderscoreSetJmp = Val;
   }
 
   /// Indicate whether this target prefers to use _longjmp to implement
   /// llvm.longjmp or the version without _.  Defaults to false.
   ///
   /// \param Val true to prefer the underscore-prefixed _longjmp.
   void setUseUnderscoreLongJmp(bool Val) {
     UseUnderscoreLongJmp = Val;
   }
 
   /// Indicate the minimum number of blocks to generate jump tables.
   ///
   /// \param Val the new minimum. Defined out of line.
   void setMinimumJumpTableEntries(unsigned Val);
 
   /// Indicate the maximum number of entries in jump tables.
   /// Set to zero to generate unlimited jump tables.
   /// Defined out of line.
   void setMaximumJumpTableSize(unsigned);
 
   /// If set to a physical register, this specifies the register that
   /// llvm.stacksave/llvm.stackrestore should save and restore.
   ///
   /// \param R the register number stored in
   /// StackPointerRegisterToSaveRestore.
   void setStackPointerRegisterToSaveRestore(unsigned R) {
     StackPointerRegisterToSaveRestore = R;
   }
 
   /// Tells the code generator that the target has multiple (allocatable)
   /// condition registers that can be used to store the results of comparisons
   /// for use by selects and conditional branches. With multiple condition
   /// registers, the code generator will not aggressively sink comparisons into
   /// the blocks of their users.
   ///
   /// \param hasManyRegs the new setting; defaults to true when omitted.
   void setHasMultipleConditionRegisters(bool hasManyRegs = true) {
     HasMultipleConditionRegisters = hasManyRegs;
   }
 
   /// Tells the code generator that the target has BitExtract instructions.
   /// The code generator will aggressively sink "shift"s into the blocks of
   /// their users if the users will generate "and" instructions which can be
   /// combined with "shift" to BitExtract instructions.
   ///
   /// \param hasExtractInsn the new setting; defaults to true when omitted.
   void setHasExtractBitsInsn(bool hasExtractInsn = true) {
     HasExtractBitsInsn = hasExtractInsn;
   }
 
   /// Tells the code generator not to expand logic operations on comparison
   /// predicates into separate sequences that increase the amount of flow
   /// control.
   ///
   /// \param isExpensive the new setting; defaults to true when omitted.
   /// Defined out of line.
   void setJumpIsExpensive(bool isExpensive = true);
 
   /// Tells the code generator that this target supports floating point
   /// exceptions and cares about preserving floating point exception behavior.
   ///
   /// \param FPExceptions the new setting; defaults to true when omitted.
   void setHasFloatingPointExceptions(bool FPExceptions = true) {
     HasFloatingPointExceptions = FPExceptions;
   }
 
   /// Tells the code generator which bitwidths to bypass.
   ///
   /// Records \p FastBitWidth in BypassSlowDivWidths under the key
   /// \p SlowBitWidth; a later call with the same \p SlowBitWidth overwrites
   /// the earlier entry.
   void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth) {
     BypassSlowDivWidths[SlowBitWidth] = FastBitWidth;
   }
 
   /// Register \p RC as an available register class for value type \p VT,
   /// which tells the selector that it can handle values of that class
   /// natively.
   void addRegisterClass(MVT VT, const TargetRegisterClass *RC) {
     const unsigned TypeIdx = (unsigned)VT.SimpleTy;
     assert(TypeIdx < array_lengthof(RegClassForVT));
     RegClassForVT[TypeIdx] = RC;
   }
 
   /// Return the largest legal super-reg register class of the register class
   /// for the specified type and its associated "cost".
   ///
   /// \returns a pair of the register class and its cost.
   virtual std::pair<const TargetRegisterClass *, uint8_t>
   findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const;
 
   /// Once all of the register classes are added (see addRegisterClass), this
   /// allows us to compute derived properties we expose.
   void computeRegisterProperties(const TargetRegisterInfo *TRI);
 
   /// Record in the OpActions table what the legalizer should do with
   /// operation \p Op on type \p VT when the operation does not work as-is.
   /// Note that VT may refer to either the type of a result or that of an
   /// operand of Op.
   void setOperationAction(unsigned Op, MVT VT,
                           LegalizeAction Action) {
     assert(Op < array_lengthof(OpActions[0]) && "Table isn't big enough!");
     const unsigned TypeIdx = (unsigned)VT.SimpleTy;
     OpActions[TypeIdx][Op] = Action;
   }
 
   /// Indicate that the specified load with extension does not work with the
   /// specified type and indicate what to do about it.
   ///
   /// Each (ValVT, MemVT) entry of LoadExtActions packs one 4-bit action per
   /// extension type; only the field selected by \p ExtType is rewritten.
   void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT,
                         LegalizeAction Action) {
     assert(ExtType < ISD::LAST_LOADEXT_TYPE && ValVT.isValid() &&
            MemVT.isValid() && "Table isn't big enough!");
     assert((unsigned)Action < 0x10 && "too many bits for bitfield array");
     const unsigned Shift = 4 * ExtType;
     auto &Entry = LoadExtActions[ValVT.SimpleTy][MemVT.SimpleTy];
     // Clear the old 4-bit field, then store the new action in its place.
     Entry &= ~((uint16_t)0xF << Shift);
     Entry |= (uint16_t)Action << Shift;
   }
 
   /// Indicate that the specified truncating store does not work with the
   /// specified type and indicate what to do about it.
   ///
   /// The action is stored in TruncStoreActions, indexed by \p ValVT and then
   /// \p MemVT; both types must be valid.
   void setTruncStoreAction(MVT ValVT, MVT MemVT,
                            LegalizeAction Action) {
     assert(ValVT.isValid() && MemVT.isValid() && "Table isn't big enough!");
     TruncStoreActions[(unsigned)ValVT.SimpleTy][MemVT.SimpleTy] = Action;
   }
 
   /// Indicate that the specified indexed load does or does not work with the
   /// specified type and indicate what to do about it.
   ///
   /// NOTE: All indexed mode loads are initialized to Expand in
   /// TargetLowering.cpp
   void setIndexedLoadAction(unsigned IdxMode, MVT VT,
                             LegalizeAction Action) {
     assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
            (unsigned)Action < 0xf && "Table isn't big enough!");
     auto &Entry = IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode];
     // Load actions live in the upper four bits; the lower four bits (store
     // actions) are left alone.
     Entry &= ~0xf0;
     Entry |= ((uint8_t)Action) << 4;
   }
 
   /// Indicate that the specified indexed store does or does not work with the
   /// specified type and indicate what to do about it.
   ///
   /// NOTE: All indexed mode stores are initialized to Expand in
   /// TargetLowering.cpp
   void setIndexedStoreAction(unsigned IdxMode, MVT VT,
                              LegalizeAction Action) {
     assert(VT.isValid() && IdxMode < ISD::LAST_INDEXED_MODE &&
            (unsigned)Action < 0xf && "Table isn't big enough!");
     auto &Entry = IndexedModeActions[(unsigned)VT.SimpleTy][IdxMode];
     // Store actions live in the lower four bits; the upper four bits (load
     // actions) are left alone.
     Entry &= ~0x0f;
     Entry |= ((uint8_t)Action);
   }
 
   /// Indicate that the specified condition code is or isn't supported on the
   /// target and indicate what to do about it.
   void setCondCodeAction(ISD::CondCode CC, MVT VT,
                          LegalizeAction Action) {
     assert(VT.isValid() && (unsigned)CC < array_lengthof(CondCodeActions) &&
            "Table isn't big enough!");
     assert((unsigned)Action < 0x10 && "too many bits for bitfield array");
     // Each 32-bit word holds eight 4-bit actions. The low 3 bits of SimpleTy
     // pick the 4-bit field inside a word; the remaining upper bits pick the
     // word in the second dimension of the array.
     const uint32_t Shift = 4 * (VT.SimpleTy & 0x7);
     auto &Word = CondCodeActions[CC][VT.SimpleTy >> 3];
     Word &= ~((uint32_t)0xF << Shift);
     Word |= (uint32_t)Action << Shift;
   }
 
   /// If Opc/OrigVT is specified as being promoted, the promotion code defaults
   /// to trying a larger integer/fp until it can find one that works. If that
   /// default is insufficient, this method can be used by the target to override
   /// the default.
   ///
   /// The override is stored in PromoteToType under the key (Opc, OrigVT); a
   /// later call with the same key replaces the earlier destination type.
   void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) {
     PromoteToType[std::make_pair(Opc, OrigVT.SimpleTy)] = DestVT.SimpleTy;
   }
 
   /// Convenience method to set an operation to Promote and specify the type
   /// in a single call. Equivalent to calling setOperationAction with Promote
   /// followed by AddPromotedToType.
   void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT) {
     setOperationAction(Opc, OrigVT, Promote);
     AddPromotedToType(Opc, OrigVT, DestVT);
   }
 
   /// Targets should invoke this method for each target independent node that
   /// they want to provide a custom DAG combiner for by implementing the
   /// PerformDAGCombine virtual method.
   ///
   /// TargetDAGCombineArray is used as a bit set: node type \p NT maps to bit
   /// (NT & 7) of element (NT >> 3).
   void setTargetDAGCombine(ISD::NodeType NT) {
     const unsigned ByteIdx = unsigned(NT >> 3);
     assert(ByteIdx < array_lengthof(TargetDAGCombineArray));
     TargetDAGCombineArray[ByteIdx] |= 1 << (NT & 7);
   }
 
   /// Set the target's required jmp_buf buffer size (in bytes); default is 200.
   ///
   /// \param Size the size stored in JumpBufSize.
   void setJumpBufSize(unsigned Size) {
     JumpBufSize = Size;
   }
 
   /// Set the target's required jmp_buf buffer alignment (in bytes); default is
   /// 0.
   ///
   /// \param Align the alignment stored in JumpBufAlignment.
   void setJumpBufAlignment(unsigned Align) {
     JumpBufAlignment = Align;
   }
 
   /// Set the target's minimum function alignment (in log2(bytes)).
   ///
   /// \param Align the log2 alignment stored in MinFunctionAlignment.
   void setMinFunctionAlignment(unsigned Align) {
     MinFunctionAlignment = Align;
   }
 
   /// Set the target's preferred function alignment (in log2(bytes)).  This
   /// should be set if there is a performance benefit to higher-than-minimum
   /// alignment.
   ///
   /// \param Align the log2 alignment stored in PrefFunctionAlignment.
   void setPrefFunctionAlignment(unsigned Align) {
     PrefFunctionAlignment = Align;
   }
 
   /// Set the target's preferred loop alignment. The default alignment is zero,
   /// which means the target does not care about loop alignment.  The alignment
   /// is specified in log2(bytes). The target may also override
   /// getPrefLoopAlignment to provide per-loop values.
   ///
   /// \param Align the log2 alignment stored in PrefLoopAlignment.
   void setPrefLoopAlignment(unsigned Align) {
     PrefLoopAlignment = Align;
   }
 
   /// Set the minimum stack alignment of an argument (in log2(bytes)).
   ///
   /// \param Align the log2 alignment stored in MinStackArgumentAlignment.
   void setMinStackArgumentAlignment(unsigned Align) {
     MinStackArgumentAlignment = Align;
   }
 
   /// Set the maximum atomic operation size (in bits) supported by the
   /// backend. Atomic operations greater than this size (as well as
   /// ones that are not naturally aligned), will be expanded by
   /// AtomicExpandPass into an __atomic_* library call.
   ///
   /// The value is read back through getMaxAtomicSizeInBitsSupported.
   void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits) {
     MaxAtomicSizeInBitsSupported = SizeInBits;
   }
 
   /// Sets the minimum cmpxchg or ll/sc size (in bits) supported by the
   /// backend. The value is read back through getMinCmpXchgSizeInBits.
   void setMinCmpXchgSizeInBits(unsigned SizeInBits) {
     MinCmpXchgSizeInBits = SizeInBits;
   }
 
   /// Sets whether unaligned atomic operations are supported. The value is
   /// read back through supportsUnalignedAtomics.
   void setSupportsUnalignedAtomics(bool UnalignedSupported) {
     SupportsUnalignedAtomics = UnalignedSupported;
   }
 
 public:
   //===--------------------------------------------------------------------===//
   // Addressing mode description hooks (used by LSR etc).
   //
 
   /// CodeGenPrepare sinks address calculations into the same BB as Load/Store
   /// instructions reading the address. This allows as much computation as
   /// possible to be done in the address mode for that operand. This hook lets
   /// targets also pass back when this should be done on intrinsics which
   /// load/store.
   ///
   /// The default implementation returns false and leaves the Ops and AccessTy
   /// out-parameters untouched.
   virtual bool getAddrModeArguments(IntrinsicInst * /*I*/,
                                     SmallVectorImpl<Value*> &/*Ops*/,
                                     Type *&/*AccessTy*/) const {
     return false;
   }
 
   /// This represents an addressing mode of:
   ///    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
   /// If BaseGV is null,  there is no BaseGV.
   /// If BaseOffs is zero, there is no base offset.
   /// If HasBaseReg is false, there is no base register.
   /// If Scale is zero, there is no ScaleReg.  Scale of 1 indicates a reg with
   /// no scale.
   ///
   /// A default-constructed AddrMode has none of the four components.
   struct AddrMode {
     /// Global value component; null means absent.
     GlobalValue *BaseGV = nullptr;
     /// Constant offset component; zero means absent.
     int64_t      BaseOffs = 0;
     /// Whether a base register is part of the address.
     bool         HasBaseReg = false;
     /// Multiplier applied to ScaleReg; zero means there is no ScaleReg.
     int64_t      Scale = 0;
     AddrMode() = default;
   };
 
   /// Return true if the addressing mode represented by AM is legal for this
   /// target, for a load/store of the specified type.
   ///
   /// The type may be VoidTy, in which case only return true if the addressing
   /// mode is legal for a load/store of any legal type.  TODO: Handle
   /// pre/postinc as well.
   ///
   /// If the address space cannot be determined, it will be -1.
   ///
   /// \p I is optional and defaults to nullptr.
   ///
   /// TODO: Remove default argument
   virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
                                      Type *Ty, unsigned AddrSpace,
                                      Instruction *I = nullptr) const;
 
   /// Return the cost of the scaling factor used in the addressing mode
   /// represented by AM for this target, for a load/store of the specified type.
   ///
   /// If the AM is supported, the return value must be >= 0.
   /// If the AM is not supported, it returns a negative value.
   /// TODO: Handle pre/postinc as well.
   /// TODO: Remove default argument
   virtual int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM,
                                    Type *Ty, unsigned AS = 0) const {
     // Default: any scaling factor used in a legal AM is assumed to be free
     // (cost 0); an illegal AM is reported as -1.
     return isLegalAddressingMode(DL, AM, Ty, AS) ? 0 : -1;
   }
 
   /// Return true if the specified immediate is legal icmp immediate, that is
   /// the target has icmp instructions which can compare a register against the
   /// immediate without having to materialize the immediate into a register.
   ///
   /// The default implementation accepts every immediate.
   virtual bool isLegalICmpImmediate(int64_t) const {
     return true;
   }
 
   /// Return true if the specified immediate is legal add immediate, that is the
   /// target has add instructions which can add a register with the immediate
   /// without having to materialize the immediate into a register.
   ///
   /// The default implementation accepts every immediate.
   virtual bool isLegalAddImmediate(int64_t) const {
     return true;
   }
 
   /// Return true if it's significantly cheaper to shift a vector by a uniform
   /// scalar than by an amount which will vary across each lane. On x86, for
   /// example, there is a "psllw" instruction for the former case, but no simple
   /// instruction for a general "a << b" operation on vectors.
   ///
   /// The default implementation returns false.
   virtual bool isVectorShiftByScalarCheap(Type *Ty) const {
     return false;
   }
 
   /// Returns true if the opcode is a commutative binary operation.
   virtual bool isCommutativeBinOp(unsigned Opcode) const {
     // FIXME: This should get its info from the td file.
     switch (Opcode) {
     // Integer add/multiply, including the carry and overflow variants.
     case ISD::ADD:
     case ISD::ADDC:
     case ISD::ADDE:
     case ISD::SADDO:
     case ISD::UADDO:
     case ISD::MUL:
     case ISD::MULHU:
     case ISD::MULHS:
     case ISD::SMUL_LOHI:
     case ISD::UMUL_LOHI:
     // Integer min/max.
     case ISD::SMIN:
     case ISD::SMAX:
     case ISD::UMIN:
     case ISD::UMAX:
     // Bitwise logic.
     case ISD::AND:
     case ISD::OR:
     case ISD::XOR:
     // Floating point.
     case ISD::FADD:
     case ISD::FMUL:
     case ISD::FMINNUM:
     case ISD::FMAXNUM:
     case ISD::FMINNAN:
     case ISD::FMAXNAN:
       return true;
     default:
       return false;
     }
   }
 
   /// Return true if it's free to truncate a value of type FromTy to type
   /// ToTy. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
   /// by referencing its sub-register AX.
   /// Targets must return false when FromTy <= ToTy.
   ///
   /// The default implementation returns false.
   virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const {
     return false;
   }
 
   /// Return true if a truncation from FromTy to ToTy is permitted when deciding
   /// whether a call is in tail position. Typically this means that both results
   /// would be assigned to the same register or stack slot, but it could mean
   /// the target performs adequate checks of its own before proceeding with the
   /// tail call.  Targets must return false when FromTy <= ToTy.
   ///
   /// The default implementation returns false.
   virtual bool allowTruncateForTailCall(Type *FromTy, Type *ToTy) const {
     return false;
   }
 
   /// EVT-based overload of isTruncateFree. The default implementation returns
   /// false.
   virtual bool isTruncateFree(EVT FromVT, EVT ToVT) const {
     return false;
   }
 
   /// Return true if it is profitable to hoist \p I. The default
   /// implementation returns true for every instruction.
   /// NOTE(review): the hoisting context this hook is consulted in is not
   /// visible here — confirm against its callers.
   virtual bool isProfitableToHoist(Instruction *I) const { return true; }
 
   /// Return true if the extension represented by \p I is free.
   /// Unlike the is[Z|FP]ExtFree family, which is based on types,
   /// this method can use the context provided by \p I to decide
   /// whether or not \p I is free.
   /// This method extends the behavior of the is[Z|FP]ExtFree family.
   /// In other words, if is[Z|FP]ExtFree returns true, then this method
   /// returns true as well. The converse is not true.
   /// The target can perform the adequate checks by overriding isExtFreeImpl.
   /// \pre \p I must be a sign, zero, or fp extension.
   bool isExtFree(const Instruction *I) const {
     // First ask the type-based hooks; sign extensions have no such hook.
     bool FreeByType = false;
     switch (I->getOpcode()) {
     case Instruction::FPExt:
       FreeByType = isFPExtFree(EVT::getEVT(I->getType()),
                                EVT::getEVT(I->getOperand(0)->getType()));
       break;
     case Instruction::ZExt:
       FreeByType = isZExtFree(I->getOperand(0)->getType(), I->getType());
       break;
     case Instruction::SExt:
       break;
     default:
       llvm_unreachable("Instruction is not an extension");
     }
     // Fall back to the instruction-aware hook only when the types alone did
     // not already say "free".
     return FreeByType || isExtFreeImpl(I);
   }
 
   /// Return true if \p Load and \p Ext can form an ExtLoad.
   /// For example, in AArch64
   ///   %L = load i8, i8* %ptr
   ///   %E = zext i8 %L to i32
   /// can be lowered into one load instruction
   ///   ldrb w0, [x0]
   bool isExtLoad(const LoadInst *Load, const Instruction *Ext,
                  const DataLayout &DL) const {
     EVT VT = getValueType(DL, Ext->getType());
     EVT LoadVT = getValueType(DL, Load->getType());
 
     // If the load has other users and the truncate is not free, the ext
     // probably isn't free.
     if (!Load->hasOneUse() && (isTypeLegal(LoadVT) || !isTypeLegal(VT)) &&
         !isTruncateFree(Ext->getType(), Load->getType()))
       return false;
 
     // Check whether the target supports casts folded into loads.
     unsigned LType;
     if (isa<ZExtInst>(Ext))
       LType = ISD::ZEXTLOAD;
     else {
       assert(isa<SExtInst>(Ext) && "Unexpected ext type!");
       LType = ISD::SEXTLOAD;
     }
 
     return isLoadExtLegal(LType, VT, LoadVT);
   }
 
   /// Return true if any actual instruction that defines a value of type FromTy
   /// implicitly zero-extends the value to ToTy in the result register.
   ///
   /// The function should return true when it is likely that the truncate can
   /// be freely folded with an instruction defining a value of FromTy. If
   /// the defining instruction is unknown (because you're looking at a
   /// function argument, PHI, etc.) then the target may require an
   /// explicit truncate, which is not necessarily free, but this function
   /// does not deal with those cases.
   /// Targets must return false when FromTy >= ToTy.
   ///
   /// The default implementation returns false.
   virtual bool isZExtFree(Type *FromTy, Type *ToTy) const {
     return false;
   }
 
   /// EVT-based overload of isZExtFree. The default implementation returns
   /// false.
   virtual bool isZExtFree(EVT FromTy, EVT ToTy) const {
     return false;
   }
 
   /// Return true if the target supplies and combines to a paired load
   /// two loaded values of type LoadedType next to each other in memory.
   /// RequiredAlignment gives the minimal alignment constraints that must be met
   /// to be able to select this paired load.
   ///
   /// This information is *not* used to generate actual paired loads, but it is
   /// used to generate a sequence of loads that is easier to combine into a
   /// paired load.
   /// For instance, something like this:
   /// a = load i64* addr
   /// b = trunc i64 a to i32
   /// c = lshr i64 a, 32
   /// d = trunc i64 c to i32
   /// will be optimized into:
   /// b = load i32* addr1
   /// d = load i32* addr2
   /// Where addr1 = addr2 +/- sizeof(i32).
   ///
   /// In other words, unless the target performs a post-isel load combining,
   /// this information should not be provided because it will generate more
   /// loads.
   ///
   /// The default implementation returns false and does not write
   /// RequiredAlignment.
   virtual bool hasPairedLoad(EVT /*LoadedType*/,
                              unsigned & /*RequiredAlignment*/) const {
     return false;
   }
 
   /// Return true if the target has a vector blend instruction. The default
   /// implementation returns false.
   virtual bool hasVectorBlend() const { return false; }
 
   /// Get the maximum supported factor for interleaved memory accesses.
   /// Defaults to the minimum interleave factor: 2.
   virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
 
   /// Lower an interleaved load to target specific intrinsics. Return
   /// true on success.
   ///
   /// \p LI is the vector load instruction.
   /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
   /// \p Indices is the corresponding indices for each shufflevector.
   /// \p Factor is the interleave factor.
   ///
   /// The default implementation performs no lowering and returns false.
   virtual bool lowerInterleavedLoad(LoadInst *LI,
                                     ArrayRef<ShuffleVectorInst *> Shuffles,
                                     ArrayRef<unsigned> Indices,
                                     unsigned Factor) const {
     return false;
   }
 
   /// Lower an interleaved store to target specific intrinsics. Return
   /// true on success.
   ///
   /// \p SI is the vector store instruction.
   /// \p SVI is the shufflevector to RE-interleave the stored vector.
   /// \p Factor is the interleave factor.
   ///
   /// The default implementation performs no lowering and returns false.
   virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                                      unsigned Factor) const {
     return false;
   }
 
   /// Return true if zero-extending the specific node Val to type VT2 is free
   /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
   /// because it's folded such as X86 zero-extending loads).
   ///
   /// The default implementation only looks at the value type of \p Val and
   /// forwards to the EVT-based isZExtFree.
   virtual bool isZExtFree(SDValue Val, EVT VT2) const {
     return isZExtFree(Val.getValueType(), VT2);
   }
 
   /// Return true if an fpext operation is free (for instance, because
   /// single-precision floating-point numbers are implicitly extended to
   /// double-precision).
   ///
   /// Note the parameter order: destination type first, then source type. Both
   /// must be floating point types. The default implementation returns false.
   virtual bool isFPExtFree(EVT DestVT, EVT SrcVT) const {
     assert(SrcVT.isFloatingPoint() && DestVT.isFloatingPoint() &&
            "invalid fpext types");
     return false;
   }
 
   /// Return true if an fpext operation input to an \p Opcode operation is free
   /// (for instance, because half-precision floating-point numbers are
   /// implicitly extended to float-precision) for an FMA instruction.
   ///
   /// Both types must be floating point types. The default implementation
   /// ignores \p Opcode and defers to isFPExtFree.
   virtual bool isFPExtFoldable(unsigned Opcode, EVT DestVT, EVT SrcVT) const {
     assert(DestVT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
            "invalid fpext types");
     return isFPExtFree(DestVT, SrcVT);
   }
 
   /// Return true if it is profitable to fold a vector load into \p ExtVal,
   /// which is a sign, zero, or any extend node. The default implementation
   /// returns false.
   virtual bool isVectorLoadExtDesirable(SDValue ExtVal) const {
     return false;
   }
 
   /// Return true if an fneg operation is free to the point where it is never
   /// worthwhile to replace it with a bitwise operation.
   ///
   /// \p VT must be a floating-point type (asserted). The default
   /// implementation returns false.
   virtual bool isFNegFree(EVT VT) const {
     assert(VT.isFloatingPoint());
     return false;
   }
 
   /// Return true if an fabs operation is free to the point where it is never
   /// worthwhile to replace it with a bitwise operation.
   ///
   /// \p VT must be a floating-point type (asserted). The default
   /// implementation returns false.
   virtual bool isFAbsFree(EVT VT) const {
     assert(VT.isFloatingPoint());
     return false;
   }
 
   /// Return true if an FMA operation is faster than a pair of fmul and fadd
   /// instructions. fmuladd intrinsics will be expanded to FMAs when this method
   /// returns true, otherwise fmuladd is expanded to fmul + fadd.
   ///
   /// NOTE: This may be called before legalization on types for which FMAs are
   /// not legal, but should return true if those types will eventually legalize
   /// to types that support FMAs. After legalization, it will only be called on
   /// types that support FMAs (via Legal or Custom actions).
   ///
   /// The default implementation returns false for every type.
   virtual bool isFMAFasterThanFMulAndFAdd(EVT) const {
     return false;
   }
 
   /// Return true if it's profitable to narrow operations of type VT1 to
   /// VT2. e.g. on x86, it's profitable to narrow from i32 to i8 but not from
   /// i32 to i16.
   ///
   /// The first parameter is VT1 (the type being narrowed) and the second is
   /// VT2 (the narrower type). The default implementation ignores both and
   /// returns false.
   virtual bool isNarrowingProfitable(EVT /*VT1*/, EVT /*VT2*/) const {
     return false;
   }
 
   /// Return true if it is beneficial to convert a load of a constant to
   /// just the constant itself.
   /// On some targets it might be more efficient to use a combination of
   /// arithmetic instructions to materialize the constant instead of loading it
   /// from a constant pool.
   ///
   /// The default implementation ignores \p Imm and \p Ty and returns false.
   virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                  Type *Ty) const {
     return false;
   }
 
   /// Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type
   /// (\p ResVT) from this source type (\p SrcVT) with this index (\p Index).
   /// This is needed because EXTRACT_SUBVECTOR usually has custom lowering that
   /// depends on the index of the first element, and only the target knows
   /// which lowering is cheap.
   ///
   /// The default implementation returns false.
   virtual bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                        unsigned Index) const {
     return false;
   }
 
   /// Return true if it is profitable to use a scalar input to a BUILD_VECTOR
   /// even if the vector itself has multiple uses.
   ///
   /// The default implementation ignores \p VecVT and returns false.
   virtual bool aggressivelyPreferBuildVectorSources(EVT VecVT) const {
     return false;
   }
 
   /// Return true if CodeGenPrepare should consider splitting a large GEP
   /// offset so that the GEP fits into the addressing mode and can be sunk
   /// into the same blocks as its users. The default implementation returns
   /// false.
   virtual bool shouldConsiderGEPOffsetSplit() const {
     return false;
   }
 
   //===--------------------------------------------------------------------===//
   // Runtime Library hooks
   //
 
   /// Rename the default libcall routine name for the specified libcall.
   ///
   /// Only the pointer \p Name is stored; the string itself is not copied.
   /// NOTE(review): callers presumably must keep the string alive for as long
   /// as this object is used — confirm.
   void setLibcallName(RTLIB::Libcall Call, const char *Name) {
     LibcallRoutineNames[Call] = Name;
   }
 
   /// Get the libcall routine name for the specified libcall.
   ///
   /// Returns the pointer stored in LibcallRoutineNames for \p Call; the index
   /// is not range-checked.
   const char *getLibcallName(RTLIB::Libcall Call) const {
     return LibcallRoutineNames[Call];
   }
 
   /// Override the default CondCode to be used to test the result of the
   /// comparison libcall against zero.
   ///
   /// CmpLibcallCCs has RTLIB::UNKNOWN_LIBCALL entries and the index is not
   /// range-checked, so \p Call must not be RTLIB::UNKNOWN_LIBCALL.
   void setCmpLibcallCC(RTLIB::Libcall Call, ISD::CondCode CC) {
     CmpLibcallCCs[Call] = CC;
   }
 
   /// Get the CondCode that's to be used to test the result of the comparison
   /// libcall against zero.
   ///
   /// CmpLibcallCCs has RTLIB::UNKNOWN_LIBCALL entries and the index is not
   /// range-checked, so \p Call must not be RTLIB::UNKNOWN_LIBCALL.
   ISD::CondCode getCmpLibcallCC(RTLIB::Libcall Call) const {
     return CmpLibcallCCs[Call];
   }
 
   /// Set the CallingConv that should be used for the specified libcall.
   ///
   /// LibcallCallingConvs has RTLIB::UNKNOWN_LIBCALL entries and the index is
   /// not range-checked, so \p Call must not be RTLIB::UNKNOWN_LIBCALL.
   void setLibcallCallingConv(RTLIB::Libcall Call, CallingConv::ID CC) {
     LibcallCallingConvs[Call] = CC;
   }
 
   /// Get the CallingConv that should be used for the specified libcall.
   ///
   /// LibcallCallingConvs has RTLIB::UNKNOWN_LIBCALL entries and the index is
   /// not range-checked, so \p Call must not be RTLIB::UNKNOWN_LIBCALL.
   CallingConv::ID getLibcallCallingConv(RTLIB::Libcall Call) const {
     return LibcallCallingConvs[Call];
   }
 
   /// Execute target specific actions to finalize target lowering.
   /// This is used to set extra flags in MachineFrameInformation and to freeze
   /// the set of reserved registers.
   /// The default implementation just freezes the set of reserved registers.
   virtual void finalizeLowering(MachineFunction &MF) const;
 
 private:
   /// The TargetMachine this object refers to. As a reference member it is
   /// bound at construction and cannot be reseated.
   const TargetMachine &TM;
 
   /// Tells the code generator that the target has multiple (allocatable)
   /// condition registers that can be used to store the results of comparisons
   /// for use by selects and conditional branches. With multiple condition
   /// registers, the code generator will not aggressively sink comparisons into
   /// the blocks of their users.
   bool HasMultipleConditionRegisters;
 
   /// Tells the code generator that the target has BitExtract instructions.
   /// The code generator will aggressively sink "shift"s into the blocks of
   /// their users if the users will generate "and" instructions which can be
   /// combined with "shift" to BitExtract instructions.
   bool HasExtractBitsInsn;
 
   /// Tells the code generator to bypass slow divide or remainder
   /// instructions. The map goes from the slow bit width to the bit width to
   /// use instead: for example, an entry mapping 32 to 8 tells the code
   /// generator to bypass 32-bit integer div/rem with an 8-bit unsigned integer
   /// div/rem when the operands are positive and less than 256.
   DenseMap <unsigned int, unsigned int> BypassSlowDivWidths;
 
   /// Tells the code generator that it shouldn't generate extra flow control
   /// instructions and should attempt to combine flow control instructions via
   /// predication.
   bool JumpIsExpensive;
 
   /// Whether the target supports or cares about preserving floating point
   /// exception behavior.
   bool HasFloatingPointExceptions;
 
   /// This target prefers to use _setjmp to implement llvm.setjmp.
   ///
   /// Defaults to false.
   bool UseUnderscoreSetJmp;
 
   /// This target prefers to use _longjmp to implement llvm.longjmp.
   ///
   /// Defaults to false.
   bool UseUnderscoreLongJmp;
 
   /// Information about the contents of the high-bits in boolean values held in
   /// a type wider than i1. See getBooleanContents.
   BooleanContent BooleanContents;
 
   /// Information about the contents of the high-bits in boolean values held in
   /// a type wider than i1. See getBooleanContents.
   /// NOTE(review): this text is identical to the comment on BooleanContents;
   /// judging by the name this field presumably covers booleans that result
   /// from floating-point comparisons — confirm against getBooleanContents.
   BooleanContent BooleanFloatContents;
 
   /// Information about the contents of the high-bits in boolean vector values
   /// when the element type is wider than i1. See getBooleanContents.
   BooleanContent BooleanVectorContents;
 
   /// The target scheduling preference: shortest possible total cycles or lowest
   /// register usage.
   Sched::Preference SchedPreferenceInfo;
 
   /// The size, in bytes, of the target's jmp_buf buffers.
   unsigned JumpBufSize;
 
   /// The alignment, in bytes, of the target's jmp_buf buffers.
   unsigned JumpBufAlignment;
 
   /// The minimum alignment that any argument on the stack needs to have.
   unsigned MinStackArgumentAlignment;
 
   /// The minimum function alignment (used when optimizing for size, and to
   /// prevent explicitly provided alignment from leading to incorrect code).
   unsigned MinFunctionAlignment;
 
   /// The preferred function alignment (used when alignment unspecified and
   /// optimizing for speed).
   unsigned PrefFunctionAlignment;
 
   /// The preferred loop alignment.
   unsigned PrefLoopAlignment;
 
   /// Size in bits of the maximum atomics size the backend supports.
   /// Accesses larger than this will be expanded by AtomicExpandPass.
   unsigned MaxAtomicSizeInBitsSupported;
 
   /// Size in bits of the minimum cmpxchg or ll/sc operation the
   /// backend supports.
   unsigned MinCmpXchgSizeInBits;
 
   /// This indicates if the target supports unaligned atomic operations.
   bool SupportsUnalignedAtomics;
 
   /// If set to a physical register, this specifies the register that
   /// llvm.stacksave/llvm.stackrestore should save and restore.
   unsigned StackPointerRegisterToSaveRestore;
 
   /// This indicates the default register class to use for each ValueType the
   /// target supports natively.
   const TargetRegisterClass *RegClassForVT[MVT::LAST_VALUETYPE];
   /// NOTE(review): undocumented in this file; presumably the number of
   /// registers needed to hold a value of each ValueType — confirm.
   unsigned char NumRegistersForVT[MVT::LAST_VALUETYPE];
   /// NOTE(review): undocumented in this file; presumably the register type
   /// that values of each ValueType are held in — confirm.
   MVT RegisterTypeForVT[MVT::LAST_VALUETYPE];
 
   /// This indicates the "representative" register class to use for each
   /// ValueType the target supports natively. This information is used by the
   /// scheduler to track register pressure. By default, the representative
   /// register class is the largest legal super-reg register class of the
   /// register class of the specified type. e.g. On x86, i8, i16, and i32's
   /// representative class would be GR32.
   const TargetRegisterClass *RepRegClassForVT[MVT::LAST_VALUETYPE];
 
   /// This indicates the "cost" of the "representative" register class for each
   /// ValueType. The cost is used by the scheduler to approximate register
   /// pressure.
   uint8_t RepRegClassCostForVT[MVT::LAST_VALUETYPE];
 
   /// For any value types we are promoting or expanding, this contains the value
   /// type that we are changing to.  For Expanded types, this contains one step
   /// of the expand (e.g. i64 -> i32), even if there are multiple steps required
   /// (e.g. i64 -> i16).  For types natively supported by the system, this holds
   /// the same type (e.g. i32 -> i32).
   MVT TransformToType[MVT::LAST_VALUETYPE];
 
   /// For each operation and each value type, keep a LegalizeAction that
   /// indicates how instruction selection should deal with the operation.  Most
   /// operations are Legal (aka, supported natively by the target), but
   /// operations that are not should be described.  Note that operations on
   /// non-legal value types are not described here.
   /// The table is indexed by value type first and by opcode second.
   LegalizeAction OpActions[MVT::LAST_VALUETYPE][ISD::BUILTIN_OP_END];
 
   /// For each load extension type and each value type, keep a LegalizeAction
   /// that indicates how instruction selection should deal with a load of a
   /// specific value type and extension type. Each uint16_t entry packs four
   /// 4-bit actions, one for each of the 4 load ext types.
   uint16_t LoadExtActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE];
 
   /// For each value type pair keep a LegalizeAction that indicates whether a
   /// truncating store of a specific value type and truncating type is legal.
   LegalizeAction TruncStoreActions[MVT::LAST_VALUETYPE][MVT::LAST_VALUETYPE];
 
   /// For each indexed mode and each value type, keep a pair of LegalizeAction
   /// that indicates how instruction selection should deal with the load /
   /// store.
   ///
   /// The first dimension is the value_type for the reference. The second
   /// dimension represents the various modes for load store.
   uint8_t IndexedModeActions[MVT::LAST_VALUETYPE][ISD::LAST_INDEXED_MODE];
 
   /// For each condition code (ISD::CondCode) keep a LegalizeAction that
   /// indicates how instruction selection should deal with the condition code.
   ///
   /// Because each CC action takes up 4 bits, each uint32_t element packs the
   /// actions for eight value types. The second dimension is therefore
   /// MVT::LAST_VALUETYPE rounded up to the next multiple of 8, divided by 8,
   /// which is large enough to fit all of the value types.
   uint32_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE + 7) / 8];
 
 protected:
   /// NOTE(review): undocumented in this file; presumably records the
   /// legalization action for each value type — confirm against
   /// ValueTypeActionImpl.
   ValueTypeActionImpl ValueTypeActions;
 
 private:
   /// NOTE(review): undocumented in this file; presumably computes how \p VT
   /// is legalized (the action to take and the resulting type) — confirm
   /// against the definition.
   LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const;
 
   /// Targets can specify ISD nodes that they would like PerformDAGCombine
   /// callbacks for by calling setTargetDAGCombine(), which sets a bit in this
   /// array. The array holds one bit per opcode below ISD::BUILTIN_OP_END,
   /// rounded up to a whole number of bytes.
   unsigned char
   TargetDAGCombineArray[(ISD::BUILTIN_OP_END+CHAR_BIT-1)/CHAR_BIT];
 
   /// For operations that must be promoted to a specific type, this holds the
   /// destination type.  This map should be sparse, so don't hold it as an
   /// array.
   ///
   /// Targets add entries to this map with AddPromotedToType(..), clients access
   /// this with getTypeToPromoteTo(..).
   std::map<std::pair<unsigned, MVT::SimpleValueType>, MVT::SimpleValueType>
     PromoteToType;
 
   /// Stores the name of each libcall. Unlike the two tables below, this one
   /// has an extra entry so that RTLIB::UNKNOWN_LIBCALL is a valid index.
   const char *LibcallRoutineNames[RTLIB::UNKNOWN_LIBCALL + 1];
 
   /// The ISD::CondCode that should be used to test the result of each of the
   /// comparison libcalls against zero.
   ISD::CondCode CmpLibcallCCs[RTLIB::UNKNOWN_LIBCALL];
 
   /// Stores the CallingConv that should be used for each libcall.
   CallingConv::ID LibcallCallingConvs[RTLIB::UNKNOWN_LIBCALL];
 
   /// Set default libcall names and calling conventions.
   void InitLibcalls(const Triple &TT);
 
 protected:
   /// Return true if the extension represented by \p I is free.
   /// \pre \p I is a sign, zero, or fp extension and
   ///      is[Z|FP]ExtFree of the related types is not true.
   ///
   /// The default implementation returns false.
   virtual bool isExtFreeImpl(const Instruction *I) const {
     return false;
   }
 
   /// Depth that GatherAllAliases should continue looking for chain
   /// dependencies when trying to find a more preferable chain. As an
   /// approximation, this should be more than the number of consecutive stores
   /// expected to be merged.
   unsigned GatherAllAliasesMaxDepth;
 
   /// Specify maximum number of store instructions per memset call.
   ///
   /// When lowering \@llvm.memset this field specifies the maximum number of
   /// store operations that may be substituted for the call to memset. Targets
   /// must set this value based on the cost threshold for that target. Targets
   /// should assume that the memset will be done using as many of the largest
   /// store operations first, followed by smaller ones, if necessary, per
   /// alignment restrictions. For example, storing 9 bytes on a 32-bit machine
   /// with 16-bit alignment would result in four 2-byte stores and one 1-byte
   /// store.  This only applies to setting a constant array of a constant size.
   unsigned MaxStoresPerMemset;
 
   /// Maximum number of store operations that may be substituted for the call
   /// to memset, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemsetOptSize;
 
   /// Specify maximum number of store instructions per memcpy call.
   ///
   /// When lowering \@llvm.memcpy this field specifies the maximum number of
   /// store operations that may be substituted for a call to memcpy. Targets
   /// must set this value based on the cost threshold for that target. Targets
   /// should assume that the memcpy will be done using as many of the largest
   /// store operations first, followed by smaller ones, if necessary, per
   /// alignment restrictions. For example, storing 7 bytes on a 32-bit machine
   /// with 32-bit alignment would result in one 4-byte store, one 2-byte store
   /// and one 1-byte store. This only applies to copying a constant array of
   /// constant size.
   unsigned MaxStoresPerMemcpy;
 
 
   /// \brief Specify max number of store instructions to glue in inlined memcpy.
   ///
   /// When memcpy is inlined based on MaxStoresPerMemcpy, specify maximum number
   /// of store instructions to keep together. This helps in pairing and
   /// vectorization later on. Initialized to 0 here.
   unsigned MaxGluedStoresPerMemcpy = 0;
 
   /// Maximum number of store operations that may be substituted for a call to
   /// memcpy, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemcpyOptSize;
   /// NOTE(review): undocumented in this file; presumably the maximum number
   /// of load operations that may be substituted for a call to memcmp —
   /// confirm.
   unsigned MaxLoadsPerMemcmp;
   /// NOTE(review): undocumented in this file; presumably the counterpart of
   /// MaxLoadsPerMemcmp used for functions with OptSize attribute — confirm.
   unsigned MaxLoadsPerMemcmpOptSize;
 
   /// Specify maximum number of store instructions per memmove call.
   ///
   /// When lowering \@llvm.memmove this field specifies the maximum number of
   /// store instructions that may be substituted for a call to memmove. Targets
   /// must set this value based on the cost threshold for that target. Targets
   /// should assume that the memmove will be done using as many of the largest
   /// store operations first, followed by smaller ones, if necessary, per
   /// alignment restrictions. For example, moving 9 bytes on a 32-bit machine
   /// with 8-bit alignment would result in nine 1-byte stores.  This only
   /// applies to copying a constant array of constant size.
   unsigned MaxStoresPerMemmove;
 
   /// Maximum number of store instructions that may be substituted for a call to
   /// memmove, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemmoveOptSize;
 
   /// Tells the code generator that a select is more expensive than a branch
   /// if the branch is usually predicted right.
   bool PredictableSelectIsExpensive;
 
   /// \see enableExtLdPromotion.
   bool EnableExtLdPromotion;
 
   /// Return true if the value types that can be represented by the specified
   /// register class \p RC are all legal.
   bool isLegalRC(const TargetRegisterInfo &TRI,
                  const TargetRegisterClass &RC) const;
 
   /// Replace/modify any TargetFrameIndex operands with a target-dependent
   /// sequence of memory operands that is recognized by PrologEpilogInserter.
   MachineBasicBlock *emitPatchPoint(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const;
 
   /// Replace/modify the XRay custom event operands with target-dependent
   /// details.
   MachineBasicBlock *emitXRayCustomEvent(MachineInstr &MI,
                                          MachineBasicBlock *MBB) const;
 
   /// Replace/modify the XRay typed event operands with target-dependent
   /// details.
   MachineBasicBlock *emitXRayTypedEvent(MachineInstr &MI,
                                         MachineBasicBlock *MBB) const;
 };
 
 /// This class defines information used to lower LLVM code to legal SelectionDAG
 /// operators that the target instruction selector can accept natively.
 ///
 /// This class also defines callbacks that targets must implement to lower
 /// target-specific constructs to SelectionDAG operators.
 class TargetLowering : public TargetLoweringBase {
 public:
   struct DAGCombinerInfo;
 
   /// TargetLowering objects cannot be copied or copy-assigned.
   TargetLowering(const TargetLowering &) = delete;
   TargetLowering &operator=(const TargetLowering &) = delete;
 
   /// NOTE: The TargetMachine owns TLOF.
   explicit TargetLowering(const TargetMachine &TM);
 
   /// NOTE(review): undocumented in this file; presumably reports whether
   /// position-independent code is being generated — confirm against the
   /// definition.
   bool isPositionIndependent() const;
 
   /// NOTE(review): undocumented in this file; presumably lets a target report
   /// that \p N is a source of divergence — confirm with the callers.
   /// The default implementation ignores all arguments and returns false.
   virtual bool isSDNodeSourceOfDivergence(const SDNode *N,
                                           FunctionLoweringInfo *FLI,
                                           DivergenceAnalysis *DA) const {
     return false;
   }
 
   /// NOTE(review): undocumented in this file; presumably lets a target report
   /// that \p N is always uniform (never divergent) — confirm with the callers.
   /// The default implementation returns false.
   virtual bool isSDNodeAlwaysUniform(const SDNode * N) const {
     return false;
   }
 
   /// Returns true if the node's address can be legally represented as a
   /// pre-indexed load / store address. In that case the base pointer, the
   /// offset pointer and the addressing mode are returned by reference through
   /// the Base, Offset and AM parameters.
   ///
   /// The default implementation returns false and does not touch the
   /// reference parameters.
   virtual bool getPreIndexedAddressParts(SDNode * /*N*/, SDValue &/*Base*/,
                                          SDValue &/*Offset*/,
                                          ISD::MemIndexedMode &/*AM*/,
                                          SelectionDAG &/*DAG*/) const {
     return false;
   }
 
   /// Returns true if this node can be combined with a load / store to form a
   /// post-indexed load / store. In that case the base pointer, the offset
   /// pointer and the addressing mode are returned by reference through the
   /// Base, Offset and AM parameters.
   ///
   /// The default implementation returns false and does not touch the
   /// reference parameters.
   virtual bool getPostIndexedAddressParts(SDNode * /*N*/, SDNode * /*Op*/,
                                           SDValue &/*Base*/,
                                           SDValue &/*Offset*/,
                                           ISD::MemIndexedMode &/*AM*/,
                                           SelectionDAG &/*DAG*/) const {
     return false;
   }
 
   /// Return the entry encoding for a jump table in the current function.  The
   /// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum,
   /// carried as an unsigned.
   virtual unsigned getJumpTableEncoding() const;
 
   /// NOTE(review): undocumented in this file; judging by the message below,
   /// targets that use custom jump table entries must override this hook.
   /// The default implementation ignores its arguments and is unreachable.
   virtual const MCExpr *
   LowerCustomJumpTableEntry(const MachineJumpTableInfo * /*MJTI*/,
                             const MachineBasicBlock * /*MBB*/, unsigned /*uid*/,
                             MCContext &/*Ctx*/) const {
     llvm_unreachable("Need to implement this hook if target has custom JTIs");
   }
 
   /// Returns the relocation base for the given PIC jumptable \p Table.
   virtual SDValue getPICJumpTableRelocBase(SDValue Table,
                                            SelectionDAG &DAG) const;
 
   /// This returns the relocation base for the given PIC jumptable, the same as
   /// getPICJumpTableRelocBase, but as an MCExpr.
   virtual const MCExpr *
   getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
                                unsigned JTI, MCContext &Ctx) const;
 
   /// Return true if folding a constant offset with the given GlobalAddress
   /// \p GA is legal.  It is frequently not legal in PIC relocation models.
   virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
 
   /// NOTE(review): undocumented in this file; presumably checks whether
   /// \p Node is in tail call position and passes a chain back through
   /// \p Chain — confirm against the definition.
   bool isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
                             SDValue &Chain) const;
 
   /// NOTE(review): undocumented in this file; \p NewLHS, \p NewRHS and
   /// \p CCCode are taken by non-const reference, so they are presumably
   /// rewritten in place to soften a setcc whose operands have type \p VT —
   /// confirm against the definition.
   void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS,
                            SDValue &NewRHS, ISD::CondCode &CCCode,
                            const SDLoc &DL) const;
 
   /// Returns a pair of (return value, chain).
   /// It is an error to pass RTLIB::UNKNOWN_LIBCALL as \p LC.
   std::pair<SDValue, SDValue> makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC,
                                           EVT RetVT, ArrayRef<SDValue> Ops,
                                           bool isSigned, const SDLoc &dl,
                                           bool doesNotReturn = false,
                                           bool isReturnValueUsed = true) const;
 
   /// Check whether parameters to a call that are passed in callee saved
   /// registers are the same as from the calling function.  This needs to be
   /// checked for tail call eligibility.
   bool parametersInCSRMatch(const MachineRegisterInfo &MRI,
       const uint32_t *CallerPreservedMask,
       const SmallVectorImpl<CCValAssign> &ArgLocs,
       const SmallVectorImpl<SDValue> &OutVals) const;
 
   //===--------------------------------------------------------------------===//
   // TargetLowering Optimization Methods
   //
 
   /// Small bundle used to hand a pending combine back from TargetLowering to
   /// the clients that want to perform it: the DAG being transformed, the
   /// legality flags in effect, and the (Old, New) pair of SDValues.
   struct TargetLoweringOpt {
     SelectionDAG &DAG;
     bool LegalTys;
     bool LegalOps;
     SDValue Old;
     SDValue New;
 
     /// Old and New start out default-constructed; they are filled in by
     /// CombineTo.
     explicit TargetLoweringOpt(SelectionDAG &Graph, bool TypesLegal,
                                bool OpsLegal)
         : DAG(Graph), LegalTys(TypesLegal), LegalOps(OpsLegal) {}
 
     /// Accessors for the legality flags captured at construction.
     bool LegalTypes() const {
       return LegalTys;
     }
     bool LegalOperations() const {
       return LegalOps;
     }
 
     /// Record \p O as the old value and \p N as its replacement. Always
     /// returns true.
     bool CombineTo(SDValue O, SDValue N) {
       New = N;
       Old = O;
       return true;
     }
   };
 
   /// Check to see if the specified operand of the specified instruction is a
   /// constant integer.  If so, check to see if there are any bits set in the
   /// constant that are not demanded (according to \p Demanded).  If so, shrink
   /// the constant and return true.
   bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
                               TargetLoweringOpt &TLO) const;
 
   /// Target hook to do target-specific const optimization, which is called by
   /// ShrinkDemandedConstant. This function should return true if the target
   /// doesn't want ShrinkDemandedConstant to further optimize the constant.
   ///
   /// The default implementation returns false.
   virtual bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
                                             TargetLoweringOpt &TLO) const {
     return false;
   }
 
   /// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free.  This
   /// uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
   /// generalized for targets with other types of implicit widening casts.
   bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded,
                         TargetLoweringOpt &TLO) const;
 
   /// Helper for SimplifyDemandedBits that can simplify an operation with
   /// multiple uses.  This function simplifies operand \p OpIdx of \p User and
   /// then updates \p User with the simplified version. No other uses of
   /// \p OpIdx are updated. If \p User is the only user of \p OpIdx, this
   /// function behaves exactly like function SimplifyDemandedBits declared
   /// below except that it also updates the DAG by calling
   /// DCI.CommitTargetLoweringOpt.
   bool SimplifyDemandedBits(SDNode *User, unsigned OpIdx, const APInt &Demanded,
                             DAGCombinerInfo &DCI, TargetLoweringOpt &TLO) const;
 
   /// Look at Op.  At this point, we know that only the DemandedMask bits of the
   /// result of Op are ever used downstream.  If we can use this information to
   /// simplify Op, create a new simplified DAG node and return true, returning
   /// the original and new nodes in TLO.Old and TLO.New.  Otherwise, analyze the
   /// expression and return the known-zero and known-one bits for the
   /// expression in \p Known (used to simplify the caller).  The known bits may
   /// only be accurate for those bits in the DemandedMask.
   /// \p AssumeSingleUse When this parameter is true, this function will
   ///    attempt to simplify \p Op even if there are multiple uses.
   ///    Callers are responsible for correctly updating the DAG based on the
   ///    results of this function, because simply replacing TLO.Old
   ///    with TLO.New will be incorrect when this parameter is true and TLO.Old
   ///    has multiple uses.
   bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
                             KnownBits &Known,
                             TargetLoweringOpt &TLO,
                             unsigned Depth = 0,
                             bool AssumeSingleUse = false) const;
 
   /// Helper wrapper around SimplifyDemandedBits.
   bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
                             DAGCombinerInfo &DCI) const;
 
   /// Look at Vector Op. At this point, we know that only the DemandedEltMask
   /// elements of the result of Op are ever used downstream.  If we can use
   /// this information to simplify Op, create a new simplified DAG node and
   /// return true, storing the original and new nodes in TLO.
   /// Otherwise, analyze the expression and return a mask of KnownUndef and
   /// KnownZero elements for the expression (used to simplify the caller).
   /// The KnownUndef/Zero elements may only be accurate for those elements
   /// in the DemandedEltMask.
   /// \p AssumeSingleUse When this parameter is true, this function will
   ///    attempt to simplify \p Op even if there are multiple uses.
   ///    Callers are responsible for correctly updating the DAG based on the
   ///    results of this function, because simply replacing TLO.Old
   ///    with TLO.New will be incorrect when this parameter is true and TLO.Old
   ///    has multiple uses.
   bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask,
                                   APInt &KnownUndef, APInt &KnownZero,
                                   TargetLoweringOpt &TLO, unsigned Depth = 0,
                                   bool AssumeSingleUse = false) const;
 
   /// Helper wrapper around SimplifyDemandedVectorElts.
   bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
                                   APInt &KnownUndef, APInt &KnownZero,
                                   DAGCombinerInfo &DCI) const;
 
   /// Determine which bits of \p Op are known to be either zero or one and
   /// return them in \p Known. The DemandedElts argument allows us to only
   /// collect the known bits that are shared by the requested vector elements.
   virtual void computeKnownBitsForTargetNode(const SDValue Op,
                                              KnownBits &Known,
                                              const APInt &DemandedElts,
                                              const SelectionDAG &DAG,
                                              unsigned Depth = 0) const;
 
   /// Determine which of the bits of FrameIndex \p FIOp are known to be 0.
   /// Default implementation computes low bits based on alignment
   /// information. This should preserve known bits passed into it.
   virtual void computeKnownBitsForFrameIndex(const SDValue FIOp,
                                              KnownBits &Known,
                                              const APInt &DemandedElts,
                                              const SelectionDAG &DAG,
                                              unsigned Depth = 0) const;
 
   /// This method can be implemented by targets that want to expose additional
   /// information about sign bits to the DAG Combiner. The DemandedElts
   /// argument allows us to only collect the minimum sign bits that are shared
   /// by the requested vector elements.
   virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
                                                    const APInt &DemandedElts,
                                                    const SelectionDAG &DAG,
                                                    unsigned Depth = 0) const;
 
   /// Attempt to simplify any target nodes based on the demanded vector
   /// elements, returning true on success. Otherwise, analyze the expression and
   /// return a mask of KnownUndef and KnownZero elements for the expression
   /// (used to simplify the caller). The KnownUndef/Zero elements may only be
   /// accurate for those elements in \p DemandedElts.
   virtual bool SimplifyDemandedVectorEltsForTargetNode(
       SDValue Op, const APInt &DemandedElts, APInt &KnownUndef,
       APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth = 0) const;
 
   /// Context handed to the target's DAG-combine hooks. It records the combine
   /// level and whether the combiner was invoked by the legalizer, exposes the
   /// SelectionDAG being combined, and declares helpers (defined elsewhere)
   /// for feeding results back to the combiner.
   struct DAGCombinerInfo {
     void *DC;  // The DAG Combiner object.
     CombineLevel Level;
     bool CalledByLegalizer;
 
   public:
     SelectionDAG &DAG;
 
     DAGCombinerInfo(SelectionDAG &dag, CombineLevel level,  bool cl, void *dc)
       : DC(dc), Level(level), CalledByLegalizer(cl), DAG(dag) {}
 
     /// True only while the level is BeforeLegalizeTypes.
     bool isBeforeLegalize() const { return Level == BeforeLegalizeTypes; }
     /// True for every level that orders before AfterLegalizeVectorOps.
     bool isBeforeLegalizeOps() const { return Level < AfterLegalizeVectorOps; }
     /// True only while the level is AfterLegalizeDAG.
     bool isAfterLegalizeDAG() const {
       return Level == AfterLegalizeDAG;
     }
     /// Return the current combine level. Const-qualified like the other
     /// read-only accessors so it can be called through a const reference.
     CombineLevel getDAGCombineLevel() const { return Level; }
     bool isCalledByLegalizer() const { return CalledByLegalizer; }
 
     void AddToWorklist(SDNode *N);
     SDValue CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo = true);
     SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true);
     SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo = true);
 
     void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO);
   };
 
   /// Return true if \p N is a constant or constant vector equal to the true
   /// value from getBooleanContents().
   bool isConstTrueVal(const SDNode *N) const;
 
   /// Return true if \p N is a constant or constant vector equal to the false
   /// value from getBooleanContents().
   bool isConstFalseVal(const SDNode *N) const;
 
   /// Return true if \p N is a True value when extended to \p VT.
   bool isExtendedTrueVal(const ConstantSDNode *N, EVT VT, bool SExt) const;
 
   /// Try to simplify a setcc built with the specified operands and cc. If it is
   /// unable to simplify it, return a null SDValue.
   SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
                         bool foldBooleans, DAGCombinerInfo &DCI,
                         const SDLoc &dl) const;
 
   // For targets which wrap address, unwrap for analysis.
   virtual SDValue unwrapAddress(SDValue N) const { return N; }
 
   /// Returns true (and the GlobalValue and the offset) if the node is a
   /// GlobalAddress + offset.
   virtual bool
   isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const;
 
   /// This method will be invoked for all target nodes and for any
   /// target-independent nodes that the target has registered with invoke it
   /// for.
   ///
   /// The semantics are as follows:
   /// Return Value:
   ///   SDValue.Val == 0   - No change was made
   ///   SDValue.Val == N   - N was replaced, is dead, and is already handled.
   ///   otherwise          - N should be replaced by the returned Operand.
   ///
   /// In addition, methods provided by DAGCombinerInfo may be used to perform
   /// more complex transformations.
   ///
   virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
-  /// Return true if it is profitable to move a following shift through this
-  //  node, adjusting any immediate operands as necessary to preserve semantics.
-  //  This transformation may not be desirable if it disrupts a particularly
-  //  auspicious target-specific tree (e.g. bitfield extraction in AArch64).
-  //  By default, it returns true.
-  virtual bool isDesirableToCommuteWithShift(const SDNode *N) const {
+  /// Return true if it is profitable to move this shift by a constant amount
+  /// through its operand, adjusting any immediate operands as necessary to
+  /// preserve semantics. This transformation may not be desirable if it
+  /// disrupts a particularly auspicious target-specific tree (e.g. bitfield
+  /// extraction in AArch64). By default, it returns true.
+  ///
+  /// \param N the shift node
+  /// \param Level the current DAGCombine legalization level.
+  virtual bool isDesirableToCommuteWithShift(const SDNode *N,
+                                             CombineLevel Level) const {
     return true;
   }
 
   // Return true if it is profitable to combine a BUILD_VECTOR with a stride-pattern
   // to a shuffle and a truncate.
   // Example of such a combine:
   // v4i32 build_vector((extract_elt V, 1),
   //                    (extract_elt V, 3),
   //                    (extract_elt V, 5),
   //                    (extract_elt V, 7))
   //  -->
   // v4i32 truncate (bitcast (shuffle<1,u,3,u,5,u,7,u> V, u) to v4i64)
   virtual bool isDesirableToCombineBuildVectorToShuffleTruncate(
       ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
     return false;
   }
 
   /// Return true if the target has native support for the specified value type
   /// and it is 'desirable' to use the type for the given node type. e.g. On x86
   /// i16 is legal, but undesirable since i16 instruction encodings are longer
   /// and some i16 instructions are slow.
   virtual bool isTypeDesirableForOp(unsigned /*Opc*/, EVT VT) const {
     // By default, assume all legal types are desirable.
     return isTypeLegal(VT);
   }
 
   /// Return true if it is profitable for dag combiner to transform a floating
   /// point op of specified opcode to a equivalent op of an integer
   /// type. e.g. f32 load -> i32 load can be profitable on ARM.
   virtual bool isDesirableToTransformToIntegerOp(unsigned /*Opc*/,
                                                  EVT /*VT*/) const {
     return false;
   }
 
   /// This method query the target whether it is beneficial for dag combiner to
   /// promote the specified node. If true, it should return the desired
   /// promotion type by reference.
   virtual bool IsDesirableToPromoteOp(SDValue /*Op*/, EVT &/*PVT*/) const {
     return false;
   }
 
   /// Return true if the target supports swifterror attribute. It optimizes
   /// loads and stores to reading and writing a specific register.
   virtual bool supportSwiftError() const {
     return false;
   }
 
   /// Return true if the target supports that a subset of CSRs for the given
   /// machine function is handled explicitly via copies.
   virtual bool supportSplitCSR(MachineFunction *MF) const {
     return false;
   }
 
   /// Perform necessary initialization to handle a subset of CSRs explicitly
   /// via copies. This function is called at the beginning of instruction
   /// selection.
   virtual void initializeSplitCSR(MachineBasicBlock *Entry) const {
     llvm_unreachable("Not Implemented");
   }
 
   /// Insert explicit copies in entry and exit blocks. We copy a subset of
   /// CSRs to virtual registers in the entry block, and copy them back to
   /// physical registers in the exit blocks. This function is called at the end
   /// of instruction selection.
   virtual void insertCopiesSplitCSR(
       MachineBasicBlock *Entry,
       const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
     llvm_unreachable("Not Implemented");
   }
 
   //===--------------------------------------------------------------------===//
   // Lowering methods - These methods must be implemented by targets so that
   // the SelectionDAGBuilder code knows how to lower these.
   //
 
   /// This hook must be implemented to lower the incoming (formal) arguments,
   /// described by the Ins array, into the specified DAG. The implementation
   /// should fill in the InVals array with legal-type argument values, and
   /// return the resulting token chain value.
   virtual SDValue LowerFormalArguments(
       SDValue /*Chain*/, CallingConv::ID /*CallConv*/, bool /*isVarArg*/,
       const SmallVectorImpl<ISD::InputArg> & /*Ins*/, const SDLoc & /*dl*/,
       SelectionDAG & /*DAG*/, SmallVectorImpl<SDValue> & /*InVals*/) const {
     llvm_unreachable("Not Implemented");
   }
 
   /// This structure contains all information that is necessary for lowering
   /// calls. It is passed to TLI::LowerCallTo when the SelectionDAG builder
   /// needs to lower a call, and targets will see this struct in their LowerCall
   /// implementation.
   struct CallLoweringInfo {
     SDValue Chain;
     Type *RetTy = nullptr;
     bool RetSExt           : 1;
     bool RetZExt           : 1;
     bool IsVarArg          : 1;
     bool IsInReg           : 1;
     bool DoesNotReturn     : 1;
     bool IsReturnValueUsed : 1;
     bool IsConvergent      : 1;
     bool IsPatchPoint      : 1;
 
     // IsTailCall should be modified by implementations of
     // TargetLowering::LowerCall that perform tail call conversions.
     bool IsTailCall = false;
 
     // Is Call lowering done post SelectionDAG type legalization.
     bool IsPostTypeLegalization = false;
 
     unsigned NumFixedArgs = -1;
     CallingConv::ID CallConv = CallingConv::C;
     SDValue Callee;
     ArgListTy Args;
     SelectionDAG &DAG;
     SDLoc DL;
     ImmutableCallSite CS;
     SmallVector<ISD::OutputArg, 32> Outs;
     SmallVector<SDValue, 32> OutVals;
     SmallVector<ISD::InputArg, 32> Ins;
     SmallVector<SDValue, 4> InVals;
 
     CallLoweringInfo(SelectionDAG &DAG)
         : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false),
           DoesNotReturn(false), IsReturnValueUsed(true), IsConvergent(false),
           IsPatchPoint(false), DAG(DAG) {}
 
     CallLoweringInfo &setDebugLoc(const SDLoc &dl) {
       DL = dl;
       return *this;
     }
 
     CallLoweringInfo &setChain(SDValue InChain) {
       Chain = InChain;
       return *this;
     }
 
     // setCallee with target/module-specific attributes
     CallLoweringInfo &setLibCallee(CallingConv::ID CC, Type *ResultType,
                                    SDValue Target, ArgListTy &&ArgsList) {
       RetTy = ResultType;
       Callee = Target;
       CallConv = CC;
       NumFixedArgs = ArgsList.size();
       Args = std::move(ArgsList);
 
       DAG.getTargetLoweringInfo().markLibCallAttributes(
           &(DAG.getMachineFunction()), CC, Args);
       return *this;
     }
 
     CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultType,
                                 SDValue Target, ArgListTy &&ArgsList) {
       RetTy = ResultType;
       Callee = Target;
       CallConv = CC;
       NumFixedArgs = ArgsList.size();
       Args = std::move(ArgsList);
       return *this;
     }
 
     CallLoweringInfo &setCallee(Type *ResultType, FunctionType *FTy,
                                 SDValue Target, ArgListTy &&ArgsList,
                                 ImmutableCallSite Call) {
       RetTy = ResultType;
 
       IsInReg = Call.hasRetAttr(Attribute::InReg);
       DoesNotReturn =
           Call.doesNotReturn() ||
           (!Call.isInvoke() &&
            isa<UnreachableInst>(Call.getInstruction()->getNextNode()));
       IsVarArg = FTy->isVarArg();
       IsReturnValueUsed = !Call.getInstruction()->use_empty();
       RetSExt = Call.hasRetAttr(Attribute::SExt);
       RetZExt = Call.hasRetAttr(Attribute::ZExt);
 
       Callee = Target;
 
       CallConv = Call.getCallingConv();
       NumFixedArgs = FTy->getNumParams();
       Args = std::move(ArgsList);
 
       CS = Call;
 
       return *this;
     }
 
     CallLoweringInfo &setInRegister(bool Value = true) {
       IsInReg = Value;
       return *this;
     }
 
     CallLoweringInfo &setNoReturn(bool Value = true) {
       DoesNotReturn = Value;
       return *this;
     }
 
     CallLoweringInfo &setVarArg(bool Value = true) {
       IsVarArg = Value;
       return *this;
     }
 
     CallLoweringInfo &setTailCall(bool Value = true) {
       IsTailCall = Value;
       return *this;
     }
 
     CallLoweringInfo &setDiscardResult(bool Value = true) {
       IsReturnValueUsed = !Value;
       return *this;
     }
 
     CallLoweringInfo &setConvergent(bool Value = true) {
       IsConvergent = Value;
       return *this;
     }
 
     CallLoweringInfo &setSExtResult(bool Value = true) {
       RetSExt = Value;
       return *this;
     }
 
     CallLoweringInfo &setZExtResult(bool Value = true) {
       RetZExt = Value;
       return *this;
     }
 
     CallLoweringInfo &setIsPatchPoint(bool Value = true) {
       IsPatchPoint = Value;
       return *this;
     }
 
     CallLoweringInfo &setIsPostTypeLegalization(bool Value=true) {
       IsPostTypeLegalization = Value;
       return *this;
     }
 
     ArgListTy &getArgs() {
       return Args;
     }
   };
 
   /// This function lowers an abstract call to a function into an actual call.
   /// This returns a pair of operands.  The first element is the return value
   /// for the function (if RetTy is not VoidTy).  The second element is the
   /// outgoing token chain. It calls LowerCall to do the actual lowering.
   std::pair<SDValue, SDValue> LowerCallTo(CallLoweringInfo &CLI) const;
 
   /// This hook must be implemented to lower calls into the specified
   /// DAG. The outgoing arguments to the call are described by the Outs array,
   /// and the values to be returned by the call are described by the Ins
   /// array. The implementation should fill in the InVals array with legal-type
   /// return values from the call, and return the resulting token chain value.
   virtual SDValue
     LowerCall(CallLoweringInfo &/*CLI*/,
               SmallVectorImpl<SDValue> &/*InVals*/) const {
     llvm_unreachable("Not Implemented");
   }
 
   /// Target-specific cleanup for formal ByVal parameters.
   virtual void HandleByVal(CCState *, unsigned &, unsigned) const {}
 
   /// This hook should be implemented to check whether the return values
   /// described by the Outs array can fit into the return registers.  If false
   /// is returned, an sret-demotion is performed.
   virtual bool CanLowerReturn(CallingConv::ID /*CallConv*/,
                               MachineFunction &/*MF*/, bool /*isVarArg*/,
                const SmallVectorImpl<ISD::OutputArg> &/*Outs*/,
                LLVMContext &/*Context*/) const
   {
     // Return true by default to get preexisting behavior.
     return true;
   }
 
   /// This hook must be implemented to lower outgoing return values, described
   /// by the Outs array, into the specified DAG. The implementation should
   /// return the resulting token chain value.
   virtual SDValue LowerReturn(SDValue /*Chain*/, CallingConv::ID /*CallConv*/,
                               bool /*isVarArg*/,
                               const SmallVectorImpl<ISD::OutputArg> & /*Outs*/,
                               const SmallVectorImpl<SDValue> & /*OutVals*/,
                               const SDLoc & /*dl*/,
                               SelectionDAG & /*DAG*/) const {
     llvm_unreachable("Not Implemented");
   }
 
   /// Return true if result of the specified node is used by a return node
   /// only. It also compute and return the input chain for the tail call.
   ///
   /// This is used to determine whether it is possible to codegen a libcall as
   /// tail call at legalization time.
   virtual bool isUsedByReturnOnly(SDNode *, SDValue &/*Chain*/) const {
     return false;
   }
 
   /// Return true if the target may be able emit the call instruction as a tail
   /// call. This is used by optimization passes to determine if it's profitable
   /// to duplicate return instructions to enable tailcall optimization.
   virtual bool mayBeEmittedAsTailCall(const CallInst *) const {
     return false;
   }
 
   /// Return the builtin name for the __builtin___clear_cache intrinsic
   /// Default is to invoke the clear cache library call
   virtual const char * getClearCacheBuiltinName() const {
     return "__clear_cache";
   }
 
   /// Return the register ID of the name passed in. Used by named register
   /// global variables extension. There is no target-independent behaviour
   /// so the default action is to bail.
   virtual unsigned getRegisterByName(const char* RegName, EVT VT,
                                      SelectionDAG &DAG) const {
     report_fatal_error("Named registers not implemented for this target");
   }
 
   /// Return the type that should be used to zero or sign extend a
   /// zeroext/signext integer return value.  FIXME: Some C calling conventions
   /// require the return type to be promoted, but this is not true all the time,
   /// e.g. i1/i8/i16 on x86/x86_64. It is also not necessary for non-C calling
   /// conventions. The frontend should handle this and include all of the
   /// necessary information.
   virtual EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                        ISD::NodeType /*ExtendKind*/) const {
     EVT MinVT = getRegisterType(Context, MVT::i32);
     return VT.bitsLT(MinVT) ? MinVT : VT;
   }
 
   /// For some targets, an LLVM struct type must be broken down into multiple
   /// simple types, but the calling convention specifies that the entire struct
   /// must be passed in a block of consecutive registers.
   virtual bool
   functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv,
                                             bool isVarArg) const {
     return false;
   }
 
   /// Returns a 0 terminated array of registers that can be safely used as
   /// scratch registers.
   virtual const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const {
     return nullptr;
   }
 
   /// This callback is used to prepare for a volatile or atomic load.
   /// It takes a chain node as input and returns the chain for the load itself.
   ///
   /// Having a callback like this is necessary for targets like SystemZ,
   /// which allows a CPU to reuse the result of a previous load indefinitely,
   /// even if a cache-coherent store is performed by another CPU.  The default
   /// implementation does nothing.
   virtual SDValue prepareVolatileOrAtomicLoad(SDValue Chain, const SDLoc &DL,
                                               SelectionDAG &DAG) const {
     return Chain;
   }
 
   /// This callback is used to inspect load/store instructions and add
   /// target-specific MachineMemOperand flags to them.  The default
   /// implementation does nothing.
   virtual MachineMemOperand::Flags getMMOFlags(const Instruction &I) const {
     return MachineMemOperand::MONone;
   }
 
   /// This callback is invoked by the type legalizer to legalize nodes with an
   /// illegal operand type but legal result types.  It replaces the
   /// LowerOperation callback in the type Legalizer.  The reason we can not do
   /// away with LowerOperation entirely is that LegalizeDAG isn't yet ready to
   /// use this callback.
   ///
   /// TODO: Consider merging with ReplaceNodeResults.
   ///
   /// The target places new result values for the node in Results (their number
   /// and types must exactly match those of the original return values of
   /// the node), or leaves Results empty, which indicates that the node is not
   /// to be custom lowered after all.
   /// The default implementation calls LowerOperation.
   virtual void LowerOperationWrapper(SDNode *N,
                                      SmallVectorImpl<SDValue> &Results,
                                      SelectionDAG &DAG) const;
 
   /// This callback is invoked for operations that are unsupported by the
   /// target, which are registered to use 'custom' lowering, and whose defined
   /// values are all legal.  If the target has no operations that require custom
   /// lowering, it need not implement this.  The default implementation of this
   /// aborts.
   virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
 
   /// This callback is invoked when a node result type is illegal for the
   /// target, and the operation was registered to use 'custom' lowering for that
   /// result type.  The target places new result values for the node in Results
   /// (their number and types must exactly match those of the original return
   /// values of the node), or leaves Results empty, which indicates that the
   /// node is not to be custom lowered after all.
   ///
   /// If the target has no operations that require custom lowering, it need not
   /// implement this.  The default implementation aborts.
   virtual void ReplaceNodeResults(SDNode * /*N*/,
                                   SmallVectorImpl<SDValue> &/*Results*/,
                                   SelectionDAG &/*DAG*/) const {
     llvm_unreachable("ReplaceNodeResults not implemented for this target!");
   }
 
   /// This method returns the name of a target specific DAG node.
   virtual const char *getTargetNodeName(unsigned Opcode) const;
 
   /// This method returns a target specific FastISel object, or null if the
   /// target does not support "fast" ISel.
   virtual FastISel *createFastISel(FunctionLoweringInfo &,
                                    const TargetLibraryInfo *) const {
     return nullptr;
   }
 
   bool verifyReturnAddressArgumentIsConstant(SDValue Op,
                                              SelectionDAG &DAG) const;
 
   //===--------------------------------------------------------------------===//
   // Inline Asm Support hooks
   //
 
   /// This hook allows the target to expand an inline asm call to be explicit
   /// llvm code if it wants to.  This is useful for turning simple inline asms
   /// into LLVM intrinsics, which gives the compiler more information about the
   /// behavior of the code.
   virtual bool ExpandInlineAsm(CallInst *) const {
     return false;
   }
 
   enum ConstraintType {
     C_Register,            // Constraint represents specific register(s).
     C_RegisterClass,       // Constraint represents any of register(s) in class.
     C_Memory,              // Memory constraint.
     C_Other,               // Something else.
     C_Unknown              // Unsupported constraint.
   };
 
   enum ConstraintWeight {
     // Generic weights.
     CW_Invalid  = -1,     // No match.
     CW_Okay     = 0,      // Acceptable.
     CW_Good     = 1,      // Good weight.
     CW_Better   = 2,      // Better weight.
     CW_Best     = 3,      // Best weight.
 
     // Well-known weights.
     CW_SpecificReg  = CW_Okay,    // Specific register operands.
     CW_Register     = CW_Good,    // Register operands.
     CW_Memory       = CW_Better,  // Memory operands.
     CW_Constant     = CW_Best,    // Constant operand.
     CW_Default      = CW_Okay     // Default or don't know type.
   };
 
   /// This contains information for each constraint that we are lowering.
   struct AsmOperandInfo : public InlineAsm::ConstraintInfo {
     /// This contains the actual string for the code, like "m".  TargetLowering
     /// picks the 'best' code from ConstraintInfo::Codes that most closely
     /// matches the operand.
     std::string ConstraintCode;
 
     /// Information about the constraint code, e.g. Register, RegisterClass,
     /// Memory, Other, Unknown.
     TargetLowering::ConstraintType ConstraintType = TargetLowering::C_Unknown;
 
     /// If this is the result output operand or a clobber, this is null,
     /// otherwise it is the incoming operand to the CallInst.  This gets
     /// modified as the asm is processed.
     Value *CallOperandVal = nullptr;
 
     /// The ValueType for the operand value.
     MVT ConstraintVT = MVT::Other;
 
     /// Copy constructor for copying from a ConstraintInfo.
     AsmOperandInfo(InlineAsm::ConstraintInfo Info)
         : InlineAsm::ConstraintInfo(std::move(Info)) {}
 
     /// Return true of this is an input operand that is a matching constraint
     /// like "4".
     bool isMatchingInputConstraint() const;
 
     /// If this is an input matching constraint, this method returns the output
     /// operand it matches.
     unsigned getMatchedOperand() const;
   };
 
   using AsmOperandInfoVector = std::vector<AsmOperandInfo>;
 
   /// Split up the constraint string from the inline assembly value into the
   /// specific constraints and their prefixes, and also tie in the associated
   /// operand values.  If this returns an empty vector, and if the constraint
   /// string itself isn't empty, there was an error parsing.
   virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL,
                                                 const TargetRegisterInfo *TRI,
                                                 ImmutableCallSite CS) const;
 
   /// Examine constraint type and operand type and determine a weight value.
   /// The operand object must already have been set up with the operand type.
   virtual ConstraintWeight getMultipleConstraintMatchWeight(
       AsmOperandInfo &info, int maIndex) const;
 
   /// Examine constraint string and operand type and determine a weight value.
   /// The operand object must already have been set up with the operand type.
   virtual ConstraintWeight getSingleConstraintMatchWeight(
       AsmOperandInfo &info, const char *constraint) const;
 
   /// Determines the constraint code and constraint type to use for the specific
   /// AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
   /// If the actual operand being passed in is available, it can be passed in as
   /// Op, otherwise an empty SDValue can be passed.
   virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo,
                                       SDValue Op,
                                       SelectionDAG *DAG = nullptr) const;
 
   /// Given a constraint, return the type of constraint it is for this target.
   virtual ConstraintType getConstraintType(StringRef Constraint) const;
 
   /// Given a physical register constraint (e.g.  {edx}), return the register
   /// number and the register class for the register.
   ///
   /// Given a register class constraint, like 'r', if this corresponds directly
   /// to an LLVM register class, return a register of 0 and the register class
   /// pointer.
   ///
   /// This should only be used for C_Register constraints.  On error, this
   /// returns a register number of 0 and a null register class pointer.
   virtual std::pair<unsigned, const TargetRegisterClass *>
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                StringRef Constraint, MVT VT) const;
 
   virtual unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const {
     if (ConstraintCode == "i")
       return InlineAsm::Constraint_i;
     else if (ConstraintCode == "m")
       return InlineAsm::Constraint_m;
     return InlineAsm::Constraint_Unknown;
   }
 
   /// Try to replace an X constraint, which matches anything, with another that
   /// has more specific requirements based on the type of the corresponding
   /// operand.  This returns null if there is no replacement to make.
   virtual const char *LowerXConstraint(EVT ConstraintVT) const;
 
   /// Lower the specified operand into the Ops vector.  If it is invalid, don't
   /// add anything to Ops.
   virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                                             std::vector<SDValue> &Ops,
                                             SelectionDAG &DAG) const;
 
   //===--------------------------------------------------------------------===//
   // Div utility functions
   //
   SDValue BuildSDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                     bool IsAfterLegalization,
                     SmallVectorImpl<SDNode *> &Created) const;
   SDValue BuildUDIV(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                     bool IsAfterLegalization,
                     SmallVectorImpl<SDNode *> &Created) const;
 
   /// Targets may override this function to provide custom SDIV lowering for
   /// power-of-2 denominators.  If the target returns an empty SDValue, LLVM
   /// assumes SDIV is expensive and replaces it with a series of other integer
   /// operations.
   virtual SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                 SelectionDAG &DAG,
                                 SmallVectorImpl<SDNode *> &Created) const;
 
   /// Indicate whether this target prefers to combine FDIVs with the same
   /// divisor. If the transform should never be done, return zero. If the
   /// transform should be done, return the minimum number of divisor uses
   /// that must exist.
   virtual unsigned combineRepeatedFPDivisors() const {
     return 0;
   }
 
   /// Hooks for building estimates in place of slower divisions and square
   /// roots.
 
   /// Return either a square root or its reciprocal estimate value for the input
   /// operand.
   /// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
   /// 'Enabled' as set by a potential default override attribute.
   /// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson
   /// refinement iterations required to generate a sufficient (though not
   /// necessarily IEEE-754 compliant) estimate is returned in that parameter.
   /// The boolean UseOneConstNR output is used to select a Newton-Raphson
   /// algorithm implementation that uses either one or two constants.
   /// The boolean Reciprocal is used to select whether the estimate is for the
   /// square root of the input operand or the reciprocal of its square root.
   /// A target may choose to implement its own refinement within this function.
   /// If that's true, then return '0' as the number of RefinementSteps to avoid
   /// any further refinement of the estimate.
   /// An empty SDValue return means no estimate sequence can be created.
   virtual SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                   int Enabled, int &RefinementSteps,
                                   bool &UseOneConstNR, bool Reciprocal) const {
     return SDValue();
   }
 
   /// Return a reciprocal estimate value for the input operand.
   /// \p Enabled is a ReciprocalEstimate enum with value either 'Unspecified' or
   /// 'Enabled' as set by a potential default override attribute.
   /// If \p RefinementSteps is 'Unspecified', the number of Newton-Raphson
   /// refinement iterations required to generate a sufficient (though not
   /// necessarily IEEE-754 compliant) estimate is returned in that parameter.
   /// A target may choose to implement its own refinement within this function.
   /// If that's true, then return '0' as the number of RefinementSteps to avoid
   /// any further refinement of the estimate.
   /// An empty SDValue return means no estimate sequence can be created.
   virtual SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG,
                                    int Enabled, int &RefinementSteps) const {
     return SDValue();
   }
 
   //===--------------------------------------------------------------------===//
   // Legalization utility functions
   //
 
   /// Expand a MUL or [US]MUL_LOHI of n-bit values into two or four nodes,
   /// respectively, each computing an n/2-bit part of the result.
   /// \param Result A vector that will be filled with the parts of the result
   ///        in little-endian order.
   /// \param LL Low bits of the LHS of the MUL.  You can use this parameter
   ///        if you want to control how low bits are extracted from the LHS.
   /// \param LH High bits of the LHS of the MUL.  See LL for meaning.
   /// \param RL Low bits of the RHS of the MUL.  See LL for meaning
   /// \param RH High bits of the RHS of the MUL.  See LL for meaning.
   /// \returns true if the node has been expanded, false if it has not
   bool expandMUL_LOHI(unsigned Opcode, EVT VT, SDLoc dl, SDValue LHS,
                       SDValue RHS, SmallVectorImpl<SDValue> &Result, EVT HiLoVT,
                       SelectionDAG &DAG, MulExpansionKind Kind,
                       SDValue LL = SDValue(), SDValue LH = SDValue(),
                       SDValue RL = SDValue(), SDValue RH = SDValue()) const;
 
   /// Expand a MUL into two nodes.  One that computes the high bits of
   /// the result and one that computes the low bits.
   /// \param HiLoVT The value type to use for the Lo and Hi nodes.
   /// \param LL Low bits of the LHS of the MUL.  You can use this parameter
   ///        if you want to control how low bits are extracted from the LHS.
   /// \param LH High bits of the LHS of the MUL.  See LL for meaning.
   /// \param RL Low bits of the RHS of the MUL.  See LL for meaning
   /// \param RH High bits of the RHS of the MUL.  See LL for meaning.
   /// \returns true if the node has been expanded. false if it has not
   bool expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
                  SelectionDAG &DAG, MulExpansionKind Kind,
                  SDValue LL = SDValue(), SDValue LH = SDValue(),
                  SDValue RL = SDValue(), SDValue RH = SDValue()) const;
 
   /// Expand float(f32) to SINT(i64) conversion
   /// \param N Node to expand
   /// \param Result output after conversion
   /// \returns True, if the expansion was successful, false otherwise
   bool expandFP_TO_SINT(SDNode *N, SDValue &Result, SelectionDAG &DAG) const;
 
   /// Turn load of vector type into a load of the individual elements.
   /// \param LD load to expand
   /// \returns MERGE_VALUEs of the scalar loads with their chains.
   SDValue scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const;
 
   // Turn a store of a vector type into stores of the individual elements.
   /// \param ST Store with a vector value type
   /// \returns MERGE_VALUs of the individual store chains.
   SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const;
 
   /// Expands an unaligned load to 2 half-size loads for an integer, and
   /// possibly more for vectors.
   std::pair<SDValue, SDValue> expandUnalignedLoad(LoadSDNode *LD,
                                                   SelectionDAG &DAG) const;
 
   /// Expands an unaligned store to 2 half-size stores for integer values, and
   /// possibly more for vectors.
   SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const;
 
   /// Increments memory address \p Addr according to the type of the value
   /// \p DataVT that should be stored. If the data is stored in compressed
   /// form, the memory address should be incremented according to the number of
   /// the stored elements. This number is equal to the number of '1's bits
   /// in the \p Mask.
   /// \p DataVT is a vector type. \p Mask is a vector value.
   /// \p DataVT and \p Mask have the same number of vector elements.
   SDValue IncrementMemoryAddress(SDValue Addr, SDValue Mask, const SDLoc &DL,
                                  EVT DataVT, SelectionDAG &DAG,
                                  bool IsCompressedMemory) const;
 
   /// Get a pointer to vector element \p Index located in memory for a vector
   /// of type \p VecVT starting at a base address of \p VecPtr. If \p Index is
   /// out of bounds the returned pointer is unspecified, but will be within the
   /// vector bounds.
   SDValue getVectorElementPointer(SelectionDAG &DAG, SDValue VecPtr, EVT VecVT,
                                   SDValue Index) const;
 
   //===--------------------------------------------------------------------===//
   // Instruction Emitting Hooks
   //
 
   /// This method should be implemented by targets that mark instructions with
   /// the 'usesCustomInserter' flag.  These instructions are special in various
   /// ways, which require special support to insert.  The specified MachineInstr
   /// is created but not inserted into any basic blocks, and this method is
   /// called to expand it into a sequence of instructions, potentially also
   /// creating new basic blocks and control flow.
   /// As long as the returned basic block is different (i.e., we created a new
   /// one), the custom inserter is free to modify the rest of \p MBB.
   virtual MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const;
 
   /// This method should be implemented by targets that mark instructions with
   /// the 'hasPostISelHook' flag. These instructions must be adjusted after
   /// instruction selection by target hooks.  e.g. To fill in optional defs for
   /// ARM 's' setting instructions.
   virtual void AdjustInstrPostInstrSelection(MachineInstr &MI,
                                              SDNode *Node) const;
 
   /// Targets return true here to make SelectionDAGBuilder emit a
   /// LOAD_STACK_GUARD node while it lowers Intrinsic::stackprotector. The
   /// default answer is false.
   virtual bool useLoadStackGuardNode() const { return false; }
 
   /// Hook without a generic implementation: the default body is
   /// llvm_unreachable, so it may only be called on targets that override it.
   /// NOTE(review): judging by the name, an override XORs the stack guard
   /// value \p Val with the frame pointer -- confirm against a target override.
   virtual SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                                       const SDLoc &DL) const {
     llvm_unreachable("not implemented for this target");
   }
 
   /// Lower TLS global address SDNode for target independent emulated TLS model.
   virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG) const;
 
   /// Expands the target-specific indirect branch used for jump table
   /// expansion. The default implementation builds a single ISD::BRIND node of
   /// type MVT::Other with \p Value and \p Addr as its operands.
   virtual SDValue expandIndirectJTBranch(const SDLoc &dl, SDValue Value,
                                          SDValue Addr,
                                          SelectionDAG &DAG) const {
     SDValue Branch = DAG.getNode(ISD::BRIND, dl, MVT::Other, Value, Addr);
     return Branch;
   }
 
   /// seteq(x, 0) -> truncate(srl(ctlz(zext(x)), log2(#bits)))
   /// If we're comparing for equality to zero and isCtlzFast is true, expose
   /// the fact that this can be implemented as a ctlz/srl pair, so that the dag
   /// combiner can fold the new nodes.
   SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const;
 
 private:
   SDValue simplifySetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
                                ISD::CondCode Cond, DAGCombinerInfo &DCI,
                                const SDLoc &DL) const;
 
   SDValue optimizeSetCCOfSignedTruncationCheck(EVT SCCVT, SDValue N0,
                                                SDValue N1, ISD::CondCode Cond,
                                                DAGCombinerInfo &DCI,
                                                const SDLoc &DL) const;
 };
 
 /// Given an LLVM IR type and return type attributes, compute the return value
 /// EVTs and flags, and optionally also the offsets, if the return value is
 /// being lowered to memory.
 void GetReturnInfo(CallingConv::ID CC, Type *ReturnType, AttributeList attr,
                    SmallVectorImpl<ISD::OutputArg> &Outs,
                    const TargetLowering &TLI, const DataLayout &DL);
 
 } // end namespace llvm
 
 #endif // LLVM_CODEGEN_TARGETLOWERING_H
Index: head/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- head/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(revision 344055)
+++ head/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp	(revision 344056)
@@ -1,18691 +1,18692 @@
 //===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This pass combines dag nodes to form fewer, simpler DAG nodes.  It can be run
 // both before and after the DAG is legalized.
 //
 // This pass is not a substitute for the LLVM IR instcombine pass. This pass is
 // primarily intended to handle simplification opportunities that are implicit
 // in the LLVM IR and exposed by the various codegen lowering phases.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MachineValueType.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
 #include <functional>
 #include <iterator>
 #include <string>
 #include <tuple>
 #include <utility>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "dagcombine"
 
 STATISTIC(NodesCombined   , "Number of dag nodes combined");
 STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
 STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
 STATISTIC(OpsNarrowed     , "Number of load/op/store narrowed");
 STATISTIC(LdStFP2Int      , "Number of fp load/store pairs transformed to int");
 STATISTIC(SlicedLoads, "Number of load sliced");
 
 static cl::opt<bool>
 CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
                  cl::desc("Enable DAG combiner's use of IR alias analysis"));
 
 static cl::opt<bool>
 UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
         cl::desc("Enable DAG combiner's use of TBAA"));
 
 #ifndef NDEBUG
 static cl::opt<std::string>
 CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
                    cl::desc("Only use DAG-combiner alias analysis in this"
                             " function"));
 #endif
 
 /// Hidden option to stress test load slicing, i.e., when this option
 /// is enabled, load slicing bypasses most of its profitability guards.
 static cl::opt<bool>
 StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
                   cl::desc("Bypass the profitability model of load slicing"),
                   cl::init(false));
 
 static cl::opt<bool>
   MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
                     cl::desc("DAG combiner may split indexing from loads"));
 
 namespace {
 
   class DAGCombiner {
     SelectionDAG &DAG;
     const TargetLowering &TLI;
     CombineLevel Level;
     CodeGenOpt::Level OptLevel;
     bool LegalOperations = false;
     bool LegalTypes = false;
     bool ForCodeSize;
 
     /// Worklist of all of the nodes that need to be simplified.
     ///
     /// This must behave as a stack -- new nodes to process are pushed onto the
     /// back and when processing we pop off of the back.
     ///
     /// The worklist will not contain duplicates but may contain null entries
     /// due to nodes being deleted from the underlying DAG.
     SmallVector<SDNode *, 64> Worklist;
 
     /// Mapping from an SDNode to its position on the worklist.
     ///
     /// This is used to find and remove nodes from the worklist (by nulling
     /// them) when they are deleted from the underlying DAG. It relies on
     /// stable indices of nodes within the worklist.
     DenseMap<SDNode *, unsigned> WorklistMap;
 
     /// Set of nodes which have been combined (at least once).
     ///
     /// This is used to allow us to reliably add any operands of a DAG node
     /// which have not yet been combined to the worklist.
     SmallPtrSet<SDNode *, 32> CombinedNodes;
 
     // AA - Used for DAG load/store alias analysis.
     AliasAnalysis *AA;
 
     /// Queue every user of \p N for combining: once \p N has been simplified,
     /// the nodes consuming it may become simplifiable as well.
     void AddUsersToWorklist(SDNode *N) {
       for (SDNode *User : N->uses()) {
         AddToWorklist(User);
       }
     }
 
     /// Call the node-specific routine that folds each particular type of node.
     SDValue visit(SDNode *N);
 
   public:
     /// Bind the combiner to \p D and its target lowering info, start at the
     /// BeforeLegalizeTypes level, and precompute MaximumLegalStoreInBits.
     DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
         : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
           OptLevel(OL), AA(AA) {
       ForCodeSize = DAG.getMachineFunction().getFunction().optForSize();
       // Walk all machine value types and remember the bit width of the widest
       // one the target reports as legal; MVT::Other is never considered.
       MaximumLegalStoreInBits = 0;
       for (MVT VT : MVT::all_valuetypes()) {
         if (!EVT(VT).isSimple() || VT == MVT::Other)
           continue;
         if (!TLI.isTypeLegal(EVT(VT)))
           continue;
         if (VT.getSizeInBits() >= MaximumLegalStoreInBits)
           MaximumLegalStoreInBits = VT.getSizeInBits();
       }
     }
 
     /// Queue \p N for combining. A node that is already queued keeps its
     /// current slot; any other node is appended to the back of the worklist,
     /// which is the end that gets processed next.
     void AddToWorklist(SDNode *N) {
       assert(N->getOpcode() != ISD::DELETED_NODE &&
              "Deleted Node added to Worklist");
       // Handle nodes can't usefully be combined and would confuse the
       // zero-use deletion strategy, so they are never queued.
       const bool IsHandle = N->getOpcode() == ISD::HANDLENODE;
       if (IsHandle)
         return;
       // WorklistMap doubles as the duplicate check: the insertion succeeds
       // only when the node is not on the list yet.
       const bool Inserted =
           WorklistMap.insert(std::make_pair(N, Worklist.size())).second;
       if (Inserted)
         Worklist.push_back(N);
     }
 
     /// Drop \p N from the combiner's bookkeeping: forget that it has been
     /// combined and, if it is currently queued, blank out its worklist slot.
     void removeFromWorklist(SDNode *N) {
       CombinedNodes.erase(N);
       auto Entry = WorklistMap.find(N);
       if (Entry != WorklistMap.end()) {
         // Nulling the slot instead of erasing it avoids a linear-time vector
         // erase; the slot indices of the other entries stay untouched.
         Worklist[Entry->second] = nullptr;
         WorklistMap.erase(Entry);
       }
     }
 
     void deleteAndRecombine(SDNode *N);
     bool recursivelyDeleteUnusedNodes(SDNode *N);
 
     /// Replaces all uses of the results of one DAG node with new values.
     SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
                       bool AddTo = true);
 
     /// Single-value convenience overload: forwards to the array form of
     /// CombineTo with \p Res as the only replacement value.
     SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
       const SDValue *Replacement = &Res;
       return CombineTo(N, Replacement, 1, AddTo);
     }
 
     /// Two-value convenience overload: packs \p Res0 and \p Res1 into a local
     /// array and forwards to the array form of CombineTo.
     SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
                       bool AddTo = true) {
       const SDValue Replacements[2] = {Res0, Res1};
       return CombineTo(N, Replacements, 2, AddTo);
     }
 
     void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
 
   private:
     unsigned MaximumLegalStoreInBits;
 
     /// Try to simplify \p Op, or the values it uses, by bit propagation while
     /// demanding every bit of its scalar value.
     /// \returns true if a simplification was made.
     bool SimplifyDemandedBits(SDValue Op) {
       const APInt AllBits =
           APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
       return SimplifyDemandedBits(Op, AllBits);
     }
 
     /// Try to simplify the vector value \p Op, or the values it uses, while
     /// demanding every one of its elements.
     /// \returns true if a simplification was made.
     bool SimplifyDemandedVectorElts(SDValue Op) {
       const EVT VecVT = Op.getValueType();
       const APInt AllElts = APInt::getAllOnesValue(VecVT.getVectorNumElements());
       return SimplifyDemandedVectorElts(Op, AllElts);
     }
 
     bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded);
     bool SimplifyDemandedVectorElts(SDValue Op, const APInt &Demanded,
                                     bool AssumeSingleUse = false);
 
     bool CombineToPreIndexedLoadStore(SDNode *N);
     bool CombineToPostIndexedLoadStore(SDNode *N);
     SDValue SplitIndexingFromLoad(LoadSDNode *LD);
     bool SliceUpLoad(SDNode *N);
 
     /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
     ///   load.
     ///
     /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
     /// \param InVecVT type of the input vector to EVE with bitcasts resolved.
     /// \param EltNo index of the vector element to load.
     /// \param OriginalLoad load that EVE came from to be replaced.
     /// \returns EVE on success SDValue() on failure.
     SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
         SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad);
     void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
     SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
     SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
     SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
     SDValue PromoteIntBinOp(SDValue Op);
     SDValue PromoteIntShiftOp(SDValue Op);
     SDValue PromoteExtend(SDValue Op);
     bool PromoteLoad(SDValue Op);
 
     /// Call the node-specific routine that knows how to fold each
     /// particular type of node. If that doesn't do anything, try the
     /// target-specific DAG combines.
     SDValue combine(SDNode *N);
 
     // Visitation implementation - Implement dag node combining for different
     // node types.  The semantics are as follows:
     // Return Value:
     //   SDValue.getNode() == 0 - No change was made
     //   SDValue.getNode() == N - N was replaced, is dead and has been handled.
     //   otherwise              - N should be replaced by the returned Operand.
     //
     SDValue visitTokenFactor(SDNode *N);
     SDValue visitMERGE_VALUES(SDNode *N);
     SDValue visitADD(SDNode *N);
     SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference);
     SDValue visitSUB(SDNode *N);
     SDValue visitADDC(SDNode *N);
     SDValue visitUADDO(SDNode *N);
     SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N);
     SDValue visitSUBC(SDNode *N);
     SDValue visitUSUBO(SDNode *N);
     SDValue visitADDE(SDNode *N);
     SDValue visitADDCARRY(SDNode *N);
     SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N);
     SDValue visitSUBE(SDNode *N);
     SDValue visitSUBCARRY(SDNode *N);
     SDValue visitMUL(SDNode *N);
     SDValue useDivRem(SDNode *N);
     SDValue visitSDIV(SDNode *N);
     SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N);
     SDValue visitUDIV(SDNode *N);
     SDValue visitUDIVLike(SDValue N0, SDValue N1, SDNode *N);
     SDValue visitREM(SDNode *N);
     SDValue visitMULHU(SDNode *N);
     SDValue visitMULHS(SDNode *N);
     SDValue visitSMUL_LOHI(SDNode *N);
     SDValue visitUMUL_LOHI(SDNode *N);
     SDValue visitSMULO(SDNode *N);
     SDValue visitUMULO(SDNode *N);
     SDValue visitIMINMAX(SDNode *N);
     SDValue visitAND(SDNode *N);
     SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N);
     SDValue visitOR(SDNode *N);
     SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N);
     SDValue visitXOR(SDNode *N);
     SDValue SimplifyVBinOp(SDNode *N);
     SDValue visitSHL(SDNode *N);
     SDValue visitSRA(SDNode *N);
     SDValue visitSRL(SDNode *N);
     SDValue visitRotate(SDNode *N);
     SDValue visitABS(SDNode *N);
     SDValue visitBSWAP(SDNode *N);
     SDValue visitBITREVERSE(SDNode *N);
     SDValue visitCTLZ(SDNode *N);
     SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
     SDValue visitCTTZ(SDNode *N);
     SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
     SDValue visitCTPOP(SDNode *N);
     SDValue visitSELECT(SDNode *N);
     SDValue visitVSELECT(SDNode *N);
     SDValue visitSELECT_CC(SDNode *N);
     SDValue visitSETCC(SDNode *N);
     SDValue visitSETCCCARRY(SDNode *N);
     SDValue visitSIGN_EXTEND(SDNode *N);
     SDValue visitZERO_EXTEND(SDNode *N);
     SDValue visitANY_EXTEND(SDNode *N);
     SDValue visitAssertExt(SDNode *N);
     SDValue visitSIGN_EXTEND_INREG(SDNode *N);
     SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
     SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
     SDValue visitTRUNCATE(SDNode *N);
     SDValue visitBITCAST(SDNode *N);
     SDValue visitBUILD_PAIR(SDNode *N);
     SDValue visitFADD(SDNode *N);
     SDValue visitFSUB(SDNode *N);
     SDValue visitFMUL(SDNode *N);
     SDValue visitFMA(SDNode *N);
     SDValue visitFDIV(SDNode *N);
     SDValue visitFREM(SDNode *N);
     SDValue visitFSQRT(SDNode *N);
     SDValue visitFCOPYSIGN(SDNode *N);
     SDValue visitSINT_TO_FP(SDNode *N);
     SDValue visitUINT_TO_FP(SDNode *N);
     SDValue visitFP_TO_SINT(SDNode *N);
     SDValue visitFP_TO_UINT(SDNode *N);
     SDValue visitFP_ROUND(SDNode *N);
     SDValue visitFP_ROUND_INREG(SDNode *N);
     SDValue visitFP_EXTEND(SDNode *N);
     SDValue visitFNEG(SDNode *N);
     SDValue visitFABS(SDNode *N);
     SDValue visitFCEIL(SDNode *N);
     SDValue visitFTRUNC(SDNode *N);
     SDValue visitFFLOOR(SDNode *N);
     SDValue visitFMINNUM(SDNode *N);
     SDValue visitFMAXNUM(SDNode *N);
     SDValue visitBRCOND(SDNode *N);
     SDValue visitBR_CC(SDNode *N);
     SDValue visitLOAD(SDNode *N);
 
     SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
     SDValue replaceStoreOfFPConstant(StoreSDNode *ST);
 
     SDValue visitSTORE(SDNode *N);
     SDValue visitINSERT_VECTOR_ELT(SDNode *N);
     SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
     SDValue visitBUILD_VECTOR(SDNode *N);
     SDValue visitCONCAT_VECTORS(SDNode *N);
     SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
     SDValue visitVECTOR_SHUFFLE(SDNode *N);
     SDValue visitSCALAR_TO_VECTOR(SDNode *N);
     SDValue visitINSERT_SUBVECTOR(SDNode *N);
     SDValue visitMLOAD(SDNode *N);
     SDValue visitMSTORE(SDNode *N);
     SDValue visitMGATHER(SDNode *N);
     SDValue visitMSCATTER(SDNode *N);
     SDValue visitFP_TO_FP16(SDNode *N);
     SDValue visitFP16_TO_FP(SDNode *N);
 
     SDValue visitFADDForFMACombine(SDNode *N);
     SDValue visitFSUBForFMACombine(SDNode *N);
     SDValue visitFMULForFMADistributiveCombine(SDNode *N);
 
     SDValue XformToShuffleWithZero(SDNode *N);
     SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                            SDValue N1);
 
     SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
 
     SDValue foldSelectOfConstants(SDNode *N);
     SDValue foldVSelectOfConstants(SDNode *N);
     SDValue foldBinOpIntoSelect(SDNode *BO);
     bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
     SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
     SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
     SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
                              SDValue N2, SDValue N3, ISD::CondCode CC,
                              bool NotExtCompare = false);
     SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
                                    SDValue N2, SDValue N3, ISD::CondCode CC);
     SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                               const SDLoc &DL);
     SDValue unfoldMaskedMerge(SDNode *N);
     SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
     SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
                           const SDLoc &DL, bool foldBooleans);
     SDValue rebuildSetCC(SDValue N);
 
     bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
                            SDValue &CC) const;
     bool isOneUseSetCC(SDValue N) const;
 
     SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                          unsigned HiOp);
     SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
     SDValue CombineExtLoad(SDNode *N);
     SDValue CombineZExtLogicopShiftLoad(SDNode *N);
     SDValue combineRepeatedFPDivisors(SDNode *N);
     SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
     SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
     SDValue BuildUDIV(SDNode *N);
     SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
     SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags);
     SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
     SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
     SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
     SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations,
                                 SDNodeFlags Flags, bool Reciprocal);
     SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations,
                                 SDNodeFlags Flags, bool Reciprocal);
     SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
                                bool DemandHighBits = true);
     SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
     SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
                               SDValue InnerPos, SDValue InnerNeg,
                               unsigned PosOpcode, unsigned NegOpcode,
                               const SDLoc &DL);
     SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
     SDValue MatchLoadCombine(SDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
     SDValue TransformFPLoadStorePair(SDNode *N);
     SDValue convertBuildVecZextToZext(SDNode *N);
     SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
     SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
     SDValue reduceBuildVecToShuffle(SDNode *N);
     SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                   ArrayRef<int> VectorMask, SDValue VecIn1,
                                   SDValue VecIn2, unsigned LeftIdx);
     SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
 
     /// Walk up chain skipping non-aliasing memory nodes,
     /// looking for aliasing nodes and adding them to the Aliases vector.
     void GatherAllAliases(SDNode *N, SDValue OriginalChain,
                           SmallVectorImpl<SDValue> &Aliases);
 
     /// Return true if there is any possibility that the two addresses overlap.
     bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const;
 
     /// Walk up chain skipping non-aliasing memory nodes, looking for a better
     /// chain (aliasing node.)
     SDValue FindBetterChain(SDNode *N, SDValue Chain);
 
     /// Try to replace a store and any possibly adjacent stores on
     /// consecutive chains with better chains. Return true only if St is
     /// replaced.
     ///
     /// Notice that other chains may still be replaced even if the function
     /// returns false.
     bool findBetterNeighborChains(StoreSDNode *St);
 
     /// Pairs an LSBaseSDNode with its position inside a sequence of memory
     /// operations that are connected by a chain.
     struct MemOpLink {
       /// The memory node itself.
       LSBaseSDNode *MemNode;
       /// Where the node sits relative to the base pointer.
       int64_t OffsetFromBase;
 
       MemOpLink(LSBaseSDNode *N, int64_t Offset)
           : MemNode(N), OffsetFromBase(Offset) {
       }
     };
 
     /// This is a helper function for visitMUL to check the profitability
     /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
     /// MulNode is the original multiply, AddNode is (add x, c1),
     /// and ConstNode is c2.
     bool isMulAddWithConstProfitable(SDNode *MulNode,
                                      SDValue &AddNode,
                                      SDValue &ConstNode);
 
     /// This is a helper function for visitAND and visitZERO_EXTEND.  Returns
     /// true if the (and (load x) c) pattern matches an extload.  ExtVT returns
     /// the type of the loaded value to be extended.
     bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
                           EVT LoadResultTy, EVT &ExtVT);
 
     /// Helper function to calculate whether the given Load/Store can have its
     /// width reduced to \p MemVT.
     /// NOTE(review): \p MemVT is passed by non-const reference -- confirm
     /// against the definition whether the callee also updates it.
     bool isLegalNarrowLdSt(LSBaseSDNode *LDSTN, ISD::LoadExtType ExtType,
                            EVT &MemVT, unsigned ShAmt = 0);
 
     /// Used by BackwardsPropagateMask to find suitable loads.
     bool SearchForAndLoads(SDNode *N, SmallPtrSetImpl<LoadSDNode*> &Loads,
                            SmallPtrSetImpl<SDNode*> &NodesWithConsts,
                            ConstantSDNode *Mask, SDNode *&NodeToMask);
     /// Attempt to propagate a given AND node back to load leaves so that they
     /// can be combined into narrow loads.
     bool BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG);
 
     /// Helper function for MergeConsecutiveStores which merges the
     /// component store chains.
     SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
                                 unsigned NumStores);
 
     /// This is a helper function for MergeConsecutiveStores. When the
     /// source elements of the consecutive stores are all constants or
     /// all extracted vector elements, try to merge them into one
     /// larger store introducing bitcasts if necessary.  \return True
     /// if a merged store was created.
     bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
                                          EVT MemVT, unsigned NumStores,
                                          bool IsConstantSrc, bool UseVector,
                                          bool UseTrunc);
 
     /// This is a helper function for MergeConsecutiveStores. Stores
     /// that potentially may be merged with \p St are placed in
     /// \p StoreNodes. \p Root (passed by reference) is a chain predecessor to
     /// all store candidates.
     void getStoreMergeCandidates(StoreSDNode *St,
                                  SmallVectorImpl<MemOpLink> &StoreNodes,
                                  SDNode *&Root);
 
     /// Helper function for MergeConsecutiveStores. Checks if
     /// candidate stores have indirect dependency through their
     /// operands. RootNode is the predecessor to all stores calculated
     /// by getStoreMergeCandidates and is used to prune the dependency check.
     /// \return True if safe to merge.
     bool checkMergeStoreCandidatesForDependencies(
         SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
         SDNode *RootNode);
 
     /// Merge consecutive store operations into a wide store.
     /// This optimization uses wide integers or vectors when possible.
     /// \return a bool. NOTE(review): this comment previously described the
     /// result as the number of merged stores, kept as a prefix of a
     /// StoreNodes list, but the function is declared to return bool and takes
     /// only \p St; presumably true means at least one merge happened --
     /// confirm against the definition.
     bool MergeConsecutiveStores(StoreSDNode *St);
 
     /// Try to transform a truncation where C is a constant:
     ///     (trunc (and X, C)) -> (and (trunc X), (trunc C))
     ///
     /// \p N needs to be a truncation and its first operand an AND. Other
     /// requirements are checked by the function (e.g. that trunc is
     /// single-use) and if missed an empty SDValue is returned.
     SDValue distributeTruncateThroughAnd(SDNode *N);
 
     /// Tell whether the target supports \p Opcode on type \p VT. Once
     /// LegalOperations is set only a legal (not custom) operation counts;
     /// before that, legal or custom is accepted.
     bool hasOperation(unsigned Opcode, EVT VT) {
       return LegalOperations ? TLI.isOperationLegal(Opcode, VT)
                              : TLI.isOperationLegalOrCustom(Opcode, VT);
     }
 
   public:
     /// Runs the dag combiner on all nodes in the work list
     void Run(CombineLevel AtLevel);
 
     /// Returns the SelectionDAG this combiner was constructed with.
     SelectionDAG &getDAG() const { return DAG; }
 
     /// Ask the target for a type that can hold any valid shift amount for a
     /// value of integer type \p LHSTy; before type legalization such amounts
     /// can be huge, so the current LegalTypes state is passed along.
     EVT getShiftAmountTy(EVT LHSTy) {
       assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
       const DataLayout &Layout = DAG.getDataLayout();
       return TLI.getShiftAmountTy(LHSTy, Layout, LegalTypes);
     }
 
     /// True while LegalTypes is unset (i.e. before type legalization);
     /// afterwards true only if the target reports \p VT as legal.
     bool isTypeLegal(const EVT &VT) {
       return !LegalTypes || TLI.isTypeLegal(VT);
     }
 
     /// Shorthand for TargetLowering::getSetCCResultType that supplies the
     /// DAG's data layout and LLVM context.
     EVT getSetCCResultType(EVT VT) const {
       const DataLayout &Layout = DAG.getDataLayout();
       LLVMContext &Ctx = *DAG.getContext();
       return TLI.getSetCCResultType(Layout, Ctx, VT);
     }
 
     void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
                          SDValue OrigLoad, SDValue ExtLoad,
                          ISD::NodeType ExtType);
   };
 
 /// This class is a DAGUpdateListener that removes any deleted
 /// nodes from the worklist.
 class WorklistRemover : public SelectionDAG::DAGUpdateListener {
   DAGCombiner &DC;
 
 public:
   explicit WorklistRemover(DAGCombiner &dc)
     : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
 
   void NodeDeleted(SDNode *N, SDNode *E) override {
     DC.removeFromWorklist(N);
   }
 };
 
 } // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 //  TargetLowering::DAGCombinerInfo implementation
 //===----------------------------------------------------------------------===//
 
 /// Forwards to DAGCombiner::AddToWorklist on the combiner stored in DC.
 void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
   DAGCombiner *Combiner = (DAGCombiner *)DC;
   Combiner->AddToWorklist(N);
 }
 
 /// Forwards an array of replacement values to DAGCombiner::CombineTo as a
 /// pointer to the first element plus an element count.
 SDValue TargetLowering::DAGCombinerInfo::
 CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) {
   DAGCombiner *Combiner = (DAGCombiner *)DC;
   const SDValue *FirstValue = &To[0];
   return Combiner->CombineTo(N, FirstValue, To.size(), AddTo);
 }
 
 /// Forwards a single replacement value to DAGCombiner::CombineTo.
 SDValue TargetLowering::DAGCombinerInfo::
 CombineTo(SDNode *N, SDValue Res, bool AddTo) {
   DAGCombiner *Combiner = (DAGCombiner *)DC;
   return Combiner->CombineTo(N, Res, AddTo);
 }
 
 /// Forwards a pair of replacement values to DAGCombiner::CombineTo.
 SDValue TargetLowering::DAGCombinerInfo::
 CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
   DAGCombiner *Combiner = (DAGCombiner *)DC;
   return Combiner->CombineTo(N, Res0, Res1, AddTo);
 }
 
 /// Forwards to DAGCombiner::CommitTargetLoweringOpt on the combiner stored in
 /// DC.
 void TargetLowering::DAGCombinerInfo::
 CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
   DAGCombiner *Combiner = (DAGCombiner *)DC;
   Combiner->CommitTargetLoweringOpt(TLO);
 }
 
 //===----------------------------------------------------------------------===//
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
 /// Remove \p N from the worklist and delete it from the DAG, queueing the
 /// operands that may now be dead or simplifiable for another visit.
 void DAGCombiner::deleteAndRecombine(SDNode *N) {
   removeFromWorklist(N);
 
   // Operands used only by N are about to become dead, so they must be
   // revisited and recursively deleted. An operand producing several values is
   // revisited too: one of its values may become dead, allowing further
   // simplification (e.g. split index arithmetic from an indexed load).
   for (const SDValue &Operand : N->ops()) {
     SDNode *OperandNode = Operand.getNode();
     const bool SingleUse = OperandNode->hasOneUse();
     if (SingleUse || OperandNode->getNumValues() > 1)
       AddToWorklist(OperandNode);
   }
 
   DAG.DeleteNode(N);
 }
 
 /// Decide whether the negated form of \p Op can be computed at no extra cost.
 /// Returns 0 if it cannot, 1 if the negated form costs the same as the
 /// expression itself, and 2 if the negated form is cheaper than the
 /// expression itself.
 static char isNegatibleForFree(SDValue Op, bool LegalOperations,
                                const TargetLowering &TLI,
                                const TargetOptions *Options,
                                unsigned Depth = 0) {
   const unsigned Opcode = Op.getOpcode();
 
   // An existing fneg can be removed even if it has multiple uses.
   if (Opcode == ISD::FNEG)
     return 2;
 
   EVT VT = Op.getValueType();
   const SDNodeFlags Flags = Op->getFlags();
 
   // Anything with multiple uses is rejected unless it is known to be free,
   // which here means an fp_extend the target reports as free.
   if (!Op.hasOneUse()) {
     const bool IsFreeFPExt =
         Opcode == ISD::FP_EXTEND &&
         TLI.isFPExtFree(VT, Op.getOperand(0).getValueType());
     if (!IsFreeFPExt)
       return 0;
   }
 
   // Bound the recursion depth so the search does not grow exponentially.
   if (Depth > 6)
     return 0;
 
   // Query the operand with the given index one level deeper.
   auto OperandCost = [&](unsigned Idx) -> char {
     return isNegatibleForFree(Op.getOperand(Idx), LegalOperations, TLI,
                               Options, Depth + 1);
   };
 
   switch (Opcode) {
   default:
     return false;
 
   case ISD::ConstantFP: {
     if (!LegalOperations)
       return 1;
 
     // After legalization a constant FP value is only inverted if the target
     // says the negated constant is legal.
     if (TLI.isOperationLegal(ISD::ConstantFP, VT))
       return true;
     return TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()),
                             VT);
   }
 
   case ISD::FADD: {
     if (!(Options->UnsafeFPMath || Flags.hasNoSignedZeros()))
       return 0;
 
     // After operation legalization, it might not be legal to create new
     // FSUBs.
     if (LegalOperations && !TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
       return 0;
 
     // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
     if (char V = OperandCost(0))
       return V;
     // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
     return OperandCost(1);
   }
 
   case ISD::FSUB:
     // -(A-B) cannot become B-A when signed zeros are honored.
     if (!(Options->NoSignedZerosFPMath || Flags.hasNoSignedZeros()))
       return 0;
 
     // fold (fneg (fsub A, B)) -> (fsub B, A)
     return 1;
 
   case ISD::FMUL:
   case ISD::FDIV: {
     // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
     if (char V = OperandCost(0))
       return V;
     return OperandCost(1);
   }
 
   case ISD::FP_EXTEND:
   case ISD::FP_ROUND:
   case ISD::FSIN:
     // The answer is whatever it is for the single input operand.
     return OperandCost(0);
   }
 }
 
 /// Build and return the negated form of \p Op. Intended to be called only
 /// when isNegatibleForFree returned non-zero for the same expression; an
 /// opcode not handled here reaches llvm_unreachable.
 static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
                                     bool LegalOperations, unsigned Depth = 0) {
   const TargetOptions &Options = DAG.getTarget().Options;
   const unsigned Opcode = Op.getOpcode();
 
   // An fneg is removable even if it has multiple uses: hand back its operand.
   if (Opcode == ISD::FNEG)
     return Op.getOperand(0);
 
   assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");
 
   const SDNodeFlags Flags = Op.getNode()->getFlags();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
 
   // Whether the operand with the given index can be negated for free.
   auto CanNegate = [&](unsigned Idx) -> bool {
     return isNegatibleForFree(Op.getOperand(Idx), LegalOperations, TLI,
                               &Options, Depth + 1);
   };
   // The negated form of the operand with the given index.
   auto Negate = [&](unsigned Idx) -> SDValue {
     return GetNegatedExpression(Op.getOperand(Idx), DAG, LegalOperations,
                                 Depth + 1);
   };
 
   switch (Opcode) {
   default:
     llvm_unreachable("Unknown code");
 
   case ISD::ConstantFP: {
     APFloat Negated = cast<ConstantFPSDNode>(Op)->getValueAPF();
     Negated.changeSign();
     return DAG.getConstantFP(Negated, DL, VT);
   }
 
   case ISD::FADD: {
     assert(Options.UnsafeFPMath || Flags.hasNoSignedZeros());
 
     // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
     if (CanNegate(0)) {
       SDValue NegA = Negate(0);
       return DAG.getNode(ISD::FSUB, DL, VT, NegA, Op.getOperand(1), Flags);
     }
     // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
     SDValue NegB = Negate(1);
     return DAG.getNode(ISD::FSUB, DL, VT, NegB, Op.getOperand(0), Flags);
   }
 
   case ISD::FSUB: {
     // fold (fneg (fsub 0, B)) -> B
     auto *LHSConst = dyn_cast<ConstantFPSDNode>(Op.getOperand(0));
     if (LHSConst && LHSConst->isZero())
       return Op.getOperand(1);
 
     // fold (fneg (fsub A, B)) -> (fsub B, A)
     return DAG.getNode(ISD::FSUB, DL, VT, Op.getOperand(1), Op.getOperand(0),
                        Flags);
   }
 
   case ISD::FMUL:
   case ISD::FDIV: {
     // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
     if (CanNegate(0)) {
       SDValue NegX = Negate(0);
       return DAG.getNode(Opcode, DL, VT, NegX, Op.getOperand(1), Flags);
     }
 
     // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
     SDValue NegY = Negate(1);
     return DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NegY, Flags);
   }
 
   case ISD::FP_EXTEND:
   case ISD::FSIN: {
     // Push the negation through to the single input operand.
     SDValue NegInput = Negate(0);
     return DAG.getNode(Opcode, DL, VT, NegInput);
   }
 
   case ISD::FP_ROUND: {
     // Same as above, but the second operand is carried over unchanged.
     SDValue NegInput = Negate(0);
     return DAG.getNode(ISD::FP_ROUND, DL, VT, NegInput, Op.getOperand(1));
   }
   }
 }
 
 // Most APInt operations require both operands to have the same bit width.
 // This helper zero extends LHS and RHS to a common width: the larger of the
 // two widths plus Offset. The Offset lets callers request extra bits so that
 // subsequent arithmetic cannot overflow.
 static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) {
   const unsigned WiderWidth = std::max(LHS.getBitWidth(), RHS.getBitWidth());
   const unsigned TargetWidth = Offset + WiderWidth;
   LHS = LHS.zextOrSelf(TargetWidth);
   RHS = RHS.zextOrSelf(TargetWidth);
 }
 
 // Return true if N is a setcc, or a select_cc that selects between the
 // target's constant true and false values (which makes it equivalent to a
 // setcc). On success LHS, RHS and CC are set to the compared operands and the
 // condition code of whichever node kind was matched, so callers can treat
 // both forms uniformly.
 bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
                                     SDValue &CC) const {
   const unsigned Opcode = N.getOpcode();
 
   // setcc: the condition code is operand 2.
   if (Opcode == ISD::SETCC) {
     LHS = N.getOperand(0);
     RHS = N.getOperand(1);
     CC  = N.getOperand(2);
     return true;
   }
 
   if (Opcode != ISD::SELECT_CC)
     return false;
 
   // select_cc: operands 2 and 3 must be the target's true and false values.
   const bool SelectsTrueFalse =
       TLI.isConstTrueVal(N.getOperand(2).getNode()) &&
       TLI.isConstFalseVal(N.getOperand(3).getNode());
   if (!SelectsTrueFalse)
     return false;
 
   // Reject result types whose boolean contents are undefined.
   const auto Contents = TLI.getBooleanContents(N.getValueType());
   if (Contents == TargetLowering::UndefinedBooleanContent)
     return false;
 
   // select_cc: the condition code is operand 4.
   LHS = N.getOperand(0);
   RHS = N.getOperand(1);
   CC  = N.getOperand(4);
   return true;
 }
 
 /// Return true if \p N is a SetCC-equivalent operation whose node has exactly
 /// one use. When this holds, users may invert the operation for free if that
 /// is profitable.
 bool DAGCombiner::isOneUseSetCC(SDValue N) const {
   // The decomposed operands are required by the callee but not needed here.
   SDValue LHS, RHS, CC;
   return isSetCCEquivalent(N, LHS, RHS, CC) && N.getNode()->hasOneUse();
 }
 
 // Strip any chain of BITCAST nodes and return the first value underneath
 // that is not itself a bitcast.
 static SDValue peekThroughBitcast(SDValue V) {
   for (;;) {
     if (V.getOpcode() != ISD::BITCAST)
       return V;
     V = V.getOperand(0);
   }
 }
 
 // Returns N's node if N is a constant float or a build vector of constant
 // floats, and nullptr otherwise.
 static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
   const bool IsFPConstant =
       isa<ConstantFPSDNode>(N) ||
       ISD::isBuildVectorOfConstantFPSDNodes(N.getNode());
   return IsFPConstant ? N.getNode() : nullptr;
 }
 
 // Determines whether N is a constant integer or a build vector made up of
 // constant integers (undef elements are allowed).
 // Build vector implicit truncation is not permitted: every constant element
 // must have exactly the vector's scalar bit width. When NoOpaques is set,
 // opaque constants are rejected.
 static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) {
   // Scalar constant: acceptable unless it is opaque and opaques are excluded.
   if (auto *Scalar = dyn_cast<ConstantSDNode>(N))
     return !Scalar->isOpaque() || !NoOpaques;
 
   if (N.getOpcode() != ISD::BUILD_VECTOR)
     return false;
 
   const unsigned EltBits = N.getScalarValueSizeInBits();
   for (const SDValue &Elt : N->op_values()) {
     if (Elt.isUndef())
       continue;
 
     auto *EltConst = dyn_cast<ConstantSDNode>(Elt);
     if (!EltConst)
       return false;
     if (EltConst->getAPIntValue().getBitWidth() != EltBits)
       return false;
     if (EltConst->isOpaque() && NoOpaques)
       return false;
   }
   return true;
 }
 
 // Determines whether N is a constant null integer or a splat vector of a
 // constant null integer (with no undefs).
 // Build vector implicit truncation is not an issue for null values.
 static bool isNullConstantOrNullSplatConstant(SDValue N) {
   // TODO: may want to use peekThroughBitcast() here.
   ConstantSDNode *Splat = isConstOrConstSplat(N);
   return Splat && Splat->isNullValue();
 }
 
 // Determines whether N is the constant integer one or a splat vector of the
 // constant integer one (with no undefs).
 // Build vector implicit truncation is not permitted: the constant must have
 // exactly N's scalar bit width.
 static bool isOneConstantOrOneSplatConstant(SDValue N) {
   // TODO: may want to use peekThroughBitcast() here.
   ConstantSDNode *Splat = isConstOrConstSplat(N);
   if (!Splat)
     return false;
 
   const unsigned ScalarBits = N.getScalarValueSizeInBits();
   return Splat->isOne() && Splat->getAPIntValue().getBitWidth() == ScalarBits;
 }
 
 // Determines whether N, looked at through any bitcasts, is a constant integer
 // of all ones or a splat vector of such a constant (with no undefs).
 // Build vector implicit truncation is not permitted: the constant must have
 // exactly the scalar bit width of the value underneath the bitcasts.
 static bool isAllOnesConstantOrAllOnesSplatConstant(SDValue N) {
   N = peekThroughBitcast(N);
 
   ConstantSDNode *Splat = isConstOrConstSplat(N);
   if (!Splat)
     return false;
 
   const unsigned ScalarBits = N.getScalarValueSizeInBits();
   return Splat->isAllOnesValue() &&
          Splat->getAPIntValue().getBitWidth() == ScalarBits;
 }
 
 // Determines whether N is a BUILD_VECTOR composed entirely of constants
 // (integer or floating point), possibly mixed with undefs.
 static bool isAnyConstantBuildVector(const SDNode *N) {
   if (ISD::isBuildVectorOfConstantSDNodes(N))
     return true;
   return ISD::isBuildVectorOfConstantFPSDNodes(N);
 }
 
 /// Try to reassociate (Opc N0, N1) when one operand is itself an Opc node
 /// whose second operand is a constant, so that constants end up together.
 /// Returns the rewritten value, or a null SDValue if nothing was done.
 SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                                     SDValue N1) {
   EVT VT = N0.getValueType();
 
   // Case 1: N0 is (op x, c1).
   if (N0.getOpcode() == Opc) {
     SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
     if (C1) {
       if (SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
         // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
         SDValue Folded = DAG.FoldConstantArithmetic(Opc, DL, VT, C1, C2);
         if (!Folded.getNode())
           return SDValue();
         return DAG.getNode(Opc, DL, VT, N0.getOperand(0), Folded);
       }
       if (N0.hasOneUse()) {
         // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one
         // use
         SDValue Inner =
             DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
         if (!Inner.getNode())
           return SDValue();
         AddToWorklist(Inner.getNode());
         return DAG.getNode(Opc, DL, VT, Inner, N0.getOperand(1));
       }
     }
   }
 
   // Case 2: N1 is (op x, c1).
   if (N1.getOpcode() == Opc) {
     SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1.getOperand(1));
     if (C1) {
       if (SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
         // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
         SDValue Folded = DAG.FoldConstantArithmetic(Opc, DL, VT, C1, C2);
         if (!Folded.getNode())
           return SDValue();
         return DAG.getNode(Opc, DL, VT, N1.getOperand(0), Folded);
       }
       if (N1.hasOneUse()) {
         // reassoc. (op x, (op y, c1)) -> (op (op x, y), c1) iff x+c1 has one
         // use
         SDValue Inner =
             DAG.getNode(Opc, SDLoc(N0), VT, N0, N1.getOperand(0));
         if (!Inner.getNode())
           return SDValue();
         AddToWorklist(Inner.getNode());
         return DAG.getNode(Opc, DL, VT, Inner, N1.getOperand(1));
       }
     }
   }
 
   return SDValue();
 }
 
 /// Replace all uses of \p N with the values in \p To and return SDValue(N, 0).
 ///
 /// \p To must hold \p NumTo entries, one per result value of N. When
 /// \p AddTo is set, every non-null replacement node and its users are pushed
 /// onto the worklist. N is deleted afterwards if it is left with no uses.
 SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
                                bool AddTo) {
   assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
   ++NodesCombined;
   LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: ";
              To[0].getNode()->dump(&DAG);
              dbgs() << " and " << NumTo - 1 << " other values\n");
   // Null entries in To are tolerated; every other entry must have the type of
   // the result value it replaces.
   for (unsigned i = 0, e = NumTo; i != e; ++i)
     assert((!To[i].getNode() ||
             N->getValueType(i) == To[i].getValueType()) &&
            "Cannot combine value to value of different type!");
 
   // Nodes deleted during the replacement are removed from the worklist.
   WorklistRemover DeadNodes(*this);
   DAG.ReplaceAllUsesWith(N, To);
   if (AddTo) {
     // Push the new nodes and any users onto the worklist
     for (unsigned i = 0, e = NumTo; i != e; ++i) {
       if (To[i].getNode()) {
         AddToWorklist(To[i].getNode());
         AddUsersToWorklist(To[i].getNode());
       }
     }
   }
 
   // Finally, if the node is now dead, remove it from the graph.  The node
   // may not be dead if the replacement process recursively simplified to
   // something else needing this node.
   if (N->use_empty())
     deleteAndRecombine(N);
   return SDValue(N, 0);
 }
 
 /// Apply the replacement recorded in \p TLO: all uses of TLO.Old become uses
 /// of TLO.New, the new node and its users are queued, and the old node is
 /// deleted if it ends up unused.
 void DAGCombiner::
 CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
   // Replace all uses.  Nodes that become isomorphic to other nodes and get
   // deleted along the way are removed from the worklist by the listener.
   WorklistRemover DeadNodes(*this);
   DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);
 
   // Queue the new node and any (possibly new) users.
   SDNode *NewNode = TLO.New.getNode();
   AddToWorklist(NewNode);
   AddUsersToWorklist(NewNode);
 
   // The old node is removed from the graph only if it is now dead; it may
   // still be live if the replacement process recursively simplified to
   // something else needing it.
   SDNode *OldNode = TLO.Old.getNode();
   if (OldNode->use_empty())
     deleteAndRecombine(OldNode);
 }
 
 /// Ask the target lowering whether \p Op, or something it uses, can be
 /// simplified given that only the bits in \p Demanded are needed. If so,
 /// commit the replacement and return true; otherwise return false.
 bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
   KnownBits Known;
   TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
 
   const bool Simplified = TLI.SimplifyDemandedBits(Op, Demanded, Known, TLO);
   if (!Simplified)
     return false;
 
   // Op itself gets another visit.
   AddToWorklist(Op.getNode());
 
   // Swap the old value for the new one recorded in TLO.
   ++NodesCombined;
   LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
              dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
              dbgs() << '\n');
 
   CommitTargetLoweringOpt(TLO);
   return true;
 }
 
 /// Ask the target lowering whether the vector value \p Op, or something it
 /// uses, can be simplified given that only the elements in \p Demanded are
 /// needed. \p AssumeSingleUse is passed through to the target hook. If a
 /// simplification is found, commit the replacement and return true;
 /// otherwise return false.
 bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op, const APInt &Demanded,
                                              bool AssumeSingleUse) {
   APInt KnownUndef, KnownZero;
   TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
 
   const bool Simplified = TLI.SimplifyDemandedVectorElts(
       Op, Demanded, KnownUndef, KnownZero, TLO, 0, AssumeSingleUse);
   if (!Simplified)
     return false;
 
   // Op itself gets another visit.
   AddToWorklist(Op.getNode());
 
   // Swap the old value for the new one recorded in TLO.
   ++NodesCombined;
   LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
              dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
              dbgs() << '\n');
 
   CommitTargetLoweringOpt(TLO);
   return true;
 }
 
 /// Replace \p Load with the promoted load \p ExtLoad and delete \p Load.
 ///
 /// Uses of Load's value 0 are redirected to a TRUNCATE of ExtLoad's value 0
 /// back to Load's original type; uses of Load's value 1 are redirected to
 /// ExtLoad's value 1 (presumably the chain result -- confirm). The new
 /// truncate is queued on the worklist.
 void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
   SDLoc DL(Load);
   EVT VT = Load->getValueType(0);
   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0));
 
   LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: ";
              Trunc.getNode()->dump(&DAG); dbgs() << '\n');
   // Nodes deleted during the replacement are removed from the worklist.
   WorklistRemover DeadNodes(*this);
   DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
   DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
   // Load is deleted here; it must not be touched afterwards.
   deleteAndRecombine(Load);
   AddToWorklist(Trunc.getNode());
 }
 
 /// Produce a version of \p Op in the promoted type \p PVT.
 ///
 /// \p Replace is set to true only when Op is an unindexed load and a new
 /// extending load was created for it; it is false in every other case.
 /// Returns a null SDValue if no promoted form could be built.
 SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
   Replace = false;
   SDLoc DL(Op);
 
   // Unindexed loads are re-issued as an extending load of the promoted type.
   // A non-extending load becomes an EXTLOAD; otherwise the existing
   // extension kind is kept.
   if (ISD::isUNINDEXEDLoad(Op.getNode())) {
     LoadSDNode *LD = cast<LoadSDNode>(Op);
     ISD::LoadExtType ExtType = LD->getExtensionType();
     if (ISD::isNON_EXTLoad(LD))
       ExtType = ISD::EXTLOAD;
     Replace = true;
     return DAG.getExtLoad(ExtType, DL, PVT, LD->getChain(), LD->getBasePtr(),
                           LD->getMemoryVT(), LD->getMemOperand());
   }
 
   const unsigned Opc = Op.getOpcode();
   if (Opc == ISD::AssertSext) {
     // Promote the asserted value with a sign extension and re-assert.
     if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT))
       return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1));
   } else if (Opc == ISD::AssertZext) {
     // Promote the asserted value with a zero extension and re-assert.
     if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT))
       return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1));
   } else if (Opc == ISD::Constant) {
     // Constants of a byte-sized type are sign extended, others zero extended.
     const unsigned ExtOpc =
         Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
     return DAG.getNode(ExtOpc, DL, PVT, Op);
   }
 
   // Everything else (including a failed assert promotion above) falls back
   // to ANY_EXTEND, provided the target supports it for PVT.
   if (TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
     return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op);
   return SDValue();
 }
 
 /// Promote \p Op to \p PVT and wrap the result in a SIGN_EXTEND_INREG from
 /// Op's original type. Returns a null SDValue if SIGN_EXTEND_INREG is not
 /// legal for PVT or the operand could not be promoted.
 SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
   if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
     return SDValue();
 
   // Capture everything needed from Op up front: if Op is a load it is deleted
   // by ReplaceLoadWithPromotedLoad below.
   EVT OrigVT = Op.getValueType();
   SDLoc DL(Op);
 
   bool ReplaceLoad = false;
   SDValue Promoted = PromoteOperand(Op, PVT, ReplaceLoad);
   SDNode *PromotedNode = Promoted.getNode();
   if (!PromotedNode)
     return SDValue();
   AddToWorklist(PromotedNode);
 
   if (ReplaceLoad)
     ReplaceLoadWithPromotedLoad(Op.getNode(), PromotedNode);
 
   SDValue OrigVTNode = DAG.getValueType(OrigVT);
   return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Promoted.getValueType(),
                      Promoted, OrigVTNode);
 }
 
 /// Promote \p Op to \p PVT and zero-extend the result in-register from Op's
 /// original type. Returns a null SDValue if the operand could not be
 /// promoted.
 SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
   // Capture everything needed from Op up front: if Op is a load it is deleted
   // by ReplaceLoadWithPromotedLoad below.
   EVT OrigVT = Op.getValueType();
   SDLoc DL(Op);
 
   bool ReplaceLoad = false;
   SDValue Promoted = PromoteOperand(Op, PVT, ReplaceLoad);
   SDNode *PromotedNode = Promoted.getNode();
   if (!PromotedNode)
     return SDValue();
   AddToWorklist(PromotedNode);
 
   if (ReplaceLoad)
     ReplaceLoadWithPromotedLoad(Op.getNode(), PromotedNode);
 
   return DAG.getZeroExtendInReg(Promoted, DL, OrigVT);
 }
 
 /// Promote the specified integer binary operation if the target indicates it is
 /// beneficial. e.g. On x86, it's usually better to promote i16 operations to
 /// i32 since i16 instructions are longer.
 ///
 /// Only runs once operations are legal, and only on scalar integer types the
 /// target reports as undesirable for this opcode. On success the operation is
 /// rebuilt in the promoted type, truncated back to the original type,
 /// combined into \p Op via CombineTo, and \p Op is returned. A null SDValue
 /// is returned when no promotion happens, including when either operand
 /// cannot be promoted.
 SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
   if (!LegalOperations)
     return SDValue();
 
   EVT VT = Op.getValueType();
   if (VT.isVector() || !VT.isInteger())
     return SDValue();
 
   // If operation type is 'undesirable', e.g. i16 on x86, consider
   // promoting it.
   unsigned Opc = Op.getOpcode();
   if (TLI.isTypeDesirableForOp(Opc, VT))
     return SDValue();
 
   EVT PVT = VT;
   // Consult target whether it is a good idea to promote this operation and
   // what's the right type to promote it to.
   if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
     assert(PVT != VT && "Don't know what type to promote to!");
 
     LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
 
     // PromoteOperand returns a null value when the operand cannot be extended
     // to PVT (ANY_EXTEND not legal). Bail out in that case rather than
     // building a node from a null operand, as PromoteIntShiftOp does.
     bool Replace0 = false;
     SDValue N0 = Op.getOperand(0);
     SDValue NN0 = PromoteOperand(N0, PVT, Replace0);
     if (!NN0.getNode())
       return SDValue();
 
     bool Replace1 = false;
     SDValue N1 = Op.getOperand(1);
     SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
     if (!NN1.getNode())
       return SDValue();
     SDLoc DL(Op);
 
     SDValue RV =
         DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));
 
     // We are always replacing N0/N1's use in N and only need
     // additional replacements if there are additional uses.
     Replace0 &= !N0->hasOneUse();
     Replace1 &= (N0 != N1) && !N1->hasOneUse();
 
     // Combine Op here so it is preserved past replacements.
     CombineTo(Op.getNode(), RV);
 
     // If operands have a use ordering, make sure we deal with
     // predecessor first.
     if (Replace0 && Replace1 && N0.getNode()->isPredecessorOf(N1.getNode())) {
       std::swap(N0, N1);
       std::swap(NN0, NN1);
     }
 
     if (Replace0) {
       AddToWorklist(NN0.getNode());
       ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
     }
     if (Replace1) {
       AddToWorklist(NN1.getNode());
       ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
     }
     return Op;
   }
   return SDValue();
 }
 
 /// Promote the specified integer shift operation if the target indicates it is
 /// beneficial. e.g. On x86, it's usually better to promote i16 operations to
 /// i32 since i16 instructions are longer.
 ///
 /// Only runs once operations are legal, and only on scalar integer types the
 /// target reports as undesirable for this opcode. Returns the promoted shift
 /// truncated back to the original type, or a null SDValue when no promotion
 /// happens.
 SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
   if (!LegalOperations)
     return SDValue();
 
   EVT VT = Op.getValueType();
   if (VT.isVector() || !VT.isInteger())
     return SDValue();
 
   // If operation type is 'undesirable', e.g. i16 on x86, consider
   // promoting it.
   unsigned Opc = Op.getOpcode();
   if (TLI.isTypeDesirableForOp(Opc, VT))
     return SDValue();
 
   EVT PVT = VT;
   // Consult target whether it is a good idea to promote this operation and
   // what's the right type to promote it to.
   if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
     assert(PVT != VT && "Don't know what type to promote to!");
 
     LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
 
     // Only the shifted value (operand 0) is promoted; the shift amount is
     // used as is. SRA uses the sign-extending promotion, SRL the
     // zero-extending one, and any other opcode the generic promotion, which
     // is the only path that can set Replace.
     bool Replace = false;
     SDValue N0 = Op.getOperand(0);
     SDValue N1 = Op.getOperand(1);
     if (Opc == ISD::SRA)
       N0 = SExtPromoteOperand(N0, PVT);
     else if (Opc == ISD::SRL)
       N0 = ZExtPromoteOperand(N0, PVT);
     else
       N0 = PromoteOperand(N0, PVT, Replace);
 
     // The operand could not be promoted.
     if (!N0.getNode())
       return SDValue();
 
     SDLoc DL(Op);
     SDValue RV =
         DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));
 
     AddToWorklist(N0.getNode());
     if (Replace)
       ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
 
     // Deal with Op being deleted.
     if (Op && Op.getOpcode() != ISD::DELETED_NODE)
       return RV;
   }
   return SDValue();
 }
 
 /// Handle an extend node whose scalar integer result type the target finds
 /// undesirable and wants promoted: the node is rebuilt with the same opcode,
 /// result type and operand so the extend-of-extend folds can apply. Returns
 /// a null SDValue when nothing is done.
 SDValue DAGCombiner::PromoteExtend(SDValue Op) {
   if (!LegalOperations)
     return SDValue();
 
   // Scalar integers only.
   EVT VT = Op.getValueType();
   if (!VT.isInteger() || VT.isVector())
     return SDValue();
 
   // Only an 'undesirable' operation type, e.g. i16 on x86, is considered for
   // promotion.
   const unsigned Opc = Op.getOpcode();
   if (TLI.isTypeDesirableForOp(Opc, VT))
     return SDValue();
 
   // Ask the target whether promotion is a good idea and, if so, to which
   // type.
   EVT PVT = VT;
   if (!TLI.IsDesirableToPromoteOp(Op, PVT))
     return SDValue();
 
   assert(PVT != VT && "Don't know what type to promote to!");
   // fold (aext (aext x)) -> (aext x)
   // fold (aext (zext x)) -> (zext x)
   // fold (aext (sext x)) -> (sext x)
   LLVM_DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
   return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0));
 }
 
 /// Promote an unindexed scalar integer load whose type the target finds
 /// undesirable: the load is replaced by an extending load of the promoted
 /// type followed by a TRUNCATE back to the original type. Returns true if
 /// the load was replaced.
 bool DAGCombiner::PromoteLoad(SDValue Op) {
   if (!LegalOperations)
     return false;
 
   if (!ISD::isUNINDEXEDLoad(Op.getNode()))
     return false;
 
   // Scalar integers only.
   EVT VT = Op.getValueType();
   if (!VT.isInteger() || VT.isVector())
     return false;
 
   // Only an 'undesirable' operation type, e.g. i16 on x86, is considered for
   // promotion.
   if (TLI.isTypeDesirableForOp(Op.getOpcode(), VT))
     return false;
 
   // Ask the target whether promotion is a good idea and, if so, to which
   // type.
   EVT PVT = VT;
   if (!TLI.IsDesirableToPromoteOp(Op, PVT))
     return false;
 
   assert(PVT != VT && "Don't know what type to promote to!");
 
   SDLoc DL(Op);
   SDNode *N = Op.getNode();
   LoadSDNode *LD = cast<LoadSDNode>(N);
 
   // A non-extending load becomes an EXTLOAD; otherwise the existing
   // extension kind is kept.
   ISD::LoadExtType ExtType = LD->getExtensionType();
   if (ISD::isNON_EXTLoad(LD))
     ExtType = ISD::EXTLOAD;
 
   SDValue NewLD =
       DAG.getExtLoad(ExtType, DL, PVT, LD->getChain(), LD->getBasePtr(),
                      LD->getMemoryVT(), LD->getMemOperand());
   SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
 
   LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: ";
              Result.getNode()->dump(&DAG); dbgs() << '\n');
 
   // Redirect both results of the old load, then delete it.
   WorklistRemover DeadNodes(*this);
   DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
   DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
   deleteAndRecombine(N);
   AddToWorklist(Result.getNode());
   return true;
 }
 
 /// Delete \p N if it has no uses, together with any operands that become
 /// unused as a result, transitively. Returns false (and does nothing) if N
 /// still has uses, true otherwise.
 ///
 /// Deleted nodes are also removed from the worklist. Operands that lost a
 /// user but are still live are added to the worklist, since they may now
 /// have a single use and be subject to other combines.
 bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
   if (!N->use_empty())
     return false;
 
   SmallSetVector<SDNode *, 16> Pending;
   Pending.insert(N);
   while (!Pending.empty()) {
     SDNode *Cur = Pending.pop_back_val();
     if (!Cur)
       continue;
 
     // Still in use: keep it, but give it another visit.
     if (!Cur->use_empty()) {
       AddToWorklist(Cur);
       continue;
     }
 
     // Dead: examine its operands next, then delete it.
     for (const SDValue &Operand : Cur->op_values())
       Pending.insert(Operand.getNode());
 
     removeFromWorklist(Cur);
     DAG.DeleteNode(Cur);
   }
   return true;
 }
 
 //===----------------------------------------------------------------------===//
 //  Main DAG Combiner implementation
 //===----------------------------------------------------------------------===//
 
 /// Main driver: combine every node in the DAG at combine level \p AtLevel.
 ///
 /// All nodes are seeded onto the worklist, then popped one at a time. Dead
 /// nodes are deleted, live ones are (after DAG legalization) re-legalized and
 /// then handed to combine(). When combine() produces a different node, uses
 /// of the old node are redirected to it and the affected nodes are re-queued.
 /// Finally the DAG root is refreshed and remaining dead nodes are removed.
 void DAGCombiner::Run(CombineLevel AtLevel) {
   // set the instance variables, so that the various visit routines may use it.
   Level = AtLevel;
   LegalOperations = Level >= AfterLegalizeVectorOps;
   LegalTypes = Level >= AfterLegalizeTypes;
 
   // Add all the dag nodes to the worklist.
   for (SDNode &Node : DAG.allnodes())
     AddToWorklist(&Node);
 
   // Create a dummy node (which is not added to allnodes), that adds a reference
   // to the root node, preventing it from being deleted, and tracking any
   // changes of the root.
   HandleSDNode Dummy(DAG.getRoot());
 
   // While the worklist isn't empty, find a node and try to combine it.
   while (!WorklistMap.empty()) {
     SDNode *N;
     // The Worklist holds the SDNodes in order, but it may contain null entries.
     do {
       N = Worklist.pop_back_val();
     } while (!N);
 
     // Every non-null worklist entry must have a matching map entry; the
     // (void) cast avoids an unused-variable warning when asserts are off.
     bool GoodWorklistEntry = WorklistMap.erase(N);
     (void)GoodWorklistEntry;
     assert(GoodWorklistEntry &&
            "Found a worklist entry without a corresponding map entry!");
 
     // If N has no uses, it is dead.  Make sure to revisit all N's operands once
     // N is deleted from the DAG, since they too may now be dead or may have a
     // reduced number of uses, allowing other xforms.
     if (recursivelyDeleteUnusedNodes(N))
       continue;
 
     // Nodes deleted while processing N are removed from the worklist.
     WorklistRemover DeadNodes(*this);
 
     // If this combine is running after legalizing the DAG, re-legalize any
     // nodes pulled off the worklist.
     if (Level == AfterLegalizeDAG) {
       SmallSetVector<SDNode *, 16> UpdatedNodes;
       bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
 
       for (SDNode *LN : UpdatedNodes) {
         AddToWorklist(LN);
         AddUsersToWorklist(LN);
       }
       // N is not combined in this iteration when LegalizeOp reports it as no
       // longer valid.
       if (!NIsValid)
         continue;
     }
 
     LLVM_DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
 
     // Add any operands of the new node which have not yet been combined to the
     // worklist as well. Because the worklist uniques things already, this
     // won't repeatedly process the same operand.
     CombinedNodes.insert(N);
     for (const SDValue &ChildN : N->op_values())
       if (!CombinedNodes.count(ChildN.getNode()))
         AddToWorklist(ChildN.getNode());
 
     SDValue RV = combine(N);
 
     // A null result means no combine applied to N.
     if (!RV.getNode())
       continue;
 
     ++NodesCombined;
 
     // If we get back the same node we passed in, rather than a new node or
     // zero, we know that the node must have defined multiple values and
     // CombineTo was used.  Since CombineTo takes care of the worklist
     // mechanics for us, we have no work to do in this case.
     if (RV.getNode() == N)
       continue;
 
     assert(N->getOpcode() != ISD::DELETED_NODE &&
            RV.getOpcode() != ISD::DELETED_NODE &&
            "Node was deleted but visit returned new node!");
 
     LLVM_DEBUG(dbgs() << " ... into: "; RV.getNode()->dump(&DAG));
 
     // With matching result counts the whole node is replaced; otherwise N
     // must have a single result, which is replaced by the value RV.
     if (N->getNumValues() == RV.getNode()->getNumValues())
       DAG.ReplaceAllUsesWith(N, RV.getNode());
     else {
       assert(N->getValueType(0) == RV.getValueType() &&
              N->getNumValues() == 1 && "Type mismatch");
       DAG.ReplaceAllUsesWith(N, &RV);
     }
 
     // Push the new node and any users onto the worklist
     AddToWorklist(RV.getNode());
     AddUsersToWorklist(RV.getNode());
 
     // Finally, if the node is now dead, remove it from the graph.  The node
     // may not be dead if the replacement process recursively simplified to
     // something else needing this node. This will also take care of adding any
     // operands which have lost a user to the worklist.
     recursivelyDeleteUnusedNodes(N);
   }
 
   // If the root changed (e.g. it was a dead load, update the root).
   DAG.setRoot(Dummy.getValue());
   DAG.RemoveDeadNodes();
 }
 
 /// Dispatch \p N to the visit routine that matches its opcode.
 ///
 /// \param N the node to visit.
 /// \returns whatever the selected visitXXX routine returns. Opcodes that have
 /// no case below fall out of the switch and produce a null SDValue.
 SDValue DAGCombiner::visit(SDNode *N) {
   switch (N->getOpcode()) {
   // Opcodes without a dedicated visitor fall through to the null return below.
   default: break;
   case ISD::TokenFactor:        return visitTokenFactor(N);
   case ISD::MERGE_VALUES:       return visitMERGE_VALUES(N);
   case ISD::ADD:                return visitADD(N);
   case ISD::SUB:                return visitSUB(N);
   case ISD::ADDC:               return visitADDC(N);
   case ISD::UADDO:              return visitUADDO(N);
   case ISD::SUBC:               return visitSUBC(N);
   case ISD::USUBO:              return visitUSUBO(N);
   case ISD::ADDE:               return visitADDE(N);
   case ISD::ADDCARRY:           return visitADDCARRY(N);
   case ISD::SUBE:               return visitSUBE(N);
   case ISD::SUBCARRY:           return visitSUBCARRY(N);
   case ISD::MUL:                return visitMUL(N);
   case ISD::SDIV:               return visitSDIV(N);
   case ISD::UDIV:               return visitUDIV(N);
   // Signed and unsigned remainder share one visitor.
   case ISD::SREM:
   case ISD::UREM:               return visitREM(N);
   case ISD::MULHU:              return visitMULHU(N);
   case ISD::MULHS:              return visitMULHS(N);
   case ISD::SMUL_LOHI:          return visitSMUL_LOHI(N);
   case ISD::UMUL_LOHI:          return visitUMUL_LOHI(N);
   case ISD::SMULO:              return visitSMULO(N);
   case ISD::UMULO:              return visitUMULO(N);
   // All four integer min/max opcodes share one visitor.
   case ISD::SMIN:
   case ISD::SMAX:
   case ISD::UMIN:
   case ISD::UMAX:               return visitIMINMAX(N);
   case ISD::AND:                return visitAND(N);
   case ISD::OR:                 return visitOR(N);
   case ISD::XOR:                return visitXOR(N);
   case ISD::SHL:                return visitSHL(N);
   case ISD::SRA:                return visitSRA(N);
   case ISD::SRL:                return visitSRL(N);
   // Both rotate directions share one visitor.
   case ISD::ROTR:
   case ISD::ROTL:               return visitRotate(N);
   case ISD::ABS:                return visitABS(N);
   case ISD::BSWAP:              return visitBSWAP(N);
   case ISD::BITREVERSE:         return visitBITREVERSE(N);
   case ISD::CTLZ:               return visitCTLZ(N);
   case ISD::CTLZ_ZERO_UNDEF:    return visitCTLZ_ZERO_UNDEF(N);
   case ISD::CTTZ:               return visitCTTZ(N);
   case ISD::CTTZ_ZERO_UNDEF:    return visitCTTZ_ZERO_UNDEF(N);
   case ISD::CTPOP:              return visitCTPOP(N);
   case ISD::SELECT:             return visitSELECT(N);
   case ISD::VSELECT:            return visitVSELECT(N);
   case ISD::SELECT_CC:          return visitSELECT_CC(N);
   case ISD::SETCC:              return visitSETCC(N);
   case ISD::SETCCCARRY:         return visitSETCCCARRY(N);
   case ISD::SIGN_EXTEND:        return visitSIGN_EXTEND(N);
   case ISD::ZERO_EXTEND:        return visitZERO_EXTEND(N);
   case ISD::ANY_EXTEND:         return visitANY_EXTEND(N);
   // Both assert-extension opcodes share one visitor.
   case ISD::AssertSext:
   case ISD::AssertZext:         return visitAssertExt(N);
   case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);
   case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
   case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
   case ISD::TRUNCATE:           return visitTRUNCATE(N);
   case ISD::BITCAST:            return visitBITCAST(N);
   case ISD::BUILD_PAIR:         return visitBUILD_PAIR(N);
   case ISD::FADD:               return visitFADD(N);
   case ISD::FSUB:               return visitFSUB(N);
   case ISD::FMUL:               return visitFMUL(N);
   case ISD::FMA:                return visitFMA(N);
   case ISD::FDIV:               return visitFDIV(N);
   case ISD::FREM:               return visitFREM(N);
   case ISD::FSQRT:              return visitFSQRT(N);
   case ISD::FCOPYSIGN:          return visitFCOPYSIGN(N);
   case ISD::SINT_TO_FP:         return visitSINT_TO_FP(N);
   case ISD::UINT_TO_FP:         return visitUINT_TO_FP(N);
   case ISD::FP_TO_SINT:         return visitFP_TO_SINT(N);
   case ISD::FP_TO_UINT:         return visitFP_TO_UINT(N);
   case ISD::FP_ROUND:           return visitFP_ROUND(N);
   case ISD::FP_ROUND_INREG:     return visitFP_ROUND_INREG(N);
   case ISD::FP_EXTEND:          return visitFP_EXTEND(N);
   case ISD::FNEG:               return visitFNEG(N);
   case ISD::FABS:               return visitFABS(N);
   case ISD::FFLOOR:             return visitFFLOOR(N);
   case ISD::FMINNUM:            return visitFMINNUM(N);
   case ISD::FMAXNUM:            return visitFMAXNUM(N);
   case ISD::FCEIL:              return visitFCEIL(N);
   case ISD::FTRUNC:             return visitFTRUNC(N);
   case ISD::BRCOND:             return visitBRCOND(N);
   case ISD::BR_CC:              return visitBR_CC(N);
   case ISD::LOAD:               return visitLOAD(N);
   case ISD::STORE:              return visitSTORE(N);
   case ISD::INSERT_VECTOR_ELT:  return visitINSERT_VECTOR_ELT(N);
   case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
   case ISD::BUILD_VECTOR:       return visitBUILD_VECTOR(N);
   case ISD::CONCAT_VECTORS:     return visitCONCAT_VECTORS(N);
   case ISD::EXTRACT_SUBVECTOR:  return visitEXTRACT_SUBVECTOR(N);
   case ISD::VECTOR_SHUFFLE:     return visitVECTOR_SHUFFLE(N);
   case ISD::SCALAR_TO_VECTOR:   return visitSCALAR_TO_VECTOR(N);
   case ISD::INSERT_SUBVECTOR:   return visitINSERT_SUBVECTOR(N);
   case ISD::MGATHER:            return visitMGATHER(N);
   case ISD::MLOAD:              return visitMLOAD(N);
   case ISD::MSCATTER:           return visitMSCATTER(N);
   case ISD::MSTORE:             return visitMSTORE(N);
   case ISD::FP_TO_FP16:         return visitFP_TO_FP16(N);
   case ISD::FP16_TO_FP:         return visitFP16_TO_FP(N);
   }
   return SDValue();
 }
 
 /// Run the combine pipeline on \p N and return the replacement value, if any.
 ///
 /// The stages below are tried in order; each later stage only runs when every
 /// earlier one produced a null SDValue:
 ///   1. the generic opcode visitor (visit),
 ///   2. the target's PerformDAGCombine hook,
 ///   3. integer promotion of the operation,
 ///   4. reuse of an already existing node with commuted operands.
 SDValue DAGCombiner::combine(SDNode *N) {
   // Stage 1: generic, opcode-specific combines.
   SDValue Result = visit(N);
 
   // Stage 2: if nothing happened, try a target-specific DAG combine.
   if (!Result.getNode()) {
     assert(N->getOpcode() != ISD::DELETED_NODE &&
            "Node was deleted but visit returned NULL!");
 
     const bool IsPastBuiltinOps = N->getOpcode() >= ISD::BUILTIN_OP_END;
     if (IsPastBuiltinOps ||
         TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
       // Expose the DAG combiner to the target combiner impls.
       TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, Level, false, this);
       Result = TLI.PerformDAGCombine(N, DagCombineInfo);
     }
   }
 
   // Stage 3: if still nothing happened, try promoting the operation.
   if (!Result.getNode()) {
     const SDValue Op(N, 0);
     switch (N->getOpcode()) {
     case ISD::ADD:
     case ISD::SUB:
     case ISD::MUL:
     case ISD::AND:
     case ISD::OR:
     case ISD::XOR:
       Result = PromoteIntBinOp(Op);
       break;
     case ISD::SHL:
     case ISD::SRA:
     case ISD::SRL:
       Result = PromoteIntShiftOp(Op);
       break;
     case ISD::SIGN_EXTEND:
     case ISD::ZERO_EXTEND:
     case ISD::ANY_EXTEND:
       Result = PromoteExtend(Op);
       break;
     case ISD::LOAD:
       // PromoteLoad only reports success; on success N itself is returned.
       if (PromoteLoad(Op))
         Result = Op;
       break;
     default:
       break;
     }
   }
 
   // Stage 4 only applies when nothing has fired so far and N is a
   // single-result commutative binary node.
   if (Result.getNode())
     return Result;
   if (!TLI.isCommutativeBinOp(N->getOpcode()) || N->getNumValues() != 1)
     return Result;
 
   // Try to eliminate N if the commuted version is already present in the DAG.
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
   // Constant operands are canonicalized to RHS.
   const bool WorthLookingUp =
       LHS != RHS && (isa<ConstantSDNode>(LHS) || !isa<ConstantSDNode>(RHS));
   if (WorthLookingUp) {
     SDValue Swapped[] = {RHS, LHS};
     if (SDNode *Existing = DAG.getNodeIfExists(
             N->getOpcode(), N->getVTList(), Swapped, N->getFlags()))
       return SDValue(Existing, 0);
   }
 
   return Result;
 }
 
 /// Return the input chain of \p N, i.e. one of its operands whose value type
 /// is MVT::Other, or a null SDValue when there is none.
 ///
 /// The first operand is checked first, then the last one, and only then the
 /// operands in between (in increasing index order).
 static SDValue getInputChainForNode(SDNode *N) {
   const unsigned OperandCount = N->getNumOperands();
   if (OperandCount == 0)
     return SDValue();
 
   auto IsChainOperand = [N](unsigned Idx) {
     return N->getOperand(Idx).getValueType() == MVT::Other;
   };
 
   if (IsChainOperand(0))
     return N->getOperand(0);
 
   const unsigned LastIdx = OperandCount - 1;
   if (IsChainOperand(LastIdx))
     return N->getOperand(LastIdx);
 
   for (unsigned Idx = 1; Idx < LastIdx; ++Idx) {
     if (IsChainOperand(Idx))
       return N->getOperand(Idx);
   }
 
   return SDValue();
 }
 
 /// Simplify the TokenFactor node \p N.
 ///
 /// The routine works in three steps:
 ///   1. A two-operand shortcut: if one operand's input chain is the other
 ///      operand, the other operand is redundant.
 ///   2. Flattening: single-use TokenFactor operands are merged into this one,
 ///      EntryToken operands and duplicate operands are dropped.
 ///   3. Pruning: operands that are reached by walking up the chain of another
 ///      operand are dropped.
 ///
 /// \returns the replacement chain value when anything changed, or a null
 /// SDValue when \p N is left as it is. Steps 2 and 3 are skipped when
 /// OptLevel is CodeGenOpt::None.
 SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
   // If N has two operands, where one has an input chain equal to the other,
   // the 'other' chain is redundant.
   if (N->getNumOperands() == 2) {
     if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
       return N->getOperand(0);
     if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
       return N->getOperand(1);
   }
 
   // Don't simplify token factors if optnone.
   if (OptLevel == CodeGenOpt::None)
     return SDValue();
 
   SmallVector<SDNode *, 8> TFs;     // List of token factors to visit.
   SmallVector<SDValue, 8> Ops;      // Ops for replacing token factor.
   SmallPtrSet<SDNode*, 16> SeenOps; // Nodes already present in Ops.
   bool Changed = false;             // If we should replace this token factor.
 
   // Start out with this token factor.
   TFs.push_back(N);
 
   // Iterate through token factors.  The TFs grows when new token factors are
   // encountered.
   for (unsigned i = 0; i < TFs.size(); ++i) {
     SDNode *TF = TFs[i];
 
     // Check each of the operands.
     for (const SDValue &Op : TF->op_values()) {
       switch (Op.getOpcode()) {
       case ISD::EntryToken:
         // Entry tokens don't need to be added to the list. They are
         // redundant.
         Changed = true;
         break;
 
       case ISD::TokenFactor:
         // Only a token factor with a single use that is not queued yet is
         // merged; any other token factor is kept as an ordinary operand.
         if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
           // Queue up for processing.
           TFs.push_back(Op.getNode());
           // Clean up in case the token factor is removed.
           AddToWorklist(Op.getNode());
           Changed = true;
           break;
         }
         LLVM_FALLTHROUGH;
 
       default:
         // Only add if it isn't already in the list.
         if (SeenOps.insert(Op.getNode()).second)
           Ops.push_back(Op);
         else
           Changed = true;
         break;
       }
     }
   }
 
   // Remove Nodes that are chained to another node in the list. Do so
   // by walking up chains breadth-first stopping when we've seen
   // another operand. In general we must climb to the EntryNode, but we can exit
   // early if we find all remaining work is associated with just one operand as
   // no further pruning is possible.
 
   // List of nodes to search through and original Ops from which they originate.
   SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
   SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
   SmallPtrSet<SDNode *, 16> SeenChains;
   bool DidPruneOps = false;
 
   // Seed the search with every operand; each one starts with one unit of
   // pending work, and NumLeftToConsider ends up as the number of operands.
   unsigned NumLeftToConsider = 0;
   for (const SDValue &Op : Ops) {
     Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
     OpWorkCount.push_back(1);
   }
 
   // NOTE: from here on this local lambda hides the single-argument
   // AddToWorklist that was called earlier in this function.
   //
   // CurIdx is the worklist position currently being processed, Op is the
   // chain predecessor that was just reached, and OpNumber is the index (into
   // Ops) of the operand whose search reached it.
   auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
     // If this is an Op, we can remove the op from the list. Remark any
     // search associated with it as from the current OpNumber.
     if (SeenOps.count(Op) != 0) {
       Changed = true;
       DidPruneOps = true;
       // Linear search for the position of Op inside Ops.
       unsigned OrigOpNumber = 0;
       while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
         OrigOpNumber++;
       assert((OrigOpNumber != Ops.size()) &&
              "expected to find TokenFactor Operand");
       // Re-mark worklist from OrigOpNumber to OpNumber
       for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
         if (Worklist[i].second == OrigOpNumber) {
           Worklist[i].second = OpNumber;
         }
       }
       // The pending work of the absorbed operand now belongs to OpNumber.
       OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
       OpWorkCount[OrigOpNumber] = 0;
       NumLeftToConsider--;
     }
     // Add if it's a new chain
     if (SeenChains.insert(Op).second) {
       OpWorkCount[OpNumber]++;
       Worklist.push_back(std::make_pair(Op, OpNumber));
     }
   };
 
   // At most the first 1024 worklist entries are examined, which bounds the
   // amount of chain walking done here.
   for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
     // We need to consider at least 2 Ops to prune.
     if (NumLeftToConsider <= 1)
       break;
     auto CurNode = Worklist[i].first;
     auto CurOpNumber = Worklist[i].second;
     assert((OpWorkCount[CurOpNumber] > 0) &&
            "Node should not appear in worklist");
     switch (CurNode->getOpcode()) {
     case ISD::EntryToken:
       // Hitting EntryToken is the only way for the search to terminate without
       // hitting
       // another operand's search. Prevent us from marking this operand
       // considered.
       NumLeftToConsider++;
       break;
     case ISD::TokenFactor:
       // Every operand of a token factor is a chain predecessor.
       for (const SDValue &Op : CurNode->op_values())
         AddToWorklist(i, Op.getNode(), CurOpNumber);
       break;
     case ISD::CopyFromReg:
     case ISD::CopyToReg:
       // For these nodes operand 0 is followed as the chain.
       AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
       break;
     default:
       // Other nodes are only followed when they are memory nodes; the walk
       // stops at anything else.
       if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
         AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
       break;
     }
     // This worklist entry is done; once an operand has no pending work left
     // it no longer needs to be considered.
     OpWorkCount[CurOpNumber]--;
     if (OpWorkCount[CurOpNumber] == 0)
       NumLeftToConsider--;
   }
 
   // If we've changed things around then replace token factor.
   if (Changed) {
     SDValue Result;
     if (Ops.empty()) {
       // The entry token is the only possible outcome.
       Result = DAG.getEntryNode();
     } else {
       if (DidPruneOps) {
         SmallVector<SDValue, 8> PrunedOps;
         // Keep only the operands that were not reached while walking up the
         // chain of another operand.
         for (const SDValue &Op : Ops) {
           if (SeenChains.count(Op.getNode()) == 0)
             PrunedOps.push_back(Op);
         }
         Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps);
       } else {
         Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
       }
     }
     return Result;
   }
   return SDValue();
 }
 
 /// MERGE_VALUES can always be eliminated: every result i of \p N is replaced
 /// by operand i, after which \p N is deleted.
 ///
 /// \returns N itself (result 0) so that the caller does not look at it again.
 SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
   WorklistRemover DeadNodes(*this);
 
   // Queue the users first so that they can be tried again once they have
   // their new operands.
   AddUsersToWorklist(N);
 
   // Replacing results may cause a different MERGE_VALUES to suddenly be CSE'd
   // with N, and carry its uses with it. Keep forwarding until no uses remain,
   // to ensure that the node can be safely deleted.
   while (true) {
     const unsigned NumOperands = N->getNumOperands();
     for (unsigned ResNo = 0; ResNo != NumOperands; ++ResNo)
       DAG.ReplaceAllUsesOfValueWith(SDValue(N, ResNo), N->getOperand(ResNo));
 
     if (N->use_empty())
       break;
   }
 
   deleteAndRecombine(N);
   return SDValue(N, 0);   // Return N so it doesn't get rechecked!
 }
 
 /// Return \p N as a ConstantSDNode pointer when it is a ConstantSDNode whose
 /// isOpaque() is false; return nullptr in every other case.
 static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
   if (auto *C = dyn_cast<ConstantSDNode>(N)) {
     if (!C->isOpaque())
       return C;
   }
   return nullptr;
 }
 
 /// Try to pull the binary operator \p BO into a select-of-constants operand:
 ///   binop (select Cond, CT, CF), CBO
 ///     --> select Cond, (binop CT, CBO), (binop CF, CBO)
 ///
 /// The select must be an ISD::SELECT with a single use and may be either
 /// operand of \p BO (operand 0 is preferred).
 ///
 /// \returns the new select, or a null SDValue when the fold does not apply.
 SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
   auto BinOpcode = BO->getOpcode();
   assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB ||
           BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV ||
           BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM ||
           BinOpcode == ISD::UREM || BinOpcode == ISD::AND ||
           BinOpcode == ISD::OR || BinOpcode == ISD::XOR ||
           BinOpcode == ISD::SHL || BinOpcode == ISD::SRL ||
           BinOpcode == ISD::SRA || BinOpcode == ISD::FADD ||
           BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL ||
           BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) &&
          "Unexpected binary operator");
 
   // Don't do this unless the old select is going away. We want to eliminate the
   // binary operator, not replace a binop with a select.
   // TODO: Handle ISD::SELECT_CC.
   auto IsOneUseSelect = [&](SDValue V) {
     return V.getOpcode() == ISD::SELECT && V.hasOneUse();
   };
 
   unsigned SelOpNo = 0;
   SDValue Sel = BO->getOperand(0);
   if (!IsOneUseSelect(Sel)) {
     // Operand 0 does not qualify; fall back to operand 1.
     SelOpNo = 1;
     Sel = BO->getOperand(1);
     if (!IsOneUseSelect(Sel))
       return SDValue();
   }
 
   // Integer constants (opaque ones excluded) or FP constants, scalar or
   // vector.
   auto IsFoldableConstant = [&](SDValue V) {
     return isConstantOrConstantVector(V, true) ||
            isConstantFPBuildVectorOrConstantFP(V);
   };
 
   SDValue CT = Sel.getOperand(1);
   if (!IsFoldableConstant(CT))
     return SDValue();
 
   SDValue CF = Sel.getOperand(2);
   if (!IsFoldableConstant(CF))
     return SDValue();
 
   // Bail out if any constants are opaque because we can't constant fold those.
   // The exception is "and" and "or" with either 0 or -1 in which case we can
   // propagate non constant operands into select. I.e.:
   // and (select Cond, 0, -1), X --> select Cond, 0, X
   // or X, (select Cond, -1, 0) --> select Cond, -1, X
   auto IsZeroOrAllOnes = [&](SDValue V) {
     return isNullConstantOrNullSplatConstant(V) ||
            isAllOnesConstantOrAllOnesSplatConstant(V);
   };
   const bool IsAndOrOr = BinOpcode == ISD::AND || BinOpcode == ISD::OR;
   bool CanFoldNonConst =
       IsAndOrOr && IsZeroOrAllOnes(CT) && IsZeroOrAllOnes(CF);
 
   // The operand of BO that is not the select.
   SDValue CBO = BO->getOperand(SelOpNo ^ 1);
   if (!CanFoldNonConst && !IsFoldableConstant(CBO))
     return SDValue();
 
   EVT VT = Sel.getValueType();
 
   // In case of shift value and shift amount may have different VT. For instance
   // on x86 shift amount is i8 regardless of LHS type. Bail out if we have
   // swapped operands and value types do not match. NB: x86 is fine if operands
   // are not swapped with shift amount VT being not bigger than shifted value.
   // TODO: that is possible to check for a shift operation, correct VTs and
   // still perform optimization on x86 if needed.
   if (SelOpNo && VT != CBO.getValueType())
     return SDValue();
 
   // We have a select-of-constants followed by a binary operator with a
   // constant. Eliminate the binop by pulling the constant math into the select.
   // Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO
   SDLoc DL(Sel);
 
   // Apply the binop to one select arm, keeping the original operand order.
   auto ApplyBinOp = [&](SDValue SelArm) {
     return SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, SelArm)
                    : DAG.getNode(BinOpcode, DL, VT, SelArm, CBO);
   };
   // Unless non-constant folding is allowed, the new arm has to be undef or a
   // constant again.
   auto IsAcceptableArm = [&](SDValue NewArm) {
     return CanFoldNonConst || NewArm.isUndef() || IsFoldableConstant(NewArm);
   };
 
   SDValue NewCT = ApplyBinOp(CT);
   if (!IsAcceptableArm(NewCT))
     return SDValue();
 
   SDValue NewCF = ApplyBinOp(CF);
   if (!IsAcceptableArm(NewCF))
     return SDValue();
 
   return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
 }
 
 /// Fold an add/sub of a constant and a zero-extended "low bit is clear" test
 /// into the opposite operation on the low bit itself:
 ///   add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1))
 ///   sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1))
 ///
 /// \param N an ISD::ADD or ISD::SUB node.
 /// \returns the replacement value, or a null SDValue when the pattern does
 /// not match.
 static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
   assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
          "Expecting add or sub");
 
   // Match a constant operand and a zext operand for the math instruction:
   // add Z, C
   // sub C, Z
   const bool IsAdd = N->getOpcode() == ISD::ADD;
   const unsigned ConstIdx = IsAdd ? 1 : 0;
   SDValue C = N->getOperand(ConstIdx);
   SDValue Z = N->getOperand(ConstIdx ^ 1);
   auto *CN = dyn_cast<ConstantSDNode>(C);
   if (!CN)
     return SDValue();
   if (Z.getOpcode() != ISD::ZERO_EXTEND)
     return SDValue();
 
   // Match the zext operand as a setcc of a boolean.
   SDValue SetCC = Z.getOperand(0);
   if (SetCC.getOpcode() != ISD::SETCC || SetCC.getValueType() != MVT::i1)
     return SDValue();
 
   // Match the compare as: setcc (X & 1), 0, eq.
   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
   if (CC != ISD::SETEQ)
     return SDValue();
   if (!isNullConstant(SetCC.getOperand(1)))
     return SDValue();
   SDValue MaskedVal = SetCC.getOperand(0);
   if (MaskedVal.getOpcode() != ISD::AND ||
       !isOneConstant(MaskedVal.getOperand(1)))
     return SDValue();
 
   // We are adding/subtracting a constant and an inverted low bit. Turn that
   // into a subtract/add of the low bit with incremented/decremented constant.
   EVT VT = C.getValueType();
   SDLoc DL(N);
   SDValue LowBit = DAG.getZExtOrTrunc(MaskedVal, DL, VT);
   const APInt &OldC = CN->getAPIntValue();
   SDValue C1 = DAG.getConstant(IsAdd ? OldC + 1 : OldC - 1, DL, VT);
   return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
 }
 
 /// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
 /// a shift and add with a different constant:
 ///   add (srl (not X), 31), C --> add (sra X, 31), (C + 1)
 ///   sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1)
 ///
 /// \param N an ISD::ADD or ISD::SUB node.
 /// \returns the replacement value, or a null SDValue when the pattern does
 /// not match.
 static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
   assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
          "Expecting add or sub");
 
   // We need a constant operand for the add/sub, and the other operand is a
   // logical shift right: add (srl), C or sub C, (srl).
   const bool IsAdd = N->getOpcode() == ISD::ADD;
   const unsigned ConstIdx = IsAdd ? 1 : 0;
   SDValue ConstantOp = N->getOperand(ConstIdx);
   SDValue ShiftOp = N->getOperand(ConstIdx ^ 1);
   ConstantSDNode *C = isConstOrConstSplat(ConstantOp);
   if (!C)
     return SDValue();
   if (ShiftOp.getOpcode() != ISD::SRL)
     return SDValue();
 
   // The shift must be of a 'not' value that has no other use.
   // TODO: Use isBitwiseNot() if it works with vectors.
   SDValue Not = ShiftOp.getOperand(0);
   const bool IsOneUseNot =
       Not.hasOneUse() && Not.getOpcode() == ISD::XOR &&
       isAllOnesConstantOrAllOnesSplatConstant(Not.getOperand(1));
   if (!IsOneUseNot)
     return SDValue();
 
   // The shift must be moving the sign bit to the least-significant-bit.
   EVT VT = ShiftOp.getValueType();
   SDValue ShAmt = ShiftOp.getOperand(1);
   ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
   const unsigned SignBitIdx = VT.getScalarSizeInBits() - 1;
   if (!ShAmtC || ShAmtC->getZExtValue() != SignBitIdx)
     return SDValue();
 
   // Eliminate the 'not' by adjusting the shift and add/sub constant.
   SDLoc DL(N);
   const unsigned ShOpcode = IsAdd ? ISD::SRA : ISD::SRL;
   SDValue NewShift = DAG.getNode(ShOpcode, DL, VT, Not.getOperand(0), ShAmt);
   const APInt &OldC = C->getAPIntValue();
   SDValue NewConst = DAG.getConstant(IsAdd ? OldC + 1 : OldC - 1, DL, VT);
   return DAG.getNode(ISD::ADD, DL, VT, NewShift, NewConst);
 }
 
 /// Try to simplify the ISD::ADD node \p N.
 ///
 /// The folds below are tried strictly in the order written and the first one
 /// that applies wins, so earlier folds take priority over later ones.
 ///
 /// \returns the replacement value, SDValue(N, 0) when SimplifyDemandedBits
 /// reports success, or a null SDValue when no fold applies.
 SDValue DAGCombiner::visitADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
   SDLoc DL(N);
 
   // fold vector ops
   if (VT.isVector()) {
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
     // fold (add x, 0) -> x, vector edition
     if (ISD::isBuildVectorAllZeros(N1.getNode()))
       return N0;
     if (ISD::isBuildVectorAllZeros(N0.getNode()))
       return N1;
   }
 
   // fold (add x, undef) -> undef
   if (N0.isUndef())
     return N0;
 
   if (N1.isUndef())
     return N1;
 
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
     // canonicalize constant to RHS
     if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
       return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
     // fold (add c1, c2) -> c1+c2
     return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, N0.getNode(),
                                       N1.getNode());
   }
 
   // fold (add x, 0) -> x
   if (isNullConstant(N1))
     return N0;
 
   // The folds in this block all require a non-opaque constant (scalar or
   // vector) on the right-hand side.
   if (isConstantOrConstantVector(N1, /* NoOpaque */ true)) {
     // fold ((c1-A)+c2) -> (c1+c2)-A
     if (N0.getOpcode() == ISD::SUB &&
         isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) {
       // FIXME: Adding 2 constants should be handled by FoldConstantArithmetic.
       return DAG.getNode(ISD::SUB, DL, VT,
                          DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
                          N0.getOperand(1));
     }
 
     // add (sext i1 X), 1 -> zext (not i1 X)
     // We don't transform this pattern:
     //   add (zext i1 X), -1 -> sext (not i1 X)
     // because most (?) targets generate better code for the zext form.
     if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
         isOneConstantOrOneSplatConstant(N1)) {
       SDValue X = N0.getOperand(0);
       // Once LegalOperations is set, the XOR and ZERO_EXTEND created below
       // must themselves be legal.
       if ((!LegalOperations ||
            (TLI.isOperationLegal(ISD::XOR, X.getValueType()) &&
             TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) &&
           X.getScalarValueSizeInBits() == 1) {
         SDValue Not = DAG.getNOT(DL, X, X.getValueType());
         return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
       }
     }
 
     // Undo the add -> or combine to merge constant offsets from a frame index.
     if (N0.getOpcode() == ISD::OR &&
         isa<FrameIndexSDNode>(N0.getOperand(0)) &&
         isa<ConstantSDNode>(N0.getOperand(1)) &&
         DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) {
       SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1));
       return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);
     }
   }
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // reassociate add
   if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1))
     return RADD;
 
   // fold ((0-A) + B) -> B-A
   if (N0.getOpcode() == ISD::SUB &&
       isNullConstantOrNullSplatConstant(N0.getOperand(0)))
     return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
 
   // fold (A + (0-B)) -> A-B
   if (N1.getOpcode() == ISD::SUB &&
       isNullConstantOrNullSplatConstant(N1.getOperand(0)))
     return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1));
 
   // fold (A+(B-A)) -> B
   if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
     return N1.getOperand(0);
 
   // fold ((B-A)+A) -> B
   if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
     return N0.getOperand(0);
 
   // fold (A+(B-(A+C))) to (B-C)
   if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
       N0 == N1.getOperand(1).getOperand(0))
     return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
                        N1.getOperand(1).getOperand(1));
 
   // fold (A+(B-(C+A))) to (B-C)
   if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
       N0 == N1.getOperand(1).getOperand(1))
     return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
                        N1.getOperand(1).getOperand(0));
 
   // fold (A+((B-A)+or-C)) to (B+or-C)
   if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
       N1.getOperand(0).getOpcode() == ISD::SUB &&
       N0 == N1.getOperand(0).getOperand(1))
     return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0),
                        N1.getOperand(1));
 
   // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
   if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) {
     SDValue N00 = N0.getOperand(0);
     SDValue N01 = N0.getOperand(1);
     SDValue N10 = N1.getOperand(0);
     SDValue N11 = N1.getOperand(1);
 
     if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10))
       return DAG.getNode(ISD::SUB, DL, VT,
                          DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10),
                          DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
   }
 
   if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
     return V;
 
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
   // On success N itself is returned rather than a new value.
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   // fold (a+b) -> (a|b) iff a and b share no bits.
   if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
       DAG.haveNoCommonBitsSet(N0, N1))
     return DAG.getNode(ISD::OR, DL, VT, N0, N1);
 
   // fold (add (xor a, -1), 1) -> (sub 0, a)
   if (isBitwiseNot(N0) && isOneConstantOrOneSplatConstant(N1))
     return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                        N0.getOperand(0));
 
   // The remaining folds live in visitADDLike; they are tried with the
   // operands in both orders.
   if (SDValue Combined = visitADDLike(N0, N1, N))
     return Combined;
 
   if (SDValue Combined = visitADDLike(N1, N0, N))
     return Combined;
 
   return SDValue();
 }
 
 /// Look through TRUNCATE, ZERO_EXTEND and (AND x, 1) wrappers around \p V and
 /// return the underlying value if it is result number 1 of an ADDCARRY,
 /// SUBCARRY, UADDO or USUBO node that is known to be 0 or 1.
 ///
 /// The value counts as 0/1 when an (AND x, 1) was peeled off on the way, or
 /// when the target reports ZeroOrOneBooleanContent for its type.
 ///
 /// \returns the carry value, or a null SDValue otherwise.
 static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
   bool Masked = false;
 
   // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization.
   for (;;) {
     const unsigned Opc = V.getOpcode();
     if (Opc == ISD::TRUNCATE || Opc == ISD::ZERO_EXTEND) {
       V = V.getOperand(0);
     } else if (Opc == ISD::AND && isOneConstant(V.getOperand(1))) {
       Masked = true;
       V = V.getOperand(0);
     } else {
       break;
     }
   }
 
   // If this is not a carry, return.
   if (V.getResNo() != 1)
     return SDValue();
 
   switch (V.getOpcode()) {
   case ISD::ADDCARRY:
   case ISD::SUBCARRY:
   case ISD::UADDO:
   case ISD::USUBO:
     break;
   default:
     return SDValue();
   }
 
   // If the result is masked, then no matter what kind of bool it is we can
   // return. If it isn't, then we need to make sure the bool type is either 0 or
   // 1 and not other values.
   const bool IsKnownZeroOrOne =
       Masked || TLI.getBooleanContents(V.getValueType()) ==
                     TargetLoweringBase::ZeroOrOneBooleanContent;
   return IsKnownZeroOrOne ? V : SDValue();
 }
 
 /// Operand-order-sensitive folds for an addition of \p N0 and \p N1.
 ///
 /// \param N0 first operand of the addition.
 /// \param N1 second operand of the addition.
 /// \param LocReference node whose debug location is used for new nodes.
 /// \returns the replacement value, or a null SDValue when no fold applies.
 SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) {
   EVT VT = N0.getValueType();
   SDLoc DL(LocReference);
   const unsigned RHSOpcode = N1.getOpcode();
 
   // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
   if (RHSOpcode == ISD::SHL) {
     SDValue ShiftedVal = N1.getOperand(0);
     if (ShiftedVal.getOpcode() == ISD::SUB &&
         isNullConstantOrNullSplatConstant(ShiftedVal.getOperand(0))) {
       SDValue NewShl = DAG.getNode(ISD::SHL, DL, VT, ShiftedVal.getOperand(1),
                                    N1.getOperand(1));
       return DAG.getNode(ISD::SUB, DL, VT, N0, NewShl);
     }
   }
 
   if (RHSOpcode == ISD::AND) {
     SDValue AndOp0 = N1.getOperand(0);
     const bool IsAllSignBits =
         DAG.ComputeNumSignBits(AndOp0) == VT.getScalarSizeInBits();
 
     // (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x))
     // and similar xforms where the inner op is either ~0 or 0.
     if (IsAllSignBits && isOneConstantOrOneSplatConstant(N1.getOperand(1)))
       return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0);
   }
 
   // add (sext i1), X -> sub X, (zext i1)
   if (N0.getOpcode() == ISD::SIGN_EXTEND) {
     SDValue Bool = N0.getOperand(0);
     if (Bool.getValueType() == MVT::i1 &&
         !TLI.isOperationLegal(ISD::SIGN_EXTEND, MVT::i1)) {
       SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Bool);
       return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
     }
   }
 
   // add X, (sextinreg Y i1) -> sub X, (and Y 1)
   if (RHSOpcode == ISD::SIGN_EXTEND_INREG) {
     if (cast<VTSDNode>(N1.getOperand(1))->getVT() == MVT::i1) {
       SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
                                  DAG.getConstant(1, DL, VT));
       return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt);
     }
   }
 
   // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
   if (RHSOpcode == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
       N1.getResNo() == 0)
     return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(),
                        N0, N1.getOperand(0), N1.getOperand(2));
 
   // (add X, Carry) -> (addcarry X, 0, Carry)
   if (!TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
     return SDValue();
 
   SDValue Carry = getAsCarry(TLI, N1);
   if (!Carry.getNode())
     return SDValue();
 
   return DAG.getNode(ISD::ADDCARRY, DL,
                      DAG.getVTList(VT, Carry.getValueType()), N0,
                      DAG.getConstant(0, DL, VT), Carry);
 }
 
 /// Try to simplify the ISD::ADDC node \p N.
 ///
 /// \returns the result of CombineTo when both results of \p N were replaced,
 /// a new ADDC with swapped operands when only the left operand is a constant,
 /// or a null SDValue when nothing applies.
 SDValue DAGCombiner::visitADDC(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
   SDLoc DL(N);
 
   // Replace N by a plain ADD of the same operands plus a CARRY_FALSE glue
   // value.
   auto ReplaceWithPlainAdd = [&]() {
     return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
                      DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
   };
 
   // If the flag result is dead, turn this into an ADD.
   if (!N->hasAnyUseOfValue(1))
     return ReplaceWithPlainAdd();
 
   // canonicalize constant to RHS.
   if (isa<ConstantSDNode>(N0) && !isa<ConstantSDNode>(N1))
     return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);
 
   // fold (addc x, 0) -> x + no carry out
   if (isNullConstant(N1))
     return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
 
   // If it cannot overflow, transform into an add.
   if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
     return ReplaceWithPlainAdd();
 
   return SDValue();
 }
 
 /// Build (xor V, C) in type VT, where C is the constant used to invert a
 /// boolean of type VT: -1 when the target reports
 /// ZeroOrNegativeOneBooleanContent for VT, and 1 for the ZeroOrOne and
 /// Undefined boolean contents.
 static SDValue flipBoolean(SDValue V, const SDLoc &DL, EVT VT,
                            SelectionDAG &DAG, const TargetLowering &TLI) {
   const bool TrueIsAllOnes =
       TLI.getBooleanContents(VT) ==
       TargetLowering::ZeroOrNegativeOneBooleanContent;
 
   SDValue Mask = TrueIsAllOnes ? DAG.getConstant(-1, DL, VT)
                                : DAG.getConstant(1, DL, VT);
   return DAG.getNode(ISD::XOR, DL, VT, V, Mask);
 }
 
 /// Return true if V is an XOR whose second operand is a scalar constant that
 /// inverts a boolean of type VT under the target's boolean contents:
 /// exactly 1 for ZeroOrOne, all-ones for ZeroOrNegativeOne, and any constant
 /// with bit 0 set for Undefined.
 static bool isBooleanFlip(SDValue V, EVT VT, const TargetLowering &TLI) {
   ConstantSDNode *Mask = V.getOpcode() == ISD::XOR
                              ? dyn_cast<ConstantSDNode>(V.getOperand(1))
                              : nullptr;
   if (!Mask)
     return false;
 
   const auto Contents = TLI.getBooleanContents(VT);
   if (Contents == TargetLowering::ZeroOrOneBooleanContent)
     return Mask->isOne();
   if (Contents == TargetLowering::ZeroOrNegativeOneBooleanContent)
     return Mask->isAllOnesValue();
   if (Contents == TargetLowering::UndefinedBooleanContent)
     return (Mask->getAPIntValue() & 0x01) == 1;
 
   llvm_unreachable("Unsupported boolean content");
 }
 
 /// Combine a UADDO node (unsigned add with an overflow result as value 1).
 /// Vector types are left untouched. Returns a null SDValue when no fold
 /// applies.
 SDValue DAGCombiner::visitUADDO(SDNode *N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   EVT VT = LHS.getValueType();
   if (VT.isVector())
     return SDValue();
 
   SDLoc DL(N);
   EVT CarryVT = N->getValueType(1);
 
   // Nobody reads the overflow result: emit a plain ADD and leave the carry
   // undefined.
   if (!N->hasAnyUseOfValue(1)) {
     SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, LHS, RHS);
     return CombineTo(N, Sum, DAG.getUNDEF(CarryVT));
   }
 
   // Keep a lone constant operand on the right-hand side.
   ConstantSDNode *LHSC = dyn_cast<ConstantSDNode>(LHS);
   ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
   if (LHSC && !RHSC)
     return DAG.getNode(ISD::UADDO, DL, N->getVTList(), RHS, LHS);
 
   // fold (uaddo x, 0) -> x + no carry out
   if (isNullConstant(RHS))
     return CombineTo(N, LHS, DAG.getConstant(0, DL, CarryVT));
 
   // The addition is known never to overflow: plain ADD with a zero carry.
   if (DAG.computeOverflowKind(LHS, RHS) == SelectionDAG::OFK_Never) {
     SDValue Sum = DAG.getNode(ISD::ADD, DL, VT, LHS, RHS);
     return CombineTo(N, Sum, DAG.getConstant(0, DL, CarryVT));
   }
 
   // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry.
   if (isBitwiseNot(LHS) && isOneConstantOrOneSplatConstant(RHS)) {
     SDValue Zero = DAG.getConstant(0, DL, VT);
     SDValue Sub =
         DAG.getNode(ISD::USUBO, DL, N->getVTList(), Zero, LHS.getOperand(0));
     SDValue FlippedBorrow =
         flipBoolean(Sub.getValue(1), DL, CarryVT, DAG, TLI);
     return CombineTo(N, Sub, FlippedBorrow);
   }
 
   // Try the operand-order-sensitive folds in both orders.
   if (SDValue Combined = visitUADDOLike(LHS, RHS, N))
     return Combined;
 
   return visitUADDOLike(RHS, LHS, N);
 }
 
 /// Folds for (uaddo N0, N1) that depend on which operand is which; the
 /// caller invokes this once per operand order. N is the UADDO node being
 /// combined and supplies the debug location and result types.
 SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
   auto VT = N0.getValueType();
   SDLoc DL(N);
 
   // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
   // If Y + 1 cannot overflow.
   if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) {
     SDValue Y = N1.getOperand(0);
     SDValue One = DAG.getConstant(1, DL, Y.getValueType());
     const bool IncrementIsSafe =
         DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never;
     if (IncrementIsSafe)
       return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N0, Y,
                          N1.getOperand(2));
   }
 
   // (uaddo X, Carry) -> (addcarry X, 0, Carry)
   if (!TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
     return SDValue();
 
   SDValue Carry = getAsCarry(TLI, N1);
   if (!Carry)
     return SDValue();
 
   SDValue Zero = DAG.getConstant(0, DL, VT);
   return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N0, Zero, Carry);
 }
 
 /// Combine an ADDE node (add with a carry-in as operand 2). Returns a null
 /// SDValue when no fold applies.
 SDValue DAGCombiner::visitADDE(SDNode *N) {
   SDLoc DL(N);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   SDValue CarryIn = N->getOperand(2);
 
   // Keep a lone constant operand on the right-hand side.
   ConstantSDNode *LHSC = dyn_cast<ConstantSDNode>(LHS);
   ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
   if (LHSC && !RHSC)
     return DAG.getNode(ISD::ADDE, DL, N->getVTList(), RHS, LHS, CarryIn);
 
   // fold (adde x, y, false) -> (addc x, y)
   const bool CarryInIsFalse = CarryIn.getOpcode() == ISD::CARRY_FALSE;
   if (CarryInIsFalse)
     return DAG.getNode(ISD::ADDC, DL, N->getVTList(), LHS, RHS);
 
   return SDValue();
 }
 
 /// Combine an ADDCARRY node (operands: LHS, RHS, carry-in; results: sum and
 /// carry-out). Returns a null SDValue when no fold applies.
 SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
   SDLoc DL(N);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   SDValue CarryIn = N->getOperand(2);
 
   // Keep a lone constant operand on the right-hand side.
   ConstantSDNode *LHSC = dyn_cast<ConstantSDNode>(LHS);
   ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
   if (LHSC && !RHSC)
     return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), RHS, LHS, CarryIn);
 
   // fold (addcarry x, y, false) -> (uaddo x, y)
   // Once operations are legalized, only do so if UADDO is legal or custom.
   if (isNullConstant(CarryIn) &&
       (!LegalOperations ||
        TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0))))
     return DAG.getNode(ISD::UADDO, DL, N->getVTList(), LHS, RHS);
 
   EVT CarryVT = CarryIn.getValueType();
 
   // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
   if (isNullConstant(LHS) && isNullConstant(RHS)) {
     EVT VT = LHS.getValueType();
     SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
     AddToWorklist(CarryExt.getNode());
     SDValue LowBit =
         DAG.getNode(ISD::AND, DL, VT, CarryExt, DAG.getConstant(1, DL, VT));
     return CombineTo(N, LowBit, DAG.getConstant(0, DL, CarryVT));
   }
 
   // fold (addcarry (xor a, -1), 0, !b) -> (subcarry 0, a, b) and flip carry.
   if (isBitwiseNot(LHS) && isNullConstant(RHS) &&
       isBooleanFlip(CarryIn, CarryVT, TLI)) {
     SDValue Zero = DAG.getConstant(0, DL, LHS.getValueType());
     SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), Zero,
                               LHS.getOperand(0), CarryIn.getOperand(0));
     SDValue FlippedBorrow =
         flipBoolean(Sub.getValue(1), DL, CarryVT, DAG, TLI);
     return CombineTo(N, Sub, FlippedBorrow);
   }
 
   // Try the operand-order-sensitive folds in both orders.
   if (SDValue Combined = visitADDCARRYLike(LHS, RHS, CarryIn, N))
     return Combined;
 
   return visitADDCARRYLike(RHS, LHS, CarryIn, N);
 }
 
 /// Folds for (addcarry N0, N1, CarryIn) that depend on which operand is
 /// which; the caller invokes this once per operand order. N is the ADDCARRY
 /// node being combined.
 SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
                                        SDNode *N) {
   SDLoc DL(N);
 
   // Iff the flag result is dead:
   // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry)
   const bool N0IsSum =
       N0.getOpcode() == ISD::ADD ||
       (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0);
   if (N0IsSum && isNullConstant(N1) && !N->hasAnyUseOfValue(1))
     return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N0.getOperand(0),
                        N0.getOperand(1), CarryIn);
 
   /**
    * When one of the addcarry arguments is itself a carry, we may be facing
    * a diamond carry propagation. In which case we try to transform the DAG
    * to ensure linear carry propagation if that is possible.
    *
    * We are trying to get:
    *   (addcarry X, 0, (addcarry A, B, Z):Carry)
    */
   auto Y = getAsCarry(TLI, N1);
   if (!Y)
     return SDValue();
 
   /**
    *            (uaddo A, B)
    *             /       \
    *          Carry      Sum
    *            |          \
    *            | (addcarry *, 0, Z)
    *            |       /
    *             \   Carry
    *              |   /
    * (addcarry X, *, *)
    */
   // The operand accesses on CarryIn are only evaluated once it is known to
   // be an ADDCARRY, thanks to short-circuit evaluation.
   const bool IsDiamond = Y.getOpcode() == ISD::UADDO &&
                          CarryIn.getResNo() == 1 &&
                          CarryIn.getOpcode() == ISD::ADDCARRY &&
                          isNullConstant(CarryIn.getOperand(1)) &&
                          CarryIn.getOperand(0) == Y.getValue(0);
   if (!IsDiamond)
     return SDValue();
 
   // Fold Z into the (uaddo A, B) and feed its carry-out straight into N.
   auto NewY = DAG.getNode(ISD::ADDCARRY, DL, Y->getVTList(), Y.getOperand(0),
                           Y.getOperand(1), CarryIn.getOperand(2));
   AddToWorklist(NewY.getNode());
 
   SDValue Zero = DAG.getConstant(0, DL, N0.getValueType());
   return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N0, Zero,
                      NewY.getValue(1));
 }
 
 // Since it may not be valid to emit a fold to zero for vector initializers
 // check if we can before folding. A zero constant of type VT is returned for
 // scalar types, for vectors while operations are not yet required to be
 // legal, and for vectors whose BUILD_VECTOR is legal; otherwise a null
 // SDValue is returned. LegalTypes is not consulted.
 static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
                              SelectionDAG &DAG, bool LegalOperations,
                              bool LegalTypes) {
   const bool ZeroIsEmittable =
       !VT.isVector() || !LegalOperations ||
       TLI.isOperationLegal(ISD::BUILD_VECTOR, VT);
 
   return ZeroIsEmittable ? DAG.getConstant(0, DL, VT) : SDValue();
 }
 
 /// Combine an integer SUB node (N0 - N1). The folds below are attempted in
 /// the order written and the first one that matches returns. A null SDValue
 /// is returned when no fold produces a replacement.
 SDValue DAGCombiner::visitSUB(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
   SDLoc DL(N);
 
   // fold vector ops
   if (VT.isVector()) {
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
     // fold (sub x, 0) -> x, vector edition
     if (ISD::isBuildVectorAllZeros(N1.getNode()))
       return N0;
   }
 
   // fold (sub x, x) -> 0
   // FIXME: Refactor this and xor and other similar operations together.
   // Note: the result of tryFoldToZero is returned unconditionally, so no
   // later fold is attempted for (sub x, x) even if that result is null.
   if (N0 == N1)
     return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations, LegalTypes);
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
       DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
     // fold (sub c1, c2) -> c1-c2
     return DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N0.getNode(),
                                       N1.getNode());
   }
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // Non-null only when N1 is a constant that is not opaque.
   ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
 
   // fold (sub x, c) -> (add x, -c)
   if (N1C) {
     return DAG.getNode(ISD::ADD, DL, VT, N0,
                        DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
   }
 
   // Folds for (0 - X).
   if (isNullConstantOrNullSplatConstant(N0)) {
     unsigned BitWidth = VT.getScalarSizeInBits();
     // Right-shifting everything out but the sign bit followed by negation is
     // the same as flipping arithmetic/logical shift type without the negation:
     // -(X >>u 31) -> (X >>s 31)
     // -(X >>s 31) -> (X >>u 31)
     if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) {
       ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1));
       if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1) {
         auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA;
         if (!LegalOperations || TLI.isOperationLegal(NewSh, VT))
           return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1));
       }
     }
 
     // 0 - X --> 0 if the sub is NUW.
     if (N->getFlags().hasNoUnsignedWrap())
       return N0;
 
     // True when every bit of N1 other than the sign bit is known to be zero.
     if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) {
       // N1 is either 0 or the minimum signed value. If the sub is NSW, then
       // N1 must be 0 because negating the minimum signed value is undefined.
       if (N->getFlags().hasNoSignedWrap())
         return N0;
 
       // 0 - X --> X if X is 0 or the minimum signed value.
       return N1;
     }
   }
 
   // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
   if (isAllOnesConstantOrAllOnesSplatConstant(N0))
     return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
 
   // fold (A - (0-B)) -> A+B
   if (N1.getOpcode() == ISD::SUB &&
       isNullConstantOrNullSplatConstant(N1.getOperand(0)))
     return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1));
 
   // fold A-(A-B) -> B
   if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
     return N1.getOperand(1);
 
   // fold (A+B)-A -> B
   if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
     return N0.getOperand(1);
 
   // fold (A+B)-B -> A
   if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
     return N0.getOperand(0);
 
   // fold C2-(A+C1) -> (C2-C1)-A
   if (N1.getOpcode() == ISD::ADD) {
     SDValue N11 = N1.getOperand(1);
     if (isConstantOrConstantVector(N0, /* NoOpaques */ true) &&
         isConstantOrConstantVector(N11, /* NoOpaques */ true)) {
       SDValue NewC = DAG.getNode(ISD::SUB, DL, VT, N0, N11);
       return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));
     }
   }
 
   // fold ((A+(B+or-C))-B) -> A+or-C
   // The new node reuses the opcode (ADD or SUB) of the inner operation.
   if (N0.getOpcode() == ISD::ADD &&
       (N0.getOperand(1).getOpcode() == ISD::SUB ||
        N0.getOperand(1).getOpcode() == ISD::ADD) &&
       N0.getOperand(1).getOperand(0) == N1)
     return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0),
                        N0.getOperand(1).getOperand(1));
 
   // fold ((A+(C+B))-B) -> A+C
   if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD &&
       N0.getOperand(1).getOperand(1) == N1)
     return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0),
                        N0.getOperand(1).getOperand(0));
 
   // fold ((A-(B-C))-C) -> A-B
   if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB &&
       N0.getOperand(1).getOperand(1) == N1)
     return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
                        N0.getOperand(1).getOperand(0));
 
   // fold (A-(B-C)) -> A+(C-B)
   // Only done when the inner SUB has a single use.
   if (N1.getOpcode() == ISD::SUB && N1.hasOneUse())
     return DAG.getNode(ISD::ADD, DL, VT, N0,
                        DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1),
                                    N1.getOperand(0)));
 
   // fold (X - (-Y * Z)) -> (X + (Y * Z))
   // The negation (0 - Y) may appear as either operand of the single-use MUL.
   if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
     if (N1.getOperand(0).getOpcode() == ISD::SUB &&
         isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0))) {
       SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
                                 N1.getOperand(0).getOperand(1),
                                 N1.getOperand(1));
       return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
     }
     if (N1.getOperand(1).getOpcode() == ISD::SUB &&
         isNullConstantOrNullSplatConstant(N1.getOperand(1).getOperand(0))) {
       SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
                                 N1.getOperand(0),
                                 N1.getOperand(1).getOperand(1));
       return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
     }
   }
 
   // If either operand of a sub is undef, the result is undef
   if (N0.isUndef())
     return N0;
   if (N1.isUndef())
     return N1;
 
   if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
     return V;
 
   if (SDValue V = foldAddSubOfSignBit(N, DAG))
     return V;
 
   // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X)
   // Only attempted when ABS is legal or custom for VT. The XOR operands are
   // matched in either order.
   if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
     if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) {
       SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1);
       SDValue S0 = N1.getOperand(0);
       if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) {
         unsigned OpSizeInBits = VT.getScalarSizeInBits();
         if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
           if (C->getAPIntValue() == (OpSizeInBits - 1))
             return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
       }
     }
   }
 
   // If the relocation model supports it, consider symbol offsets.
   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
     if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
       // fold (sub Sym, c) -> Sym-c
       // NOTE(review): N1C cannot be non-null here, because a non-null N1C
       // already returned through the (sub x, c) -> (add x, -c) fold above;
       // this branch looks unreachable as written.
       if (N1C && GA->getOpcode() == ISD::GlobalAddress)
         return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
                                     GA->getOffset() -
                                         (uint64_t)N1C->getSExtValue());
       // fold (sub Sym+c1, Sym+c2) -> c1-c2
       if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
         if (GA->getGlobal() == GB->getGlobal())
           return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
                                  DL, VT);
     }
 
   // sub X, (sextinreg Y i1) -> add X, (and Y 1)
   if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
     VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
     if (TN->getVT() == MVT::i1) {
       SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
                                  DAG.getConstant(1, DL, VT));
       return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt);
     }
   }
 
   // Prefer an add for more folding potential and possibly better codegen:
   // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1)
   // Restricted to a single-use SRL before operations are legalized.
   if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
     SDValue ShAmt = N1.getOperand(1);
     ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
     if (ShAmtC && ShAmtC->getZExtValue() == N1.getScalarValueSizeInBits() - 1) {
       SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt);
       return DAG.getNode(ISD::ADD, DL, VT, N0, SRA);
     }
   }
 
   return SDValue();
 }
 
 /// Combine a SUBC node: a subtract of operand 1 from operand 0 whose second
 /// result is a glue-typed borrow. Returns a null SDValue when no fold
 /// applies.
 SDValue DAGCombiner::visitSUBC(SDNode *N) {
   SDLoc DL(N);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   EVT VT = LHS.getValueType();
 
   // Glue value standing for "no borrow".
   auto NoBorrow = [&]() {
     return DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue);
   };
 
   // If the flag result is dead, turn this into a SUB.
   if (!N->hasAnyUseOfValue(1)) {
     SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, LHS, RHS);
     return CombineTo(N, Diff, NoBorrow());
   }
 
   // fold (subc x, x) -> 0 + no borrow
   if (LHS == RHS) {
     SDValue Zero = DAG.getConstant(0, DL, VT);
     return CombineTo(N, Zero, NoBorrow());
   }
 
   // fold (subc x, 0) -> x + no borrow
   if (isNullConstant(RHS))
     return CombineTo(N, LHS, NoBorrow());
 
   // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow
   if (isAllOnesConstant(LHS)) {
     SDValue Not = DAG.getNode(ISD::XOR, DL, VT, RHS, LHS);
     return CombineTo(N, Not, NoBorrow());
   }
 
   return SDValue();
 }
 
 /// Combine a USUBO node (unsigned subtract with an overflow result as
 /// value 1). Vector types are left untouched. Returns a null SDValue when no
 /// fold applies.
 SDValue DAGCombiner::visitUSUBO(SDNode *N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   EVT VT = LHS.getValueType();
   if (VT.isVector())
     return SDValue();
 
   SDLoc DL(N);
   EVT CarryVT = N->getValueType(1);
 
   // If the flag result is dead, turn this into a SUB and leave the borrow
   // undefined.
   if (!N->hasAnyUseOfValue(1)) {
     SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, LHS, RHS);
     return CombineTo(N, Diff, DAG.getUNDEF(CarryVT));
   }
 
   // Constant zero of the overflow type, i.e. "no borrow".
   auto NoBorrow = [&]() { return DAG.getConstant(0, DL, CarryVT); };
 
   // fold (usubo x, x) -> 0 + no borrow
   if (LHS == RHS) {
     SDValue Zero = DAG.getConstant(0, DL, VT);
     return CombineTo(N, Zero, NoBorrow());
   }
 
   // fold (usubo x, 0) -> x + no borrow
   if (isNullConstant(RHS))
     return CombineTo(N, LHS, NoBorrow());
 
   // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow
   if (isAllOnesConstant(LHS)) {
     SDValue Not = DAG.getNode(ISD::XOR, DL, VT, RHS, LHS);
     return CombineTo(N, Not, NoBorrow());
   }
 
   return SDValue();
 }
 
 /// Combine a SUBE node (subtract with a borrow-in as operand 2). Returns a
 /// null SDValue when no fold applies.
 SDValue DAGCombiner::visitSUBE(SDNode *N) {
   SDValue BorrowIn = N->getOperand(2);
 
   // Only one fold exists: a borrow-in known to be false.
   if (BorrowIn.getOpcode() != ISD::CARRY_FALSE)
     return SDValue();
 
   // fold (sube x, y, false) -> (subc x, y)
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), LHS, RHS);
 }
 
 /// Combine a SUBCARRY node (operands: LHS, RHS, borrow-in). Returns a null
 /// SDValue when no fold applies.
 SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
   SDValue BorrowIn = N->getOperand(2);
 
   // Only one fold exists: a borrow-in that is the constant zero.
   if (!isNullConstant(BorrowIn))
     return SDValue();
 
   // Once operations are legalized, USUBO must be legal or custom to be used.
   if (LegalOperations &&
       !TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0)))
     return SDValue();
 
   // fold (subcarry x, y, false) -> (usubo x, y)
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), LHS, RHS);
 }
 
 /// Combine an integer MUL node (N0 * N1). The folds below are attempted in
 /// the order written and the first one that matches returns. A null SDValue
 /// is returned when no fold produces a replacement.
 SDValue DAGCombiner::visitMUL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
 
   // fold (mul x, undef) -> 0
   if (N0.isUndef() || N1.isUndef())
     return DAG.getConstant(0, SDLoc(N), VT);
 
   // Constant classification of both operands. For vectors, "constant" means
   // a constant splat; for scalars, a ConstantSDNode. The opaque flags are
   // only ever set on the scalar path and keep their initial false value for
   // vectors.
   bool N0IsConst = false;
   bool N1IsConst = false;
   bool N1IsOpaqueConst = false;
   bool N0IsOpaqueConst = false;
   APInt ConstValue0, ConstValue1;
   // fold vector ops
   if (VT.isVector()) {
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
     N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0);
     N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
     assert((!N0IsConst ||
             ConstValue0.getBitWidth() == VT.getScalarSizeInBits()) &&
            "Splat APInt should be element width");
     assert((!N1IsConst ||
             ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
            "Splat APInt should be element width");
   } else {
     N0IsConst = isa<ConstantSDNode>(N0);
     if (N0IsConst) {
       ConstValue0 = cast<ConstantSDNode>(N0)->getAPIntValue();
       N0IsOpaqueConst = cast<ConstantSDNode>(N0)->isOpaque();
     }
     N1IsConst = isa<ConstantSDNode>(N1);
     if (N1IsConst) {
       ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
       N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque();
     }
   }
 
   // fold (mul c1, c2) -> c1*c2
   if (N0IsConst && N1IsConst && !N0IsOpaqueConst && !N1IsOpaqueConst)
     return DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT,
                                       N0.getNode(), N1.getNode());
 
   // canonicalize constant to RHS (vector doesn't have to splat)
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
     return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);
   // fold (mul x, 0) -> 0
   if (N1IsConst && ConstValue1.isNullValue())
     return N1;
   // fold (mul x, 1) -> x
   if (N1IsConst && ConstValue1.isOneValue())
     return N0;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // fold (mul x, -1) -> 0-x
   if (N1IsConst && ConstValue1.isAllOnesValue()) {
     SDLoc DL(N);
     return DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, VT), N0);
   }
   // fold (mul x, (1 << c)) -> x << c
   // For vector types this is only done while Level has not advanced past
   // AfterLegalizeVectorOps.
   if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
       DAG.isKnownToBeAPowerOfTwo(N1) &&
       (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
     SDLoc DL(N);
     SDValue LogBase2 = BuildLogBase2(N1, DL);
     EVT ShiftVT = getShiftAmountTy(N0.getValueType());
     SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
     return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
   }
   // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
   // The form emitted here is 0 - (x << c).
   if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {
     unsigned Log2Val = (-ConstValue1).logBase2();
     SDLoc DL(N);
     // FIXME: If the input is something that is easily negated (e.g. a
     // single-use add), we should put the negate there.
     return DAG.getNode(ISD::SUB, DL, VT,
                        DAG.getConstant(0, DL, VT),
                        DAG.getNode(ISD::SHL, DL, VT, N0,
                             DAG.getConstant(Log2Val, DL,
                                       getShiftAmountTy(N0.getValueType()))));
   }
 
   // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
   // Only applied if the newly built (shl c2, c1) node is itself a constant
   // or constant vector.
   if (N0.getOpcode() == ISD::SHL &&
       isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
       isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
     SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1));
     if (isConstantOrConstantVector(C3))
       return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
   }
 
   // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
   // use.
   {
     // Sh is the matched shift and Y the other multiplicand; both stay null
     // when neither operand matches.
     SDValue Sh(nullptr, 0), Y(nullptr, 0);
 
     // Check for both (mul (shl X, C), Y)  and  (mul Y, (shl X, C)).
     if (N0.getOpcode() == ISD::SHL &&
         isConstantOrConstantVector(N0.getOperand(1)) &&
         N0.getNode()->hasOneUse()) {
       Sh = N0; Y = N1;
     } else if (N1.getOpcode() == ISD::SHL &&
                isConstantOrConstantVector(N1.getOperand(1)) &&
                N1.getNode()->hasOneUse()) {
       Sh = N1; Y = N0;
     }
 
     if (Sh.getNode()) {
       SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y);
       return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1));
     }
   }
 
   // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
   // Gated on isMulAddWithConstProfitable; c1*c2 is emitted as a MUL node of
   // the two constants.
   if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
       N0.getOpcode() == ISD::ADD &&
       DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
       isMulAddWithConstProfitable(N, N0, N1))
       return DAG.getNode(ISD::ADD, SDLoc(N), VT,
                          DAG.getNode(ISD::MUL, SDLoc(N0), VT,
                                      N0.getOperand(0), N1),
                          DAG.getNode(ISD::MUL, SDLoc(N1), VT,
                                      N0.getOperand(1), N1));
 
   // reassociate mul
   if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1))
     return RMUL;
 
   return SDValue();
 }
 
 /// Return true if divmod libcall is available.
 ///
 /// The libcall is chosen from the type of Node's first result (i8 through
 /// i128) and from isSigned; any other type yields false. Availability means
 /// the target lowering reports a non-null name for that libcall.
 static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
                                      const TargetLowering &TLI) {
   EVT NodeType = Node->getValueType(0);
   if (!NodeType.isSimple())
     return false;
 
   RTLIB::Libcall LC;
   switch (NodeType.getSimpleVT().SimpleTy) {
   case MVT::i8:
     LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
     break;
   case MVT::i16:
     LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
     break;
   case MVT::i32:
     LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
     break;
   case MVT::i64:
     LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
     break;
   case MVT::i128:
     LC = isSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
     break;
   default:
     // No libcall for vector types.
     return false;
   }
 
   return TLI.getLibcallName(LC) != nullptr;
 }
 
 /// Issue divrem if both quotient and remainder are needed.
 ///
 /// Node is a division or remainder node. Other users of its first operand
 /// that apply the same, the complementary, or the combined DIVREM operation
 /// to the same two operands are rewritten to use one DIVREM node. Returns
 /// that DIVREM value, or a null SDValue if nothing was combined.
 SDValue DAGCombiner::useDivRem(SDNode *Node) {
   if (Node->use_empty())
     return SDValue(); // This is a dead node, leave it alone.
 
   unsigned Opcode = Node->getOpcode();
   bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
   unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
 
   // DivMod lib calls can still work on non-legal types if using lib-calls.
   EVT VT = Node->getValueType(0);
   if (VT.isVector() || !VT.isInteger())
     return SDValue();
 
   // An illegal type is only accepted when the target custom-lowers DIVREM
   // for it.
   if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
     return SDValue();
 
   // If DIVREM is going to get expanded into a libcall,
   // but there is no libcall available, then don't combine.
   if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
       !isDivRemLibcallAvailable(Node, isSigned, TLI))
     return SDValue();
 
   // If div is legal, it's better to do the normal expansion
   // OtherOpcode is the complementary operation: the remainder opcode when
   // Node is a division, the division opcode otherwise.
   unsigned OtherOpcode = 0;
   if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) {
     OtherOpcode = isSigned ? ISD::SREM : ISD::UREM;
     if (TLI.isOperationLegalOrCustom(Opcode, VT))
       return SDValue();
   } else {
     OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
     if (TLI.isOperationLegalOrCustom(OtherOpcode, VT))
       return SDValue();
   }
 
   SDValue Op0 = Node->getOperand(0);
   SDValue Op1 = Node->getOperand(1);
   // Stays null until a user with the complementary or DIVREM opcode is seen.
   SDValue combined;
   // Walk every user of the first operand, looking for nodes that operate on
   // the same (Op0, Op1) pair.
   for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
          UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
     SDNode *User = *UI;
     // Skip Node itself, deleted nodes, and users that have no uses.
     if (User == Node || User->getOpcode() == ISD::DELETED_NODE ||
         User->use_empty())
       continue;
     // Convert the other matching node(s), too;
     // otherwise, the DIVREM may get target-legalized into something
     // target-specific that we won't be able to recognize.
     unsigned UserOpc = User->getOpcode();
     if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) &&
         User->getOperand(0) == Op0 &&
         User->getOperand(1) == Op1) {
       if (!combined) {
         if (UserOpc == OtherOpcode) {
           // First complementary user found: create the DIVREM node, whose two
           // results both have type VT.
           SDVTList VTs = DAG.getVTList(VT, VT);
           combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1);
         } else if (UserOpc == DivRemOpc) {
           // Reuse an existing DIVREM on the same operands.
           combined = SDValue(User, 0);
         } else {
           // Same opcode as Node and no DIVREM yet: nothing to rewrite.
           assert(UserOpc == Opcode);
           continue;
         }
       }
       // Divisions take result 0 of the DIVREM, remainders take result 1.
       if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV)
         CombineTo(User, combined);
       else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM)
         CombineTo(User, combined.getValue(1));
     }
   }
   return combined;
 }
 
 /// Simplifications shared by the division and remainder combines that only
 /// depend on undef operands. Returns a null SDValue when none applies.
 static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
   SDValue Dividend = N->getOperand(0);
   SDValue Divisor = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
   // Ask the DAG whether this opcode yields undef for these operands.
   if (DAG.isUndef(N->getOpcode(), {Dividend, Divisor}))
     return DAG.getUNDEF(VT);
 
   // undef / X -> 0
   // undef % X -> 0
   return Dividend.isUndef() ? DAG.getConstant(0, SDLoc(N), VT) : SDValue();
 }
 
 /// Combine an SDIV node (signed N0 / N1). Folds are attempted in the order
 /// written. Returns a null SDValue when no fold produces a replacement.
 SDValue DAGCombiner::visitSDIV(SDNode *N) {
   SDValue Dividend = N->getOperand(0);
   SDValue Divisor = N->getOperand(1);
   EVT VT = N->getValueType(0);
   EVT CCVT = getSetCCResultType(VT);
 
   // fold vector ops
   if (VT.isVector())
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
   SDLoc DL(N);
 
   // fold (sdiv c1, c2) -> c1/c2
   ConstantSDNode *DividendC = isConstOrConstSplat(Dividend);
   ConstantSDNode *DivisorC = isConstOrConstSplat(Divisor);
   if (DividendC && DivisorC && !DividendC->isOpaque() &&
       !DivisorC->isOpaque())
     return DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, DividendC, DivisorC);
 
   // Folds that need a constant (or constant-splat) divisor.
   if (DivisorC) {
     // fold (sdiv X, 1) -> X
     if (DivisorC->isOne())
       return Dividend;
 
     // fold (sdiv X, -1) -> 0-X
     if (DivisorC->isAllOnesValue()) {
       SDValue Zero = DAG.getConstant(0, DL, VT);
       return DAG.getNode(ISD::SUB, DL, VT, Zero, Dividend);
     }
 
     // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
     if (DivisorC->getAPIntValue().isMinSignedValue()) {
       SDValue IsMinSigned =
           DAG.getSetCC(DL, CCVT, Dividend, Divisor, ISD::SETEQ);
       return DAG.getSelect(DL, VT, IsMinSigned, DAG.getConstant(1, DL, VT),
                            DAG.getConstant(0, DL, VT));
     }
   }
 
   if (SDValue V = simplifyDivRem(N, DAG))
     return V;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // If we know the sign bits of both operands are zero, strength reduce to a
   // udiv instead.  Handles (X&15) /s 4 -> X&15 >> 2
   if (DAG.SignBitIsZero(Divisor) && DAG.SignBitIsZero(Dividend))
     return DAG.getNode(ISD::UDIV, DL, Divisor.getValueType(), Dividend,
                        Divisor);
 
   if (SDValue V = visitSDIVLike(Dividend, Divisor, N))
     return V;
 
   // sdiv, srem -> sdivrem
   // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
   // true.  Otherwise, we break the simplification logic in visitREM().
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
   const bool MayUseDivRem =
       !DivisorC || TLI.isIntDivCheap(N->getValueType(0), Attr);
   if (!MayUseDivRem)
     return SDValue();
 
   return useDivRem(N);
 }
 
 /// Strength-reduce a signed division of N0 by N1 into cheaper nodes.
 ///
 /// N supplies the result type, debug location and node flags, and is the node
 /// handed to the BuildSDIVPow2/BuildSDIV helpers. The operands are passed
 /// separately so that callers other than visitSDIV (visitREM in this file) can
 /// reuse the logic for a speculative division. Returns an empty SDValue when
 /// no transform applies.
 SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
   EVT CCVT = getSetCCResultType(VT);
   unsigned BitWidth = VT.getScalarSizeInBits();
 
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
 
   // Helper for determining whether a value is a power-2 constant scalar or a
   // vector of such elements. A constant whose negation is a power of two is
   // accepted as well; zero and opaque constants are rejected.
   auto IsPowerOfTwo = [](ConstantSDNode *C) {
     if (C->isNullValue() || C->isOpaque())
       return false;
     if (C->getAPIntValue().isPowerOf2())
       return true;
     if ((-C->getAPIntValue()).isPowerOf2())
       return true;
     return false;
   };
 
   // fold (sdiv X, pow2) -> simple ops after legalize
   // FIXME: We check for the exact bit here because the generic lowering gives
   // better results in that case. The target-specific lowering should learn how
   // to handle exact sdivs efficiently.
   if (!N->getFlags().hasExact() &&
       ISD::matchUnaryPredicate(N1C ? SDValue(N1C, 0) : N1, IsPowerOfTwo)) {
     // Target-specific implementation of sdiv x, pow2.
     if (SDValue Res = BuildSDIVPow2(N))
       return Res;
 
     // Create constants that are functions of the shift amount value.
     // C1 is cttz(N1) and Inexact is BitWidth - C1.
     EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
     SDValue Bits = DAG.getConstant(BitWidth, DL, ShiftAmtTy);
     SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT, N1);
     C1 = DAG.getZExtOrTrunc(C1, DL, ShiftAmtTy);
     SDValue Inexact = DAG.getNode(ISD::SUB, DL, ShiftAmtTy, Bits, C1);
     // Give up unless the shift amount came out as a constant (or constant
     // vector).
     if (!isConstantOrConstantVector(Inexact))
       return SDValue();
 
     // Splat the sign bit into the register
     SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, N0,
                                DAG.getConstant(BitWidth - 1, DL, ShiftAmtTy));
     AddToWorklist(Sign.getNode());
 
     // Add (N0 < 0) ? abs2 - 1 : 0, where abs2 is the power-of-two magnitude
     // of the divisor, so that the arithmetic shift below rounds toward zero.
     SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, Sign, Inexact);
     AddToWorklist(Srl.getNode());
     SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Srl);
     AddToWorklist(Add.getNode());
     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Add, C1);
     AddToWorklist(Sra.getNode());
 
     // Special case: (sdiv X, 1) -> X
     // Special Case: (sdiv X, -1) -> 0-X
     // For both divisors the select below picks N0 itself; the negation for
     // the -1 case is applied by the final select on the sign of N1.
     SDValue One = DAG.getConstant(1, DL, VT);
     SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
     SDValue IsOne = DAG.getSetCC(DL, CCVT, N1, One, ISD::SETEQ);
     SDValue IsAllOnes = DAG.getSetCC(DL, CCVT, N1, AllOnes, ISD::SETEQ);
     SDValue IsOneOrAllOnes = DAG.getNode(ISD::OR, DL, CCVT, IsOne, IsAllOnes);
     Sra = DAG.getSelect(DL, VT, IsOneOrAllOnes, N0, Sra);
 
     // If dividing by a positive value, we're done. Otherwise, the result must
     // be negated.
     SDValue Zero = DAG.getConstant(0, DL, VT);
     SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, Zero, Sra);
 
     // FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding.
     SDValue IsNeg = DAG.getSetCC(DL, CCVT, N1, Zero, ISD::SETLT);
     SDValue Res = DAG.getSelect(DL, VT, IsNeg, Sub, Sra);
     return Res;
   }
 
   // If integer divide is expensive and we satisfy the requirements, emit an
   // alternate sequence.  Targets may check function attributes for size/speed
   // trade-offs.
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
   if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
     if (SDValue Op = BuildSDIV(N))
       return Op;
 
   return SDValue();
 }
 
 /// Combine an ISD::UDIV node. Returns the replacement value, or an empty
 /// SDValue if the node should be left as it is.
 SDValue DAGCombiner::visitUDIV(SDNode *N) {
   SDValue Dividend = N->getOperand(0);
   SDValue Divisor = N->getOperand(1);
   EVT VT = N->getValueType(0);
   EVT CCVT = getSetCCResultType(VT);
 
   // Give the generic vector binop simplifier the first shot.
   if (VT.isVector()) {
     SDValue FoldedVOp = SimplifyVBinOp(N);
     if (FoldedVOp)
       return FoldedVOp;
   }
 
   SDLoc DL(N);
   ConstantSDNode *DividendC = isConstOrConstSplat(Dividend);
   ConstantSDNode *DivisorC = isConstOrConstSplat(Divisor);
 
   // (udiv c1, c2) -> c1/c2
   if (DividendC && DivisorC) {
     SDValue Folded =
         DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, DividendC, DivisorC);
     if (Folded)
       return Folded;
   }
 
   if (DivisorC) {
     // (udiv X, 1) -> X
     if (DivisorC->isOne())
       return Dividend;
 
     // (udiv X, -1) -> select(X == -1, 1, 0)
     if (DivisorC->getAPIntValue().isAllOnesValue())
       return DAG.getSelect(
           DL, VT, DAG.getSetCC(DL, CCVT, Dividend, Divisor, ISD::SETEQ),
           DAG.getConstant(1, DL, VT), DAG.getConstant(0, DL, VT));
   }
 
   if (SDValue Simplified = simplifyDivRem(N, DAG))
     return Simplified;
 
   if (SDValue SelectFold = foldBinOpIntoSelect(N))
     return SelectFold;
 
   if (SDValue Reduced = visitUDIVLike(Dividend, Divisor, N))
     return Reduced;
 
   // udiv, urem -> udivrem
   // With a constant divisor a DIVREM is only acceptable when isIntDivCheap()
   // holds; forming one otherwise would break the simplification logic in
   // visitREM().
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
   const bool MayFormDivRem = !DivisorC || TLI.isIntDivCheap(VT, Attr);
   if (MayFormDivRem) {
     SDValue DivRem = useDivRem(N);
     if (DivRem)
       return DivRem;
   }
 
   return SDValue();
 }
 
 /// Strength-reduce an unsigned division of N0 by N1. N provides the result
 /// type and debug location and is the node passed on to BuildUDIV. Returns an
 /// empty SDValue when nothing applies.
 SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   // (udiv x, (1 << c)) -> x >>u c
   const bool DivisorIsPow2Const =
       isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
       DAG.isKnownToBeAPowerOfTwo(N1);
   if (DivisorIsPow2Const) {
     SDValue Log2 = BuildLogBase2(N1, DL);
     AddToWorklist(Log2.getNode());
 
     SDValue ShAmt =
         DAG.getZExtOrTrunc(Log2, DL, getShiftAmountTy(N0.getValueType()));
     AddToWorklist(ShAmt.getNode());
     return DAG.getNode(ISD::SRL, DL, VT, N0, ShAmt);
   }
 
   // (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
   if (N1.getOpcode() == ISD::SHL) {
     SDValue ShlBase = N1.getOperand(0);
     const bool BaseIsPow2Const =
         isConstantOrConstantVector(ShlBase, /*NoOpaques*/ true) &&
         DAG.isKnownToBeAPowerOfTwo(ShlBase);
     if (BaseIsPow2Const) {
       SDValue Log2 = BuildLogBase2(ShlBase, DL);
       AddToWorklist(Log2.getNode());
 
       // The sum is formed in the type of the existing shift amount.
       SDValue ShlAmt = N1.getOperand(1);
       EVT AmtVT = ShlAmt.getValueType();
       SDValue Log2InAmtVT = DAG.getZExtOrTrunc(Log2, DL, AmtVT);
       AddToWorklist(Log2InAmtVT.getNode());
       SDValue TotalAmt =
           DAG.getNode(ISD::ADD, DL, AmtVT, ShlAmt, Log2InAmtVT);
       AddToWorklist(TotalAmt.getNode());
       return DAG.getNode(ISD::SRL, DL, VT, N0, TotalAmt);
     }
   }
 
   // (udiv x, c) -> alternate sequence, when the divisor is a constant (or
   // constant splat) and the target reports division as not cheap.
   ConstantSDNode *DivisorC = isConstOrConstSplat(N1);
   if (!DivisorC)
     return SDValue();
 
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
   if (TLI.isIntDivCheap(VT, Attr))
     return SDValue();
 
   if (SDValue Expanded = BuildUDIV(N))
     return Expanded;
 
   return SDValue();
 }
 
 // Handles ISD::SREM and ISD::UREM.
 //
 // Returns the replacement value for the remainder node N, or an empty SDValue
 // if no combine applies. The signed/unsigned distinction is taken from N's
 // opcode.
 SDValue DAGCombiner::visitREM(SDNode *N) {
   unsigned Opcode = N->getOpcode();
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   EVT CCVT = getSetCCResultType(VT);
 
   bool isSigned = (Opcode == ISD::SREM);
   SDLoc DL(N);
 
   // fold (rem c1, c2) -> c1%c2
   ConstantSDNode *N0C = isConstOrConstSplat(N0);
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N0C && N1C)
     if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C))
       return Folded;
   // fold (urem X, -1) -> select(X == -1, 0, X)
   if (!isSigned && N1C && N1C->getAPIntValue().isAllOnesValue())
     return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
                          DAG.getConstant(0, DL, VT), N0);
 
   if (SDValue V = simplifyDivRem(N, DAG))
     return V;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   if (isSigned) {
     // If we know the sign bits of both operands are zero, strength reduce to a
     // urem instead.  Handles (X & 0x0FFFFFFF) %s 16 -> X&15
     if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
       return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
   } else {
     // Both unsigned folds below mask the dividend with (divisor - 1), which is
     // built as divisor + (-1).
     SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
     if (DAG.isKnownToBeAPowerOfTwo(N1)) {
       // fold (urem x, pow2) -> (and x, pow2-1)
       SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
       AddToWorklist(Add.getNode());
       return DAG.getNode(ISD::AND, DL, VT, N0, Add);
     }
     if (N1.getOpcode() == ISD::SHL &&
         DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
       // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
       SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
       AddToWorklist(Add.getNode());
       return DAG.getNode(ISD::AND, DL, VT, N0, Add);
     }
   }
 
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
 
   // If X/C can be simplified by the division-by-constant logic, lower
   // X%C to the equivalent of X-X/C*C.
   // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
   // speculative DIV must not cause a DIVREM conversion.  We guard against this
   // by skipping the simplification if isIntDivCheap().  When div is not cheap,
   // combine will not return a DIVREM.  Regardless, checking cheapness here
   // makes sense since the simplification results in fatter code.
   if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
     // Note that N, the remainder node itself, is what gets passed to the
     // *DIVLike helper; only its operands stand in for the division.
     SDValue OptimizedDiv =
         isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N);
     if (OptimizedDiv.getNode() && OptimizedDiv.getOpcode() != ISD::UDIVREM &&
         OptimizedDiv.getOpcode() != ISD::SDIVREM) {
       SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
       SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
       AddToWorklist(OptimizedDiv.getNode());
       AddToWorklist(Mul.getNode());
       return Sub;
     }
   }
 
   // sdiv, srem -> sdivrem (likewise for the unsigned opcodes).  Result 1 of
   // the combined node is returned as the replacement for this remainder.
   if (SDValue DivRem = useDivRem(N))
     return DivRem.getValue(1);
 
   return SDValue();
 }
 
 /// Combine an ISD::MULHS node (high half of a signed multiply). Returns the
 /// replacement value, or an empty SDValue if nothing applies.
 SDValue DAGCombiner::visitMULHS(SDNode *N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
   // (mulhs x, 0) -> 0 for vectors: an all-zeros build vector on either side
   // is itself the result.
   if (VT.isVector()) {
     if (ISD::isBuildVectorAllZeros(RHS.getNode()))
       return RHS;
     if (ISD::isBuildVectorAllZeros(LHS.getNode()))
       return LHS;
   }
 
   // (mulhs x, 0) -> 0
   if (isNullConstant(RHS))
     return RHS;
 
   // (mulhs x, 1) -> (sra x, size(x)-1)
   if (isOneConstant(RHS)) {
     EVT OpVT = LHS.getValueType();
     SDValue SignShift = DAG.getConstant(LHS.getValueSizeInBits() - 1, DL,
                                         getShiftAmountTy(OpVT));
     return DAG.getNode(ISD::SRA, DL, OpVT, LHS, SignShift);
   }
 
   // (mulhs x, undef) -> 0
   if (LHS.isUndef() || RHS.isUndef())
     return DAG.getConstant(0, DL, VT);
 
   // The remaining transform needs a simple scalar type: when a multiply of
   // twice the width is legal, do the multiply there and shift the high half
   // down.
   if (!VT.isSimple() || VT.isVector())
     return SDValue();
 
   unsigned NarrowBits = VT.getSimpleVT().getSizeInBits();
   EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), NarrowBits * 2);
   if (!TLI.isOperationLegal(ISD::MUL, WideVT))
     return SDValue();
 
   SDValue WideLHS = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, LHS);
   SDValue WideRHS = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, RHS);
   SDValue Product = DAG.getNode(ISD::MUL, DL, WideVT, WideLHS, WideRHS);
   SDValue ShAmt = DAG.getConstant(NarrowBits, DL,
                                   getShiftAmountTy(Product.getValueType()));
   SDValue HighHalf = DAG.getNode(ISD::SRL, DL, WideVT, Product, ShAmt);
   return DAG.getNode(ISD::TRUNCATE, DL, VT, HighHalf);
 }
 
 /// Combine an ISD::MULHU node (high half of an unsigned multiply). Returns the
 /// replacement value, or an empty SDValue if nothing applies.
 SDValue DAGCombiner::visitMULHU(SDNode *N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
   // (mulhu x, 0) -> 0 for vectors: an all-zeros build vector on either side
   // is itself the result.
   if (VT.isVector()) {
     if (ISD::isBuildVectorAllZeros(RHS.getNode()))
       return RHS;
     if (ISD::isBuildVectorAllZeros(LHS.getNode()))
       return LHS;
   }
 
   // (mulhu x, 0) -> 0
   if (isNullConstant(RHS))
     return RHS;
 
   // (mulhu x, 1) -> 0
   if (isOneConstant(RHS))
     return DAG.getConstant(0, DL, LHS.getValueType());
 
   // (mulhu x, undef) -> 0
   if (LHS.isUndef() || RHS.isUndef())
     return DAG.getConstant(0, DL, VT);
 
   // The remaining transform needs a simple scalar type: when a multiply of
   // twice the width is legal, do the multiply there and shift the high half
   // down.
   if (!VT.isSimple() || VT.isVector())
     return SDValue();
 
   unsigned NarrowBits = VT.getSimpleVT().getSizeInBits();
   EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), NarrowBits * 2);
   if (!TLI.isOperationLegal(ISD::MUL, WideVT))
     return SDValue();
 
   SDValue WideLHS = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, LHS);
   SDValue WideRHS = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, RHS);
   SDValue Product = DAG.getNode(ISD::MUL, DL, WideVT, WideLHS, WideRHS);
   SDValue ShAmt = DAG.getConstant(NarrowBits, DL,
                                   getShiftAmountTy(Product.getValueType()));
   SDValue HighHalf = DAG.getNode(ISD::SRL, DL, WideVT, Product, ShAmt);
   return DAG.getNode(ISD::TRUNCATE, DL, VT, HighHalf);
 }
 
 /// Perform optimizations common to nodes that compute two values. LoOp and HiOp
 /// give the opcodes for the two computations that are being performed. Returns
 /// the result of CombineTo when a simplification was made, and an empty SDValue
 /// otherwise.
 SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                                 unsigned HiOp) {
   // If the high half is not needed, just compute the low half.
   // The same node is supplied for both results of N.
   bool HiExists = N->hasAnyUseOfValue(1);
   if (!HiExists &&
       (!LegalOperations ||
        TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
     SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
     return CombineTo(N, Res, Res);
   }
 
   // If the low half is not needed, just compute the high half.
   // Note that this check uses isOperationLegal, whereas the low-half check
   // above also accepts Custom lowering.
   bool LoExists = N->hasAnyUseOfValue(0);
   if (!LoExists &&
       (!LegalOperations ||
        TLI.isOperationLegal(HiOp, N->getValueType(1)))) {
     SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
     return CombineTo(N, Res, Res);
   }
 
   // If both halves are used, return as it is.
   if (LoExists && HiExists)
     return SDValue();
 
   // If the two computed results can be simplified separately, separate them.
   // Each half is built speculatively and run through combine(); it is only
   // used if combine() produced a different node that is acceptable at the
   // current legalization stage.
   if (LoExists) {
     SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
     AddToWorklist(Lo.getNode());
     SDValue LoOpt = combine(Lo.getNode());
     if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
         (!LegalOperations ||
          TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType())))
       return CombineTo(N, LoOpt, LoOpt);
   }
 
   if (HiExists) {
     SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
     AddToWorklist(Hi.getNode());
     SDValue HiOpt = combine(Hi.getNode());
     if (HiOpt.getNode() && HiOpt != Hi &&
         (!LegalOperations ||
          TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType())))
       return CombineTo(N, HiOpt, HiOpt);
   }
 
   return SDValue();
 }
 
 /// Combine an ISD::SMUL_LOHI node. Returns the result of the combine, or an
 /// empty SDValue if the node is left unchanged.
 SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
   // First try the simplifications shared by all two-result nodes.
   if (SDValue Shared = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
     return Shared;
 
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
   // If a multiply of twice the width is legal, transform the smul_lohi into
   // that wider multiply plus a shift. Only simple scalar types qualify.
   if (!VT.isSimple() || VT.isVector())
     return SDValue();
 
   unsigned NarrowBits = VT.getSimpleVT().getSizeInBits();
   EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), NarrowBits * 2);
   if (!TLI.isOperationLegal(ISD::MUL, WideVT))
     return SDValue();
 
   SDValue WideLHS = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, N->getOperand(0));
   SDValue WideRHS = DAG.getNode(ISD::SIGN_EXTEND, DL, WideVT, N->getOperand(1));
   SDValue Product = DAG.getNode(ISD::MUL, DL, WideVT, WideLHS, WideRHS);
 
   // High result: the upper half of the product shifted down, then truncated.
   SDValue ShAmt = DAG.getConstant(NarrowBits, DL,
                                   getShiftAmountTy(Product.getValueType()));
   SDValue HiPart = DAG.getNode(ISD::SRL, DL, WideVT, Product, ShAmt);
   HiPart = DAG.getNode(ISD::TRUNCATE, DL, VT, HiPart);
 
   // Low result: the product truncated to the original width.
   SDValue LoPart = DAG.getNode(ISD::TRUNCATE, DL, VT, Product);
   return CombineTo(N, LoPart, HiPart);
 }
 
 /// Combine an ISD::UMUL_LOHI node. Returns the result of the combine, or an
 /// empty SDValue if the node is left unchanged.
 SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
   // First try the simplifications shared by all two-result nodes.
   if (SDValue Shared = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
     return Shared;
 
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
   // If a multiply of twice the width is legal, transform the umul_lohi into
   // that wider multiply plus a shift. Only simple scalar types qualify.
   if (!VT.isSimple() || VT.isVector())
     return SDValue();
 
   unsigned NarrowBits = VT.getSimpleVT().getSizeInBits();
   EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), NarrowBits * 2);
   if (!TLI.isOperationLegal(ISD::MUL, WideVT))
     return SDValue();
 
   SDValue WideLHS = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, N->getOperand(0));
   SDValue WideRHS = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, N->getOperand(1));
   SDValue Product = DAG.getNode(ISD::MUL, DL, WideVT, WideLHS, WideRHS);
 
   // High result: the upper half of the product shifted down, then truncated.
   SDValue ShAmt = DAG.getConstant(NarrowBits, DL,
                                   getShiftAmountTy(Product.getValueType()));
   SDValue HiPart = DAG.getNode(ISD::SRL, DL, WideVT, Product, ShAmt);
   HiPart = DAG.getNode(ISD::TRUNCATE, DL, VT, HiPart);
 
   // Low result: the product truncated to the original width.
   SDValue LoPart = DAG.getNode(ISD::TRUNCATE, DL, VT, Product);
   return CombineTo(N, LoPart, HiPart);
 }
 
 /// Combine an ISD::SMULO node (signed multiply with overflow flag). Returns
 /// the replacement node, or an empty SDValue if nothing applies.
 SDValue DAGCombiner::visitSMULO(SDNode *N) {
   // (smulo x, 2) -> (saddo x, x)
   // The constant is matched by its bit pattern. In a 2-bit type the pattern
   // 0b10 is the signed value -2, and (smulo 1, -2) does not overflow while
   // (saddo 1, 1) does, so the fold is only valid for types wider than 2 bits.
   if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)))
     if (C2->getAPIntValue() == 2 &&
         N->getValueType(0).getScalarSizeInBits() > 2)
       return DAG.getNode(ISD::SADDO, SDLoc(N), N->getVTList(),
                          N->getOperand(0), N->getOperand(0));
 
   return SDValue();
 }
 
 /// Combine an ISD::UMULO node (unsigned multiply with overflow flag). Returns
 /// the replacement node, or an empty SDValue if nothing applies.
 SDValue DAGCombiner::visitUMULO(SDNode *N) {
   // Only a scalar constant right-hand side equal to 2 is of interest:
   // (umulo x, 2) -> (uaddo x, x)
   auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!RHSC || RHSC->getAPIntValue() != 2)
     return SDValue();
 
   SDValue X = N->getOperand(0);
   return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), X, X);
 }
 
 /// Combine an integer min/max node (SMIN, SMAX, UMIN or UMAX). Returns the
 /// replacement value, or an empty SDValue if nothing applies.
 SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   EVT VT = LHS.getValueType();
   const unsigned Opcode = N->getOpcode();
 
   // Give the generic vector binop simplifier the first shot.
   if (VT.isVector()) {
     SDValue FoldedVOp = SimplifyVBinOp(N);
     if (FoldedVOp)
       return FoldedVOp;
   }
 
   // Two non-opaque constant operands fold to a constant.
   ConstantSDNode *LHSC = getAsNonOpaqueConstant(LHS);
   ConstantSDNode *RHSC = getAsNonOpaqueConstant(RHS);
   if (LHSC && RHSC)
     return DAG.FoldConstantArithmetic(Opcode, SDLoc(N), VT, LHSC, RHSC);
 
   // Canonicalize a lone constant operand to the right-hand side.
   if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) &&
       !DAG.isConstantIntBuildVectorOrConstantInt(RHS))
     return DAG.getNode(Opcode, SDLoc(N), VT, RHS, LHS);
 
   // If the sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
   // Only do this if the current op isn't legal and the flipped one is.
   const TargetLowering &Lowering = DAG.getTargetLoweringInfo();
   if (Lowering.isOperationLegal(Opcode, VT))
     return SDValue();
 
   auto SignBitIsClearOrUndef = [this](SDValue V) {
     return V.isUndef() || DAG.SignBitIsZero(V);
   };
   if (!SignBitIsClearOrUndef(LHS) || !SignBitIsClearOrUndef(RHS))
     return SDValue();
 
   unsigned FlippedOpcode;
   switch (Opcode) {
   case ISD::SMIN:
     FlippedOpcode = ISD::UMIN;
     break;
   case ISD::SMAX:
     FlippedOpcode = ISD::UMAX;
     break;
   case ISD::UMIN:
     FlippedOpcode = ISD::SMIN;
     break;
   case ISD::UMAX:
     FlippedOpcode = ISD::SMAX;
     break;
   default:
     llvm_unreachable("Unknown MINMAX opcode");
   }
 
   if (!Lowering.isOperationLegal(FlippedOpcode, VT))
     return SDValue();
   return DAG.getNode(FlippedOpcode, SDLoc(N), VT, LHS, RHS);
 }
 
 /// If this is a binary operator with two operands of the same opcode, try to
 /// simplify it.
 ///
 /// The transforms hoist the operands' common operation above N's own
 /// operation, e.g. (OP (zext x), (zext y)) -> (zext (OP x, y)). Callers must
 /// guarantee that both operands of N have the same opcode (asserted below).
 /// Returns the replacement value, or an empty SDValue if no transform applies.
 SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
   assert(N0.getOpcode() == N1.getOpcode() && "Bad input!");
 
   // Bail early if none of these transforms apply.
   if (N0.getNumOperands() == 0) return SDValue();
 
   // For each of OP in AND/OR/XOR:
   // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
   // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
   // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
   // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y))
   // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
   //
   // do not sink logical op inside of a vector extend, since it may combine
   // into a vsetcc.
   EVT Op0VT = N0.getOperand(0).getValueType();
   if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
        N0.getOpcode() == ISD::SIGN_EXTEND ||
        N0.getOpcode() == ISD::BSWAP ||
        // Avoid infinite looping with PromoteIntBinOp.
        (N0.getOpcode() == ISD::ANY_EXTEND &&
         (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) ||
        (N0.getOpcode() == ISD::TRUNCATE &&
         (!TLI.isZExtFree(VT, Op0VT) ||
          !TLI.isTruncateFree(Op0VT, VT)) &&
         TLI.isTypeLegal(Op0VT))) &&
       !VT.isVector() &&
       Op0VT == N1.getOperand(0).getValueType() &&
       (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) {
     // Despite its name, ORNode is built with N's own opcode.
     SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
                                  N0.getOperand(0).getValueType(),
                                  N0.getOperand(0), N1.getOperand(0));
     AddToWorklist(ORNode.getNode());
     return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode);
   }
 
   // For each of OP in SHL/SRL/SRA/AND...
   //   fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z)
   //   fold (or  (OP x, z), (OP y, z)) -> (OP (or  x, y), z)
   //   fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z)
   // This requires both operands to share the same second operand z.
   if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL ||
        N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) &&
       N0.getOperand(1) == N1.getOperand(1)) {
     // Despite its name, ORNode is built with N's own opcode.
     SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
                                  N0.getOperand(0).getValueType(),
                                  N0.getOperand(0), N1.getOperand(0));
     AddToWorklist(ORNode.getNode());
     return DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
                        ORNode, N0.getOperand(1));
   }
 
   // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
   // Only perform this optimization up until type legalization, before
   // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
   // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
   // we don't want to undo this promotion.
   // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
   // on scalars.
   if ((N0.getOpcode() == ISD::BITCAST ||
        N0.getOpcode() == ISD::SCALAR_TO_VECTOR) &&
        Level <= AfterLegalizeTypes) {
     SDValue In0 = N0.getOperand(0);
     SDValue In1 = N1.getOperand(0);
     EVT In0Ty = In0.getValueType();
     EVT In1Ty = In1.getValueType();
     SDLoc DL(N);
     // If both incoming values are integers, and the original types are the
     // same.
     if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
       SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1);
       SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op);
       AddToWorklist(Op.getNode());
       return BC;
     }
   }
 
   // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
   // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
   // If both shuffles use the same mask, and both shuffle within a single
   // vector, then it is worthwhile to move the swizzle after the operation.
   // The type-legalizer generates this pattern when loading illegal
   // vector types from memory. In many cases this allows additional shuffle
   // optimizations.
   // There are other cases where moving the shuffle after the xor/and/or
   // is profitable even if shuffles don't perform a swizzle.
   // If both shuffles use the same mask, and both shuffles have the same first
   // or second operand, then it might still be profitable to move the shuffle
   // after the xor/and/or operation.
   if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
     ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0);
     ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1);
 
     assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
            "Inputs to shuffles are not the same type");
 
     // Check that both shuffles use the same mask. The masks are known to be of
     // the same length because the result vector type is the same.
     // Check also that shuffles have only one use to avoid introducing extra
     // instructions.
     if (SVN0->hasOneUse() && SVN1->hasOneUse() &&
         SVN0->getMask().equals(SVN1->getMask())) {
       SDValue ShOp = N0->getOperand(1);
 
       // Don't try to fold this node if it requires introducing a
       // build vector of all zeros that might be illegal at this stage.
       // For XOR the shared operand cancels out, so it is replaced by a zero
       // constant; once types are legal ShOp is cleared to disable the fold.
       if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
         if (!LegalTypes)
           ShOp = DAG.getConstant(0, SDLoc(N), VT);
         else
           ShOp = SDValue();
       }
 
       // (AND (shuf (A, C), shuf (B, C))) -> shuf (AND (A, B), C)
       // (OR  (shuf (A, C), shuf (B, C))) -> shuf (OR  (A, B), C)
       // (XOR (shuf (A, C), shuf (B, C))) -> shuf (XOR (A, B), V_0)
       if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
         SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                       N0->getOperand(0), N1->getOperand(0));
         AddToWorklist(NewNode.getNode());
         return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp,
                                     SVN0->getMask());
       }
 
       // Don't try to fold this node if it requires introducing a
       // build vector of all zeros that might be illegal at this stage.
       // Same handling as above, now for a shared first operand.
       ShOp = N0->getOperand(0);
       if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
         if (!LegalTypes)
           ShOp = DAG.getConstant(0, SDLoc(N), VT);
         else
           ShOp = SDValue();
       }
 
       // (AND (shuf (C, A), shuf (C, B))) -> shuf (C, AND (A, B))
       // (OR  (shuf (C, A), shuf (C, B))) -> shuf (C, OR  (A, B))
       // (XOR (shuf (C, A), shuf (C, B))) -> shuf (V_0, XOR (A, B))
       if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) {
         SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                       N0->getOperand(1), N1->getOperand(1));
         AddToWorklist(NewNode.getNode());
         return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode,
                                     SVN0->getMask());
       }
     }
   }
 
   return SDValue();
 }
 
 /// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
 ///
 /// IsAnd selects between the 'and' form (true) and the 'or' form (false) of
 /// the logic op whose operands are N0 and N1. DL is the debug location given
 /// to the replacement setcc. Returns an empty SDValue if either operand is
 /// not setcc-equivalent, if the types do not line up, or if no fold applies.
 SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                                        const SDLoc &DL) {
   // Decompose both operands into (lhs, rhs, condition code); give up if
   // either one is not equivalent to a setcc.
   SDValue LL, LR, RL, RR, N0CC, N1CC;
   if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
       !isSetCCEquivalent(N1, RL, RR, N1CC))
     return SDValue();
 
   assert(N0.getValueType() == N1.getValueType() &&
          "Unexpected operand types for bitwise logic op");
   assert(LL.getValueType() == LR.getValueType() &&
          RL.getValueType() == RR.getValueType() &&
          "Unexpected operand types for setcc");
 
   // If we're here post-legalization or the logic op type is not i1, the logic
   // op type must match a setcc result type. Also, all folds require new
   // operations on the left and right operands, so those types must match.
   EVT VT = N0.getValueType();
   EVT OpVT = LL.getValueType();
   if (LegalOperations || VT.getScalarType() != MVT::i1)
     if (VT != getSetCCResultType(OpVT))
       return SDValue();
   if (OpVT != RL.getValueType())
     return SDValue();
 
   ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
   ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
   bool IsInteger = OpVT.isInteger();
   // Same predicate against the same right-hand constant (0 or -1): merge the
   // two left-hand values with a bitwise op and compare once.
   if (LR == RR && CC0 == CC1 && IsInteger) {
     bool IsZero = isNullConstantOrNullSplatConstant(LR);
     bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR);
 
     // All bits clear?
     bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
     // All sign bits clear?
     bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
     // Any bits set?
     bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
     // Any sign bits set?
     bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;
 
     // (and (seteq X,  0), (seteq Y,  0)) --> (seteq (or X, Y),  0)
     // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
     // (or  (setne X,  0), (setne Y,  0)) --> (setne (or X, Y),  0)
     // (or  (setlt X,  0), (setlt Y,  0)) --> (setlt (or X, Y),  0)
     if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
       SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
       AddToWorklist(Or.getNode());
       return DAG.getSetCC(DL, VT, Or, LR, CC1);
     }
 
     // All bits set?
     bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
     // All sign bits set?
     bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
     // Any bits clear?
     bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
     // Any sign bits clear?
     bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;
 
     // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
     // (and (setlt X,  0), (setlt Y,  0)) --> (setlt (and X, Y),  0)
     // (or  (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
     // (or  (setgt X, -1), (setgt Y, -1)) --> (setgt (and X, Y), -1)
     if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
       SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
       AddToWorklist(And.getNode());
       return DAG.getSetCC(DL, VT, And, LR, CC1);
     }
   }
 
   // TODO: What is the 'or' equivalent of this fold?
   // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
   // Only scalar constants are matched here, and the operand type must be
   // wider than one bit.
   if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 &&
       IsInteger && CC0 == ISD::SETNE &&
       ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
        (isAllOnesConstant(LR) && isNullConstant(RR)))) {
     SDValue One = DAG.getConstant(1, DL, OpVT);
     SDValue Two = DAG.getConstant(2, DL, OpVT);
     SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
     AddToWorklist(Add.getNode());
     return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
   }
 
   // Try more general transforms if the predicates match and the only user of
   // the compares is the 'and' or 'or'.
   if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
       N0.hasOneUse() && N1.hasOneUse()) {
     // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
     // or  (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
     if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
       SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
       SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
       SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
       SDValue Zero = DAG.getConstant(0, DL, OpVT);
       return DAG.getSetCC(DL, VT, Or, Zero, CC1);
     }
   }
 
   // Canonicalize equivalent operands to LL == RL.
   // Swapping the second compare's operands requires the swapped form of its
   // condition code as well.
   if (LL == RR && LR == RL) {
     CC1 = ISD::getSetCCSwappedOperands(CC1);
     std::swap(RL, RR);
   }
 
   // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
   // (or  (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
   if (LL == RL && LR == RR) {
     ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger)
                                 : ISD::getSetCCOrOperation(CC0, CC1, IsInteger);
     // After operation legalization the merged condition code and the setcc
     // itself must both be legal for the target.
     if (NewCC != ISD::SETCC_INVALID &&
         (!LegalOperations ||
          (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
           TLI.isOperationLegal(ISD::SETCC, OpVT))))
       return DAG.getSetCC(DL, VT, LL, LR, NewCC);
   }
 
   return SDValue();
 }
 
 /// Shared DAGCombine rules that reduce two values combined by an And
 /// operation to a single value. Keeping them in one place makes them reusable
 /// in the context of visitSELECT(). Rules involving constants are not included
 /// as visitSELECT() already handles those cases.
 ///
 /// \param N0 First operand of the and-like combination.
 /// \param N1 Second operand; its value type is used as the result type.
 /// \param N  Node being combined. It supplies the debug location for new nodes
 ///           and is returned (as SDValue(N, 0)) when a fold rewrote N0 in
 ///           place.
 /// \returns The folded value, SDValue(N, 0) after an in-place update, or an
 ///          empty SDValue if no rule matched.
 SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
   EVT VT = N1.getValueType();
   SDLoc DL(N);
 
   // fold (and x, undef) -> 0
   if (N0.isUndef() || N1.isUndef())
     return DAG.getConstant(0, DL, VT);
 
   if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
     return V;
 
   if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
       VT.getSizeInBits() <= 64) {
     if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
       if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
         // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
         // immediate for an add, but it is legal if its top c2 bits are set,
         // transform the ADD so the immediate doesn't need to be materialized
         // in a register.
         APInt ADDC = ADDI->getAPIntValue();
         APInt SRLC = SRLI->getAPIntValue();
         if (ADDC.getMinSignedBits() <= 64 &&
             SRLC.ult(VT.getSizeInBits()) &&
             !TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
           // The top c2 bits of the AND result are cleared by the lshr operand,
           // so they are free to be set in the add immediate.
           APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
                                              SRLC.getZExtValue());
           if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
             ADDC |= Mask;
             if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
               SDLoc DL0(N0);
               SDValue NewAdd =
                 DAG.getNode(ISD::ADD, DL0, VT,
                             N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
               CombineTo(N0.getNode(), NewAdd);
               // Return N so it doesn't get rechecked!
               return SDValue(N, 0);
             }
           }
         }
       }
     }
   }
 
   // Reduce bit extract of low half of an integer to the narrower type.
   // (and (srl i64:x, K), KMask) ->
   //   (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask)
   if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
     if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
       if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
         unsigned Size = VT.getSizeInBits();
         const APInt &AndMask = CAnd->getAPIntValue();
         unsigned ShiftBits = CShift->getZExtValue();
 
         // Bail out, this node will probably disappear anyway.
         if (ShiftBits == 0)
           return SDValue();
 
         unsigned MaskBits = AndMask.countTrailingOnes();
         EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
 
         if (AndMask.isMask() &&
             // Required bits must not span the two halves of the integer and
             // must fit in the half size type.
             (ShiftBits + MaskBits <= Size / 2) &&
             TLI.isNarrowingProfitable(VT, HalfVT) &&
             TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
             TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
             TLI.isTruncateFree(VT, HalfVT) &&
             TLI.isZExtFree(HalfVT, VT)) {
           // The isNarrowingProfitable is to avoid regressions on PPC and
           // AArch64 which match a few 64-bit bit insert / bit extract patterns
           // on downstream users of this. Those patterns could probably be
           // extended to handle extensions mixed in.
 
           // Debug location of the shift, shared by every node built below.
           // Declared as an SDLoc (not an SDValue) so that no implicit
           // SDValue -> SDLoc conversion is needed at each use.
           SDLoc SL(N0);
           assert(MaskBits <= Size);
 
           // Build the narrowed extract: truncate the source, shift and mask in
           // the half-width type, then zero-extend back to the original type.
           EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
           SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
                                       N0.getOperand(0));
 
           SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
           SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
           SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
           SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
           return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
         }
       }
     }
   }
 
   return SDValue();
 }
 
 /// Decide whether masking the result of \p LoadN with the constant \p AndC can
 /// be turned into a zero-extending load.
 ///
 /// \param AndC         The AND constant; it must be a low-bit mask.
 /// \param LoadN        The load whose result is being masked.
 /// \param LoadResultTy Result type used when querying ZEXTLOAD legality.
 /// \param ExtVT        [out] Set, once the mask has been validated, to the
 ///                     integer type as wide as the mask's trailing ones.
 /// \returns true if a ZEXTLOAD of \p ExtVT may replace the masked load.
 bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
                                    EVT LoadResultTy, EVT &ExtVT) {
   const APInt &MaskVal = AndC->getAPIntValue();
   if (!MaskVal.isMask())
     return false;
 
   ExtVT = EVT::getIntegerVT(*DAG.getContext(), MaskVal.countTrailingOnes());
   const EVT MemVT = LoadN->getMemoryVT();
 
   // The target is only consulted once operations must be legal.
   auto ZExtLoadAllowed = [&]() {
     return !LegalOperations ||
            TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT);
   };
 
   // Same memory width: ZEXTLOAD will match without needing to change the size
   // of the value being loaded.
   if (ExtVT == MemVT && ZExtLoadAllowed())
     return true;
 
   // Everything below would narrow the access, which must never be done to a
   // volatile load.
   if (LoadN->isVolatile())
     return false;
 
   // The new load has to be strictly narrower and of a round integer type;
   // non-round loads can be expensive (and would be wrong if the type is not
   // byte sized).
   if (!(MemVT.bitsGT(ExtVT) && ExtVT.isRound()))
     return false;
 
   return ZExtLoadAllowed() &&
          TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT);
 }
 
 /// Check whether the load or store \p LDST may be replaced by a narrower
 /// access of type \p MemVT located \p ShAmt bits into the original access.
 ///
 /// \param LDST    The load/store to narrow; a null pointer yields false.
 /// \param ExtType Extension kind the narrowed load would use (loads only).
 /// \param MemVT   Memory type of the proposed narrow access.
 /// \param ShAmt   Bit offset of the narrow access; must be a whole number of
 ///                bytes.
 /// \returns true if none of the legality/profitability checks below object.
 bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
                                     ISD::LoadExtType ExtType, EVT &MemVT,
                                     unsigned ShAmt) {
   // Need a node, and only byte offsets are allowed.
   if (!LDST || ShAmt % 8 != 0)
     return false;
 
   // Do not generate accesses of non-round integer types since these can
   // be expensive (and would be wrong if the type is not byte sized).
   if (!MemVT.isRound())
     return false;
 
   // Don't change the width of a volatile access.
   if (LDST->isVolatile())
     return false;
 
   // Verify that we are actually reducing the width here.
   const unsigned NarrowBits = MemVT.getSizeInBits();
   if (LDST->getMemoryVT().getSizeInBits() < NarrowBits)
     return false;
 
   // A non-zero offset changes the alignment; make sure the target can still
   // perform the access.
   if (ShAmt != 0 &&
       !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
                               LDST->getAddressSpace(), ShAmt / 8))
     return false;
 
   // It's not possible to generate a constant of extended or untyped type.
   EVT PtrType = LDST->getBasePtr().getValueType();
   if (PtrType == MVT::Untyped || PtrType.isExtended())
     return false;
 
   if (auto *Load = dyn_cast<LoadSDNode>(LDST)) {
     // A load with several users would have to be duplicated.
     if (!SDValue(Load, 0).hasOneUse())
       return false;
 
     if (LegalOperations &&
         !TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT))
       return false;
 
     // Only plain loads producing exactly (value, chain) are handled.  A
     // pre-increment load, for example, produces an extra value; transforming
     // it would not be equivalent and the downstream logic that replaces uses
     // would get things wrong.
     if (Load->getNumValues() > 2)
       return false;
 
     // If the load that we're shrinking is an extload and we're not just
     // discarding the extension we can't simply shrink the load. Bail.
     // TODO: It would be possible to merge the extensions in some cases.
     if (Load->getExtensionType() != ISD::NON_EXTLOAD &&
         Load->getMemoryVT().getSizeInBits() < NarrowBits + ShAmt)
       return false;
 
     return TLI.shouldReduceLoadWidth(Load, ExtType, MemVT);
   }
 
   assert(isa<StoreSDNode>(LDST) && "It is not a Load nor a Store SDNode");
   auto *Store = cast<StoreSDNode>(LDST);
 
   // Can't write outside the original store.
   if (Store->getMemoryVT().getSizeInBits() < NarrowBits + ShAmt)
     return false;
 
   return !LegalOperations ||
          TLI.isTruncStoreLegal(Store->getValue().getValueType(), MemVT);
 }
 
 /// Walk the operands of \p N, recursing through OR/XOR/AND operands, and
 /// collect what has to change for the constant \p Mask to be pushed back to
 /// the leaves.
 ///
 /// \param N               Node whose operands are inspected.
 /// \param Loads           [out] Loads that should be narrowed to the mask.
 /// \param NodesWithConsts [out] OR/XOR nodes with a constant operand that has
 ///                        bits outside the mask.
 /// \param Mask            The AND mask being propagated.
 /// \param NodeToMask      [in,out] The single other node that is allowed to be
 ///                        masked explicitly; reset to null if the candidate
 ///                        produces more than one data result.
 /// \returns false as soon as an operand makes the propagation impossible.
 bool DAGCombiner::SearchForAndLoads(SDNode *N,
                                     SmallPtrSetImpl<LoadSDNode*> &Loads,
                                     SmallPtrSetImpl<SDNode*> &NodesWithConsts,
                                     ConstantSDNode *Mask,
                                     SDNode *&NodeToMask) {
   const APInt &MaskVal = Mask->getAPIntValue();
   const unsigned ParentOpc = N->getOpcode();
 
   for (unsigned OpIdx = 0, NumOps = N->getNumOperands(); OpIdx != NumOps;
        ++OpIdx) {
     SDValue Op = N->getOperand(OpIdx);
 
     // Vector operands are not handled.
     if (Op.getValueType().isVector())
       return false;
 
     // Constants are always acceptable, but an OR/XOR constant with bits
     // outside the mask has to be fixed up later.
     if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
       const bool IsOrXor = ParentOpc == ISD::OR || ParentOpc == ISD::XOR;
       if (IsOrXor && (MaskVal & C->getAPIntValue()) != C->getAPIntValue())
         NodesWithConsts.insert(N);
       continue;
     }
 
     // Any other operand must be used only here.
     if (!Op.hasOneUse())
       return false;
 
     switch (Op.getOpcode()) {
     case ISD::LOAD: {
       auto *Load = cast<LoadSDNode>(Op);
       EVT ExtVT;
       if (!isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) ||
           !isLegalNarrowLdSt(Load, ISD::ZEXTLOAD, ExtVT))
         return false;
 
       // A ZEXTLOAD that is already small enough needs no change.  Otherwise
       // record the load; LE (rather than LT) also converts equal sized loads
       // to zext.
       const EVT MemVT = Load->getMemoryVT();
       const bool AlreadyNarrowZExt =
           Load->getExtensionType() == ISD::ZEXTLOAD && ExtVT.bitsGE(MemVT);
       if (!AlreadyNarrowZExt && ExtVT.bitsLE(MemVT))
         Loads.insert(Load);
       continue;
     }
     case ISD::ZERO_EXTEND:
     case ISD::AssertZext: {
       EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(),
                                     MaskVal.countTrailingOnes());
       EVT SrcVT = Op.getOpcode() == ISD::AssertZext
                       ? cast<VTSDNode>(Op.getOperand(1))->getVT()
                       : Op.getOperand(0).getValueType();
 
       // An extending node is fine as long as the mask is at least as wide as
       // the type being extended from.
       if (ExtVT.bitsGE(SrcVT))
         continue;
       break;
     }
     case ISD::OR:
     case ISD::XOR:
     case ISD::AND:
       if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
                              NodeToMask))
         return false;
       continue;
     }
 
     // Anything else has to be masked explicitly, and only one such node is
     // allowed alongside the loads found.
     if (NodeToMask)
       return false;
     NodeToMask = Op.getNode();
 
     // The node to be masked must produce exactly one data result (glue and
     // chain results do not count).
     if (NodeToMask->getNumValues() > 1) {
       bool SeenData = false;
       for (unsigned ResNo = 0, NumRes = NodeToMask->getNumValues();
            ResNo != NumRes; ++ResNo) {
         MVT ResVT = SDValue(NodeToMask, ResNo).getSimpleValueType();
         if (ResVT == MVT::Glue || ResVT == MVT::Other)
           continue;
         if (SeenData) {
           NodeToMask = nullptr;
           return false;
         }
         SeenData = true;
       }
       assert(SeenData && "Node to be masked has no data result?");
     }
   }
   return true;
 }
 
 /// Try to push the constant mask of the AND node \p N back up to the loads
 /// that feed it, so that the loads can be narrowed and \p N itself removed.
 ///
 /// \param N   The AND node; operand 1 must be a constant low-bit mask.
 /// \param DAG The DAG in which nodes are created and replaced.
 /// \returns true if the DAG was changed (all uses of \p N then refer to its
 ///          first operand), false if nothing was done.
 bool DAGCombiner::BackwardsPropagateMask(SDNode *N, SelectionDAG &DAG) {
   auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!Mask || !Mask->getAPIntValue().isMask())
     return false;
 
   // No need to do anything if the and directly uses a load.
   if (isa<LoadSDNode>(N->getOperand(0)))
     return false;
 
   SmallPtrSet<LoadSDNode*, 8> Loads;
   SmallPtrSet<SDNode*, 2> NodesWithConsts;
   SDNode *FixupNode = nullptr;
   if (!SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode))
     return false;
 
   // Without any load to narrow there is nothing to gain.
   if (Loads.empty())
     return false;
 
   LLVM_DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
   SDValue MaskOp = N->getOperand(1);
 
   // If it exists, fixup the single node we allow in the tree that needs
   // masking.
   if (FixupNode) {
     LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
     SDValue FixupVal(FixupNode, 0);
     SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
                               FixupNode->getValueType(0), FixupVal, MaskOp);
     DAG.ReplaceAllUsesOfValueWith(FixupVal, And);
     // The replacement above also rewrote the new AND's own operand; point it
     // back at the original value.
     if (And.getOpcode() == ISD::AND)
       DAG.UpdateNodeOperands(And.getNode(), FixupVal, MaskOp);
   }
 
   // Narrow any constants that need it.
   for (auto *LogicN : NodesWithConsts) {
     SDValue Other = LogicN->getOperand(0);
     SDValue Const = LogicN->getOperand(1);
 
     // Make sure the constant ends up in the second slot.
     if (isa<ConstantSDNode>(Other))
       std::swap(Other, Const);
 
     SDValue MaskedConst = DAG.getNode(ISD::AND, SDLoc(Const),
                                       Const.getValueType(), Const, MaskOp);
 
     DAG.UpdateNodeOperands(LogicN, Other, MaskedConst);
   }
 
   // Create narrow loads.
   for (auto *Load : Loads) {
     LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
     SDValue LoadVal(Load, 0);
     SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
                               LoadVal, MaskOp);
     DAG.ReplaceAllUsesOfValueWith(LoadVal, And);
     if (And.getOpcode() == ISD::AND) {
       SDNode *Updated = DAG.UpdateNodeOperands(And.getNode(), LoadVal, MaskOp);
       And = SDValue(Updated, 0);
     }
     SDValue NewLoad = ReduceLoadWidth(And.getNode());
     assert(NewLoad &&
            "Shouldn't be masking the load if it can't be narrowed");
     CombineTo(Load, NewLoad, NewLoad.getValue(1));
   }
 
   // Every leaf is masked now, so the original AND is redundant.
   DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
   return true;
 }
 
 // Rewrite
 //    x &  (-1 'logical shift' y)
 // as
 //    (x 'opposite logical shift' y) 'logical shift' y
 // when the target reports that it prefers the shift pair over the mask.
 // Returns an empty SDValue if the target has no such preference or neither
 // operand of the AND is a single-use shift of all-ones.
 SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) {
   assert(N->getOpcode() == ISD::AND);
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
   // Do we actually prefer shifts over mask?
   if (!TLI.preferShiftsToClearExtremeBits(N0))
     return SDValue();
 
   // State filled in by a successful match of (-1 '[outer] logical shift' y).
   unsigned OuterShift;
   unsigned InnerShift; // The opposite direction to the OuterShift.
   SDValue Y;           // Shift amount.
   auto matchMask = [&OuterShift, &InnerShift, &Y](SDValue M) -> bool {
     if (!M.hasOneUse())
       return false;
     OuterShift = M->getOpcode();
     switch (OuterShift) {
     case ISD::SHL:
       InnerShift = ISD::SRL;
       break;
     case ISD::SRL:
       InnerShift = ISD::SHL;
       break;
     default:
       return false;
     }
     if (!isAllOnesConstant(M->getOperand(0)))
       return false;
     Y = M->getOperand(1);
     return true;
   };
 
   // The mask may sit on either side of the AND; the right-hand side is tried
   // first and the remaining operand is the value being cleared.
   const bool MaskIsRHS = matchMask(N1);
   if (!MaskIsRHS && !matchMask(N0))
     return SDValue();
   SDValue X = MaskIsRHS ? N0 : N1;
 
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   // Shift the unwanted bits out with the opposite shift, then shift back.
   SDValue ShiftedOut = DAG.getNode(InnerShift, DL, VT, X, Y);
   return DAG.getNode(OuterShift, DL, VT, ShiftedOut, Y);
 }
 
 /// Combine the AND node \p N.
 ///
 /// The folds below are tried in order and the first one that applies wins:
 /// trivial identities and constant folds, vector special cases, dropping or
 /// narrowing masks on loads, mask back-propagation, the shared visitANDLike()
 /// rules, demanded-bits simplification, zero-extending load formation, the
 /// halfword bswap match and finally unfolding the mask into shifts.
 ///
 /// \returns The replacement value; SDValue(N, 0) when a fold has already
 ///          updated the DAG itself (e.g. via CombineTo()); or an empty SDValue
 ///          when no fold applied.
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N1.getValueType();
 
   // x & x --> x
   if (N0 == N1)
     return N0;
 
   // fold vector ops
   if (VT.isVector()) {
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
     // fold (and x, 0) -> 0, vector edition
     if (ISD::isBuildVectorAllZeros(N0.getNode()))
       // do not return N0, because undef node may exist in N0
       return DAG.getConstant(APInt::getNullValue(N0.getScalarValueSizeInBits()),
                              SDLoc(N), N0.getValueType());
     if (ISD::isBuildVectorAllZeros(N1.getNode()))
       // do not return N1, because undef node may exist in N1
       return DAG.getConstant(APInt::getNullValue(N1.getScalarValueSizeInBits()),
                              SDLoc(N), N1.getValueType());
 
     // fold (and x, -1) -> x, vector edition
     if (ISD::isBuildVectorAllOnes(N0.getNode()))
       return N1;
     if (ISD::isBuildVectorAllOnes(N1.getNode()))
       return N0;
   }
 
   // fold (and c1, c2) -> c1&c2
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N0C && N1C && !N1C->isOpaque())
     return DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, N0C, N1C);
   // canonicalize constant to RHS
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
     return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
   // fold (and x, -1) -> x
   if (isAllOnesConstant(N1))
     return N0;
   // if (and x, c) is known to be zero, return 0
   unsigned BitWidth = VT.getScalarSizeInBits();
   if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
                                    APInt::getAllOnesValue(BitWidth)))
     return DAG.getConstant(0, SDLoc(N), VT);
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // reassociate and
   if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1))
     return RAND;
 
   // Try to convert a constant mask AND into a shuffle clear mask.
   if (VT.isVector())
     if (SDValue Shuffle = XformToShuffleWithZero(N))
       return Shuffle;
 
   // fold (and (or x, C), D) -> D if (C & D) == D
   // MatchSubset(LHS, RHS) is true when every bit set in RHS is also set in LHS.
   auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
     return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
   };
   if (N0.getOpcode() == ISD::OR &&
       ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
     return N1;
   // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
   if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
     SDValue N0Op0 = N0.getOperand(0);
     // Mask holds the bits the AND clears, restricted to the width of the
     // value being extended.
     APInt Mask = ~N1C->getAPIntValue();
     Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits());
     if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
       SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
                                  N0.getValueType(), N0Op0);
 
       // Replace uses of the AND with uses of the Zero extend node.
       CombineTo(N, Zext);
 
       // We actually want to replace all uses of the any_extend with the
       // zero_extend, to avoid duplicating things.  This will later cause this
       // AND to be folded.
       CombineTo(N0.getNode(), Zext);
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
   // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
   // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
   // already be zero by virtue of the width of the base type of the load.
   //
   // the 'X' node here can either be nothing or an extract_vector_elt to catch
   // more cases.
   if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
        N0.getOperand(0).getOpcode() == ISD::LOAD &&
        N0.getOperand(0).getResNo() == 0) ||
       (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
     LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
                                          N0 : N0.getOperand(0) );
 
     // Get the constant (if applicable) the zero'th operand is being ANDed with.
     // This can be a pure constant or a vector splat, in which case we treat the
     // vector as a scalar and use the splat value.
     // The 1-bit zero default stays in place when N1 is neither form, and it
     // can never pass the all-ones test further down.
     APInt Constant = APInt::getNullValue(1);
     if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
       Constant = C->getAPIntValue();
     } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
       APInt SplatValue, SplatUndef;
       unsigned SplatBitSize;
       bool HasAnyUndefs;
       bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef,
                                              SplatBitSize, HasAnyUndefs);
       if (IsSplat) {
         // Undef bits can contribute to a possible optimisation if set, so
         // set them.
         SplatValue |= SplatUndef;
 
         // The splat value may be something like "0x00FFFFFF", which means 0 for
         // the first vector value and FF for the rest, repeating. We need a mask
         // that will apply equally to all members of the vector, so AND all the
         // lanes of the constant together.
         // Note: these two locals shadow the function-level VT and BitWidth.
         EVT VT = Vector->getValueType(0);
         unsigned BitWidth = VT.getScalarSizeInBits();
 
         // If the splat value has been compressed to a bitlength lower
         // than the size of the vector lane, we need to re-expand it to
         // the lane size.
         if (BitWidth > SplatBitSize)
           for (SplatValue = SplatValue.zextOrTrunc(BitWidth);
                SplatBitSize < BitWidth;
                SplatBitSize = SplatBitSize * 2)
             SplatValue |= SplatValue.shl(SplatBitSize);
 
         // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
         // multiple of 'BitWidth'. Otherwise, we could propagate a wrong value.
         if (SplatBitSize % BitWidth == 0) {
           Constant = APInt::getAllOnesValue(BitWidth);
           for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i)
             Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth);
         }
       }
     }
 
     // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is
     // actually legal and isn't going to get expanded, else this is a false
     // optimisation.
     bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
                                                     Load->getValueType(0),
                                                     Load->getMemoryVT());
 
     // Resize the constant to the same size as the original memory access before
     // extension. If it is still the AllOnesValue then this AND is completely
     // unneeded.
     Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits());
 
     // B: may the AND be dropped for this load's extension kind?  Always for
     // zero-extending and non-extending loads, only if a ZEXTLOAD is legal for
     // EXTLOAD, and never for any other extension kind.
     bool B;
     switch (Load->getExtensionType()) {
     default: B = false; break;
     case ISD::EXTLOAD: B = CanZextLoadProfitably; break;
     case ISD::ZEXTLOAD:
     case ISD::NON_EXTLOAD: B = true; break;
     }
 
     if (B && Constant.isAllOnesValue()) {
       // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
       // preserve semantics once we get rid of the AND.
       SDValue NewLoad(Load, 0);
 
       // Fold the AND away. NewLoad may get replaced immediately.
       CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
 
       if (Load->getExtensionType() == ISD::EXTLOAD) {
         NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
                               Load->getValueType(0), SDLoc(Load),
                               Load->getChain(), Load->getBasePtr(),
                               Load->getOffset(), Load->getMemoryVT(),
                               Load->getMemOperand());
         // Replace uses of the EXTLOAD with the new ZEXTLOAD.
         if (Load->getNumValues() == 3) {
           // PRE/POST_INC loads have 3 values.
           SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1),
                            NewLoad.getValue(2) };
           CombineTo(Load, To, 3, true);
         } else {
           CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
         }
       }
 
       return SDValue(N, 0); // Return N so it doesn't get rechecked!
     }
   }
 
   // fold (and (load x), 255) -> (zextload x, i8)
   // fold (and (extload x, i16), 255) -> (zextload x, i8)
   // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
   if (!VT.isVector() && N1C && (N0.getOpcode() == ISD::LOAD ||
                                 (N0.getOpcode() == ISD::ANY_EXTEND &&
                                  N0.getOperand(0).getOpcode() == ISD::LOAD))) {
     if (SDValue Res = ReduceLoadWidth(N)) {
       // LN0 is the load itself, looking through the any_extend if present.
       LoadSDNode *LN0 = N0->getOpcode() == ISD::ANY_EXTEND
         ? cast<LoadSDNode>(N0.getOperand(0)) : cast<LoadSDNode>(N0);
 
       AddToWorklist(N);
       CombineTo(LN0, Res, Res.getValue(1));
       return SDValue(N, 0);
     }
   }
 
   if (Level >= AfterLegalizeTypes) {
     // Attempt to propagate the AND back up to the leaves which, if they're
     // loads, can be combined to narrow loads and the AND node can be removed.
     // Perform after legalization so that extend nodes will already be
     // combined into the loads.
     if (BackwardsPropagateMask(N, DAG)) {
       return SDValue(N, 0);
     }
   }
 
   // Rules shared with visitSELECT().
   if (SDValue Combined = visitANDLike(N0, N1, N))
     return Combined;
 
   // Simplify: (and (op x...), (op y...))  -> (op (and x, y))
   if (N0.getOpcode() == N1.getOpcode())
     if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
       return Tmp;
 
   // Masking the negated extension of a boolean is just the zero-extended
   // boolean:
   // and (sub 0, zext(bool X)), 1 --> zext(bool X)
   // and (sub 0, sext(bool X)), 1 --> zext(bool X)
   //
   // Note: the SimplifyDemandedBits fold below can make an information-losing
   // transform, and then we have no way to find this better fold.
   if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
     if (isNullConstantOrNullSplatConstant(N0.getOperand(0))) {
       SDValue SubRHS = N0.getOperand(1);
       if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
           SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
         return SubRHS;
       if (SubRHS.getOpcode() == ISD::SIGN_EXTEND &&
           SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
         return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0));
     }
   }
 
   // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
   // fold (and (sra)) -> (and (srl)) when possible.
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   // fold (zext_inreg (extload x)) -> (zextload x)
   if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     EVT MemVT = LN0->getMemoryVT();
     // If we zero all the possible extended bits, then we can turn this into
     // a zextload if we are running before legalize or the operation is legal.
     unsigned BitWidth = N1.getScalarValueSizeInBits();
     if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
                            BitWidth - MemVT.getScalarSizeInBits())) &&
         ((!LegalOperations && !LN0->isVolatile()) ||
          TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
                                        LN0->getChain(), LN0->getBasePtr(),
                                        MemVT, LN0->getMemOperand());
       AddToWorklist(N);
       CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
   // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
   if (ISD::isSEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
       N0.hasOneUse()) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     EVT MemVT = LN0->getMemoryVT();
     // If we zero all the possible extended bits, then we can turn this into
     // a zextload if we are running before legalize or the operation is legal.
     unsigned BitWidth = N1.getScalarValueSizeInBits();
     if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
                            BitWidth - MemVT.getScalarSizeInBits())) &&
         ((!LegalOperations && !LN0->isVolatile()) ||
          TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
                                        LN0->getChain(), LN0->getBasePtr(),
                                        MemVT, LN0->getMemOperand());
       AddToWorklist(N);
       CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
   // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
   if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
     if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
                                            N0.getOperand(1), false))
       return BSwap;
   }
 
   // Last resort: let the target trade the mask for a pair of shifts.
   if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N))
     return Shifts;
 
   return SDValue();
 }
 
 /// Match (a >> 8) | (a << 8) as (bswap a) >> 16.
 ///
 /// More precisely the result is (bswap a) shifted right by (bit width - 16);
 /// for a 16-bit type no shift is emitted. The match is only attempted when
 /// LegalOperations is set, for i16/i32/i64, and when BSWAP is legal or custom
 /// for the type.
 ///
 /// \param N              Node supplying the result type and debug location.
 /// \param N0             One of the two values being combined (either order).
 /// \param N1             The other value being combined.
 /// \param DemandHighBits If true and the type is wider than 16 bits, also
 ///                       require that the left shift was masked and that the
 ///                       bits above the low halfword are masked or known zero.
 /// \returns The bswap-based replacement, or an empty SDValue if the pattern
 ///          does not match.
 SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
                                         bool DemandHighBits) {
   if (!LegalOperations)
     return SDValue();
 
   EVT VT = N->getValueType(0);
   if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16)
     return SDValue();
   if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
     return SDValue();
 
   // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff)
   // LookPassAnd0/1 record that a mask has been looked through on the shl/srl
   // side respectively.
   bool LookPassAnd0 = false;
   bool LookPassAnd1 = false;
   // Canonicalize so that a masked shl ends up in N0 and a masked srl in N1.
   if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
       std::swap(N0, N1);
   if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL)
       std::swap(N0, N1);
   if (N0.getOpcode() == ISD::AND) {
     if (!N0.getNode()->hasOneUse())
       return SDValue();
     ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
     // Also handle 0xffff since the LHS is guaranteed to have zeros there.
     // This is needed for X86.
     if (!N01C || (N01C->getZExtValue() != 0xFF00 &&
                   N01C->getZExtValue() != 0xFFFF))
       return SDValue();
     N0 = N0.getOperand(0);
     LookPassAnd0 = true;
   }
 
   if (N1.getOpcode() == ISD::AND) {
     if (!N1.getNode()->hasOneUse())
       return SDValue();
     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
     if (!N11C || N11C->getZExtValue() != 0xFF)
       return SDValue();
     N1 = N1.getOperand(0);
     LookPassAnd1 = true;
   }
 
   // With any outer masks stripped, N0 must be the shl and N1 the srl, each
   // with a single use.
   if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
     std::swap(N0, N1);
   if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
     return SDValue();
   if (!N0.getNode()->hasOneUse() || !N1.getNode()->hasOneUse())
     return SDValue();
 
   // Both shifts must be by the constant 8.
   ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
   if (!N01C || !N11C)
     return SDValue();
   if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8)
     return SDValue();
 
   // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
   SDValue N00 = N0->getOperand(0);
   if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
     if (!N00.getNode()->hasOneUse())
       return SDValue();
     ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
     if (!N001C || N001C->getZExtValue() != 0xFF)
       return SDValue();
     N00 = N00.getOperand(0);
     LookPassAnd0 = true;
   }
 
   SDValue N10 = N1->getOperand(0);
   if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
     if (!N10.getNode()->hasOneUse())
       return SDValue();
     ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
     // Also allow 0xFFFF since the bits will be shifted out. This is needed
     // for X86.
     if (!N101C || (N101C->getZExtValue() != 0xFF00 &&
                    N101C->getZExtValue() != 0xFFFF))
       return SDValue();
     N10 = N10.getOperand(0);
     LookPassAnd1 = true;
   }
 
   // Both shifts must operate on the same underlying value.
   if (N00 != N10)
     return SDValue();
 
   // Make sure everything beyond the low halfword gets set to zero since the SRL
   // 16 will clear the top bits.
   unsigned OpSizeInBits = VT.getSizeInBits();
   if (DemandHighBits && OpSizeInBits > 16) {
     // If the left-shift isn't masked out then the only way this is a bswap is
     // if all bits beyond the low 8 are 0. In that case the entire pattern
     // reduces to a left shift anyway: leave it for other parts of the combiner.
     if (!LookPassAnd0)
       return SDValue();
 
     // However, if the right shift isn't masked out then it might be because
     // it's not needed. See if we can spot that too.
     if (!LookPassAnd1 &&
         !DAG.MaskedValueIsZero(
             N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16)))
       return SDValue();
   }
 
   // Emit the bswap; for types wider than 16 bits the swapped halfword lands in
   // the top bits and is shifted back down.
   SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
   if (OpSizeInBits > 16) {
     SDLoc DL(N);
     Res = DAG.getNode(ISD::SRL, DL, VT, Res,
                       DAG.getConstant(OpSizeInBits - 16, DL,
                                       getShiftAmountTy(VT)));
   }
   return Res;
 }
 
 /// Return true if the specified node is an element that makes up a 32-bit
 /// packed halfword byteswap, i.e. one of the four OR'ed terms of
 ///   ((x & 0x000000ff) << 8) |
 ///   ((x & 0x0000ff00) >> 8) |
 ///   ((x & 0x00ff0000) << 8) |
 ///   ((x & 0xff000000) >> 8)
 /// Both the mask-then-shift spelling shown above and the shift-then-mask
 /// spelling, e.g. ((x << 8) & 0xff00), are accepted.
 ///
 /// \param N     Candidate term. It must have exactly one use.
 /// \param Parts Four slots indexed by the byte position of the term's AND mask
 ///              (0xFF -> 0, 0xFF00 or 0xFFFF -> 1, 0xFF0000 -> 2,
 ///              0xFF000000 -> 3). On success the matching slot receives the
 ///              node that produces x; a slot that is already filled makes the
 ///              match fail.
 /// \returns true if \p N matched and its slot in \p Parts was filled in.
 static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
   if (!N.getNode()->hasOneUse())
     return false;
 
   unsigned Opc = N.getOpcode();
   if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL)
     return false;
 
   SDValue N0 = N.getOperand(0);
   unsigned Opc0 = N0.getOpcode();
   if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL)
     return false;
 
   // Locate the AND mask: it sits on N itself for shift-then-mask, and on N's
   // first operand for mask-then-shift.
   ConstantSDNode *N1C = nullptr;
   if (Opc == ISD::AND)
     N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
   else if (Opc0 == ISD::AND)
     N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
   if (!N1C)
     return false;
 
   unsigned MaskByteOffset;
   switch (N1C->getZExtValue()) {
   default:
     return false;
   case 0xFF:       MaskByteOffset = 0; break;
   case 0xFF00:     MaskByteOffset = 1; break;
   case 0xFFFF:
     // In case demanded bits didn't clear the bits that will be shifted out.
     // This is needed for X86.
     if (Opc == ISD::SRL || (Opc == ISD::AND && Opc0 == ISD::SHL)) {
       MaskByteOffset = 1;
       break;
     }
     return false;
   case 0xFF0000:   MaskByteOffset = 2; break;
   case 0xFF000000: MaskByteOffset = 3; break;
   }
 
   // Masks at byte 0 or 2 pair with one shift direction, masks at byte 1 or 3
   // with the other; which one depends on whether the mask is applied before
   // or after the shift.
   const bool EvenByte = MaskByteOffset == 0 || MaskByteOffset == 2;
   SDValue ShiftAmt;
   if (Opc == ISD::AND) {
     // Shift-then-mask:
     //   (x >> 8) & 0xff,   (x >> 8) & 0xff0000
     //   (x << 8) & 0xff00, (x << 8) & 0xff000000
     const unsigned WantedInnerShift = EvenByte ? ISD::SRL : ISD::SHL;
     if (Opc0 != WantedInnerShift)
       return false;
     ShiftAmt = N0.getOperand(1);
   } else {
     // Mask-then-shift (N0 is known to be the AND here, otherwise no mask
     // constant would have been found above):
     //   (x & 0xff) << 8,   (x & 0xff0000) << 8
     //   (x & 0xff00) >> 8, (x & 0xff000000) >> 8
     const unsigned WantedOuterShift = EvenByte ? ISD::SHL : ISD::SRL;
     if (Opc != WantedOuterShift)
       return false;
     ShiftAmt = N.getOperand(1);
   }
 
   // Either way the shift has to be by exactly one byte.
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ShiftAmt);
   if (!C || C->getZExtValue() != 8)
     return false;
 
   if (Parts[MaskByteOffset])
     return false;
 
   // Parts can only record a node, not which of its results is used, and
   // MatchBSwapHWord rebuilds the source as result #0 of that node. Refuse a
   // source that is any other result of a multi-result node rather than let
   // it be silently replaced by result #0.
   SDValue Src = N0.getOperand(0);
   if (Src.getResNo() != 0)
     return false;
 
   Parts[MaskByteOffset] = Src.getNode();
   return true;
 }
 
 /// Match a 32-bit packed halfword bswap. That is
 /// ((x & 0x000000ff) << 8) |
 /// ((x & 0x0000ff00) >> 8) |
 /// ((x & 0x00ff0000) << 8) |
 /// ((x & 0xff000000) >> 8)
 /// => (rotl (bswap x), 16)
 ///
 /// \param N  The OR node being combined; supplies the value type and location.
 /// \param N0 First operand of \p N; must itself be an OR.
 /// \param N1 Second operand of \p N.
 /// \returns the replacement value, or an empty SDValue if the pattern does not
 /// match, the type is not i32, BSWAP is not legal/custom for it, or
 /// LegalOperations is not set.
 SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
   if (!LegalOperations)
     return SDValue();
 
   EVT VT = N->getValueType(0);
   if (VT != MVT::i32 || !TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
     return SDValue();
 
   // Two tree shapes are recognized:
   //   balanced:    (or (or (and), (and)), (or (and), (and)))
   //   left-leaning: (or (or (or (and), (and)), (and)), (and))
   // Both need an OR on the left.
   if (N0.getOpcode() != ISD::OR)
     return SDValue();
   SDValue LHS0 = N0.getOperand(0);
   SDValue LHS1 = N0.getOperand(1);
   SDNode *Parts[4] = {};
 
   const bool Balanced = N1.getOpcode() == ISD::OR &&
                         LHS0.getNumOperands() == 2 &&
                         LHS1.getNumOperands() == 2;
   if (Balanced) {
     // Leaves are visited left to right; the first failure aborts the match.
     if (!isBSwapHWordElement(LHS0, Parts) ||
         !isBSwapHWordElement(LHS1, Parts) ||
         !isBSwapHWordElement(N1.getOperand(0), Parts) ||
         !isBSwapHWordElement(N1.getOperand(1), Parts))
       return SDValue();
   } else {
     // The two outer leaves are checked first, then the innermost OR is
     // opened up for the remaining two.
     if (!isBSwapHWordElement(N1, Parts) ||
         !isBSwapHWordElement(LHS1, Parts) ||
         LHS0.getOpcode() != ISD::OR ||
         !isBSwapHWordElement(LHS0.getOperand(0), Parts) ||
         !isBSwapHWordElement(LHS0.getOperand(1), Parts))
       return SDValue();
   }
 
   // All four terms have to read the same source node.
   for (SDNode *Part : Parts)
     if (Part != Parts[0])
       return SDValue();
 
   // NOTE(review): only the node is recorded in Parts, so the source is taken
   // to be its result #0 here.
   SDLoc DL(N);
   SDValue Swapped = DAG.getNode(ISD::BSWAP, DL, VT, SDValue(Parts[0], 0));
 
   // The byte-swapped value still needs its halfwords exchanged: rotate by 16
   // in whichever direction is available, else spell it (x << 16) | (x >> 16).
   SDValue Sixteen = DAG.getConstant(16, DL, getShiftAmountTy(VT));
   if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
     return DAG.getNode(ISD::ROTL, DL, VT, Swapped, Sixteen);
   if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
     return DAG.getNode(ISD::ROTR, DL, VT, Swapped, Sixteen);
 
   SDValue HiHalf = DAG.getNode(ISD::SHL, DL, VT, Swapped, Sixteen);
   SDValue LoHalf = DAG.getNode(ISD::SRL, DL, VT, Swapped, Sixteen);
   return DAG.getNode(ISD::OR, DL, VT, HiHalf, LoHalf);
 }
 
 /// This contains all DAGCombine rules which reduce two values combined by
 /// an Or operation to a single value \see visitANDLike().
 ///
 /// \param N0 First value being OR'ed.
 /// \param N1 Second value being OR'ed; its type is used as the result type.
 /// \param N  Node supplying the debug location for the replacement.
 /// \returns the combined value, or an empty SDValue if no rule applies.
 SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
   SDLoc DL(N);
   EVT VT = N1.getValueType();
 
   // fold (or x, undef) -> -1
   if (!LegalOperations) {
     if (N0.isUndef() || N1.isUndef())
       return DAG.getAllOnesConstant(DL, VT);
   }
 
   if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
     return V;
 
   // Both remaining rules need (or (and ...), (and ...)) with at least one of
   // the ANDs single-use, so that the number of computations does not grow.
   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
     return SDValue();
   if (!N0.getNode()->hasOneUse() && !N1.getNode()->hasOneUse())
     return SDValue();
 
   SDValue X = N0.getOperand(0);
   SDValue XMask = N0.getOperand(1);
   SDValue Y = N1.getOperand(0);
   SDValue YMask = N1.getOperand(1);
 
   // (or (and X, C1), (and Y, C2))  -> (and (or X, Y), C3) if possible.
   // This is only valid when the bits of X selected by C2 but not by C1 are
   // already known to be zero, and likewise for Y with the masks exchanged.
   const ConstantSDNode *XMaskC = getAsNonOpaqueConstant(XMask);
   const ConstantSDNode *YMaskC =
       XMaskC ? getAsNonOpaqueConstant(YMask) : nullptr;
   if (XMaskC && YMaskC) {
     const APInt &LHSMask = XMaskC->getAPIntValue();
     const APInt &RHSMask = YMaskC->getAPIntValue();
 
     if (DAG.MaskedValueIsZero(X, RHSMask & ~LHSMask) &&
         DAG.MaskedValueIsZero(Y, LHSMask & ~RHSMask)) {
       SDValue Merged = DAG.getNode(ISD::OR, SDLoc(N0), VT, X, Y);
       return DAG.getNode(ISD::AND, DL, VT, Merged,
                          DAG.getConstant(LHSMask | RHSMask, DL, VT));
     }
   }
 
   // (or (and X, M), (and X, N)) -> (and X, (or M, N))
   if (X == Y) {
     SDValue MergedMask = DAG.getNode(ISD::OR, SDLoc(N0), VT, XMask, YMask);
     return DAG.getNode(ISD::AND, DL, VT, X, MergedMask);
   }
 
   return SDValue();
 }
 
 /// Combine an ISD::OR node.
 ///
 /// The simplifications below are tried strictly in the order written and the
 /// first one that produces a value wins: identical operands, vector-only
 /// folds (all-zeros / all-ones build vectors, merging two shuffles that each
 /// blend with a zero vector), constant folding and constant canonicalization,
 /// folding into a select, the shared visitORLike() rules, halfword byte-swap
 /// matching, reassociation, (or (and X, c1), c2) canonicalization, merging
 /// operands with the same opcode, rotate matching, load combining and finally
 /// demanded-bits simplification.
 ///
 /// \param N The OR node to combine.
 /// \returns the replacement value; SDValue(N, 0) when SimplifyDemandedBits
 /// reports a change; or an empty SDValue if nothing applied.
 SDValue DAGCombiner::visitOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N1.getValueType();
 
   // x | x --> x
   if (N0 == N1)
     return N0;
 
   // fold vector ops
   if (VT.isVector()) {
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
     // fold (or x, 0) -> x, vector edition
     if (ISD::isBuildVectorAllZeros(N0.getNode()))
       return N1;
     if (ISD::isBuildVectorAllZeros(N1.getNode()))
       return N0;
 
     // fold (or x, -1) -> -1, vector edition
     if (ISD::isBuildVectorAllOnes(N0.getNode()))
       // do not return N0, because undef node may exist in N0
       return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType());
     if (ISD::isBuildVectorAllOnes(N1.getNode()))
       // do not return N1, because undef node may exist in N1
       return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());
 
     // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
     // Do this only if the resulting shuffle is legal.
     if (isa<ShuffleVectorSDNode>(N0) &&
         isa<ShuffleVectorSDNode>(N1) &&
         // Avoid folding a node with illegal type.
         TLI.isTypeLegal(VT)) {
       // Record which input of each shuffle is an all-zeros build vector.
       bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
       bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
       bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
       bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
       // Ensure both shuffles have a zero input.
       // (Exactly one zero input each: "!=" on the two flags is an XOR.)
       if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
         assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
         assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
         const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
         const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
         bool CanFold = true;
         int NumElts = VT.getVectorNumElements();
         SmallVector<int, 4> Mask(NumElts);
 
         // Build the combined mask one lane at a time.
         for (int i = 0; i != NumElts; ++i) {
           int M0 = SV0->getMaskElt(i);
           int M1 = SV1->getMaskElt(i);
 
           // Determine if either index is pointing to a zero vector.
           // An index below NumElts selects the shuffle's first input, so the
           // lane is zero when "first input is zero" agrees with "index selects
           // the first input". Undef lanes (negative index) count as zero too.
           bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
           bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));
 
           // If one element is zero and the otherside is undef, keep undef.
           // This also handles the case that both are undef.
           if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) {
             Mask[i] = -1;
             continue;
           }
 
           // Make sure only one of the elements is zero.
           if (M0Zero == M1Zero) {
             CanFold = false;
             break;
           }
 
           assert((M0 >= 0 || M1 >= 0) && "Undef index!");
 
           // We have a zero and non-zero element. If the non-zero came from
           // SV0 make the index a LHS index. If it came from SV1, make it
           // a RHS index. We need to mod by NumElts because we don't care
           // which operand it came from in the original shuffles.
           Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
         }
 
         if (CanFold) {
           // The new shuffle reads the non-zero input of each original shuffle.
           SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
           SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);
 
           // If the target rejects the mask as built, retry with the two
           // inputs (and the mask) commuted before giving up.
           bool LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
           if (!LegalMask) {
             std::swap(NewLHS, NewRHS);
             ShuffleVectorSDNode::commuteMask(Mask);
             LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
           }
 
           if (LegalMask)
             return DAG.getVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS, Mask);
         }
       }
     }
   }
 
   // fold (or c1, c2) -> c1|c2
   // Opaque constants are excluded on both sides.
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && N1C && !N1C->isOpaque())
     return DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, N0C, N1C);
   // canonicalize constant to RHS
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
     return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
   // fold (or x, 0) -> x
   if (isNullConstant(N1))
     return N0;
   // fold (or x, -1) -> -1
   if (isAllOnesConstant(N1))
     return N1;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // fold (or x, c) -> c iff (x & ~c) == 0
   if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
     return N1;
 
   // Rules shared with other callers of visitORLike().
   if (SDValue Combined = visitORLike(N0, N1, N))
     return Combined;
 
   // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
   if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
     return BSwap;
   if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
     return BSwap;
 
   // reassociate or
   if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1))
     return ROR;
 
   // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
   // iff (c1 & c2) != 0.
   auto MatchIntersect = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
     return LHS->getAPIntValue().intersects(RHS->getAPIntValue());
   };
   if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
       ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect)) {
     // COR is the folded constant c1|c2; the rewrite is skipped if the
     // constants cannot be folded.
     if (SDValue COR = DAG.FoldConstantArithmetic(
             ISD::OR, SDLoc(N1), VT, N1.getNode(), N0.getOperand(1).getNode())) {
       SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
       // Queue the new inner OR so that it gets visited as well.
       AddToWorklist(IOR.getNode());
       return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
     }
   }
 
   // Simplify: (or (op x...), (op y...))  -> (op (or x, y))
   if (N0.getOpcode() == N1.getOpcode())
     if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
       return Tmp;
 
   // See if this is some rotate idiom.
   if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
     return SDValue(Rot, 0);
 
   // See if the OR assembles a wider value out of several loads.
   if (SDValue Load = MatchLoadCombine(N))
     return Load;
 
   // Simplify the operands using demanded-bits information.
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   return SDValue();
 }
 
 /// Look through an AND with a constant (scalar or build-vector) right-hand
 /// side.
 ///
 /// \param Op   Value to inspect.
 /// \param Mask Set to the constant operand when \p Op is such an AND; left
 ///             untouched otherwise.
 /// \returns the value being masked if \p Op is an AND with a constant mask,
 /// otherwise \p Op itself.
 static SDValue stripConstantMask(SelectionDAG &DAG, SDValue Op, SDValue &Mask) {
   if (Op.getOpcode() != ISD::AND)
     return Op;
 
   SDValue MaskOperand = Op.getOperand(1);
   if (!DAG.isConstantIntBuildVectorOrConstantInt(MaskOperand))
     return Op;
 
   Mask = MaskOperand;
   return Op.getOperand(0);
 }
 
 /// Match "(X shl/srl V1) & V2" where V2 may not be present.
 ///
 /// \param Op    Candidate rotate half.
 /// \param Shift Set to the shl/srl node on success.
 /// \param Mask  Set to V2 whenever \p Op is an AND with a constant operand,
 ///              even if what lies underneath turns out not to be a shift.
 /// \returns true if a shl or srl was found, possibly under a constant mask.
 static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift,
                             SDValue &Mask) {
   SDValue Unmasked = stripConstantMask(DAG, Op, Mask);
   switch (Unmasked.getOpcode()) {
   case ISD::SRL:
   case ISD::SHL:
     Shift = Unmasked;
     return true;
   default:
     return false;
   }
 }
 
 /// Helper function for visitOR to extract the needed side of a rotate idiom
 /// from a shl/srl/mul/udiv.  This is meant to handle cases where
 /// InstCombine merged some outside op with one of the shifts from
 /// the rotate pattern.
 /// \returns An empty \c SDValue if the needed shift couldn't be extracted.
 /// Otherwise, returns an expansion of \p ExtractFrom based on the following
 /// patterns:
 ///
 ///   (or (mul v c0) (shrl (mul v c1) c2)):
 ///     expands (mul v c0) -> (shl (mul v c1) c3)
 ///
 ///   (or (udiv v c0) (shl (udiv v c1) c2)):
 ///     expands (udiv v c0) -> (shrl (udiv v c1) c3)
 ///
 ///   (or (shl v c0) (shrl (shl v c1) c2)):
 ///     expands (shl v c0) -> (shl (shl v c1) c3)
 ///
 ///   (or (shrl v c0) (shl (shrl v c1) c2)):
 ///     expands (shrl v c0) -> (shrl (shrl v c1) c3)
 ///
 /// Such that in all cases, c3+c2==bitwidth(op v c1).
 ///
 /// \param OppShift    The shl/srl already matched on the opposite side of the
 ///                    OR, i.e. (shiftl/r (op0 v c1) c2).
 /// \param ExtractFrom The other OR operand, i.e. (op0 v c0), possibly wrapped
 ///                    in an AND with a constant.
 /// \param Mask        Receives the constant AND mask stripped from
 ///                    \p ExtractFrom, if there is one. This happens before any
 ///                    other check, so it is set even when extraction fails.
 /// \param DL          Location for the nodes that get created.
 static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
                                      SDValue ExtractFrom, SDValue &Mask,
                                      const SDLoc &DL) {
   assert(OppShift && ExtractFrom && "Empty SDValue");
   assert(
       (OppShift.getOpcode() == ISD::SHL || OppShift.getOpcode() == ISD::SRL) &&
       "Existing shift must be valid as a rotate half");
 
   ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask);
   // Preconditions:
   //    (or (op0 v c0) (shiftl/r (op0 v c1) c2))
   //
   // Find opcode of the needed shift to be extracted from (op0 v c0).
   unsigned Opcode = ISD::DELETED_NODE;
   bool IsMulOrDiv = false;
   // Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift
   // opcode or its arithmetic (mul or udiv) variant.
   auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) {
     IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant;
     if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift)
       return false;
     Opcode = NeededShift;
     return true;
   };
   // op0 must be either the needed shift opcode or the mul/udiv equivalent
   // that the needed shift can be extracted from.
   if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) &&
       (OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV)))
     return SDValue();
 
   // op0 must be the same opcode on both sides, have the same LHS argument,
   // and produce the same value type.
   SDValue OppShiftLHS = OppShift.getOperand(0);
   EVT ShiftedVT = OppShiftLHS.getValueType();
   if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() ||
       OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) ||
       ShiftedVT != ExtractFrom.getValueType())
     return SDValue();
 
   // Amount of the existing shift (c2).
   ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));
   // Constant mul/udiv/shift amount from the RHS of the shift's LHS op (c1).
   ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1));
   // Constant mul/udiv/shift amount from the RHS of the ExtractFrom op (c0).
   ConstantSDNode *ExtractFromCst =
       isConstOrConstSplat(ExtractFrom.getOperand(1));
   // TODO: We should be able to handle non-uniform constant vectors for these values
   // Check that we have constant, non-zero values.
   if (!OppShiftCst || !OppShiftCst->getAPIntValue() ||
       !OppLHSCst || !OppLHSCst->getAPIntValue() ||
       !ExtractFromCst || !ExtractFromCst->getAPIntValue())
     return SDValue();
 
   // Compute the shift amount we need to extract to complete the rotate.
   const unsigned VTWidth = ShiftedVT.getScalarSizeInBits();
   // An existing shift by more than the element width can never be half of a
   // rotate. Test that on the unsigned amount *before* subtracting: the
   // difference below is computed at the bit width of the shift-amount
   // constant, where an out-of-range amount can wrap around to a value that
   // does not look negative and would then be used as a bit index.
   if (OppShiftCst->getAPIntValue().ugt(VTWidth))
     return SDValue();
   APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue();
   // Normalize the bitwidth of the two mul/udiv/shift constant operands.
   APInt ExtractFromAmt = ExtractFromCst->getAPIntValue();
   APInt OppLHSAmt = OppLHSCst->getAPIntValue();
   zeroExtendToMatch(ExtractFromAmt, OppLHSAmt);
 
   // Now try extract the needed shift from the ExtractFrom op and see if the
   // result matches up with the existing shift's LHS op.
   if (IsMulOrDiv) {
     // Op to extract from is a mul or udiv by a constant.
     // Check:
     //     c0 / (1 << (bitwidth(op0 v c0) - c2)) == c1
     //     c0 % (1 << (bitwidth(op0 v c0) - c2)) == 0
     const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(),
                                                  NeededShiftAmt.getZExtValue());
     APInt ResultAmt;
     APInt Rem;
     APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem);
     if (Rem != 0 || ResultAmt != OppLHSAmt)
       return SDValue();
   } else {
     // Op to extract from is a shift by a constant.
     // Check:
     //      c0 - (bitwidth(op0 v c0) - c2) == c1
     if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc(
                                           ExtractFromAmt.getBitWidth()))
       return SDValue();
   }
 
   // Return the expanded shift op that should allow a rotate to be formed.
   EVT ShiftVT = OppShift.getOperand(1).getValueType();
   EVT ResVT = ExtractFrom.getValueType();
   SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT);
   return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode);
 }
 
 // Return true if we can prove that, whenever Neg and Pos are both in the
 // range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos).  When that holds,
 // for two opposing shifts shift1 and shift2 of an EltSize-bit value X,
 //
 //     (or (shift1 X, Neg), (shift2 X, Pos))
 //
 // is a rotate in direction shift2 by Pos or (equivalently) a rotate in
 // direction shift1 by Neg.  Restricting both amounts to [0, EltSize) means
 // that only shift amounts with defined behavior have to be considered.
 static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
                            SelectionDAG &DAG) {
   // Two proof obligations are used, depending on the shape of Neg.
   //
   // For a power-of-2 EltSize:
   //
   //  (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
   //  (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
   //
   // Hence, when EltSize is a power of 2 and Neg is (and Neg', EltSize-1), it
   // is enough to establish the stronger statement
   //
   //     Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1)    [A]
   //
   // for all Neg and Pos.  Because Neg & (EltSize - 1) == Neg' & (EltSize - 1),
   // Neg is simply replaced by Neg' from here on.
   //
   // In every other situation the even stronger statement
   //
   //     Neg == EltSize - Pos                                    [B]
   //
   // is required for all Neg and Pos.  With [B] the (or ...) has undefined
   // behavior when Pos == 0, since Neg is then EltSize.
   //
   // [A] would be usable for any power-of-2 EltSize, but all it adds are
   // uninteresting patterns in which Neg and Pos are never in range together.
   // For EltSize == 32 it would admit a Neg of the form (sub 64, Pos) next to
   // (sub 32, Pos), yet
   //
   //     (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
   //
   // always has undefined behavior for 32-bit X.
   //
   // MaskLoBits != 0 selects [A] (Mask == EltSize - 1, i.e. the low MaskLoBits
   // bits); MaskLoBits == 0 selects [B] (Mask is all-ones).
   unsigned MaskLoBits = 0;
   if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
     if (ConstantSDNode *NegMaskC = isConstOrConstSplat(Neg.getOperand(1))) {
       KnownBits NegKnown;
       DAG.computeKnownBits(Neg.getOperand(0), NegKnown);
       const unsigned Bits = Log2_64(EltSize);
       const APInt &NegMask = NegMaskC->getAPIntValue();
       // The mask may set nothing above the low Bits bits, and each of those
       // low bits must either be kept by the mask or be known zero already.
       if (NegMask.getActiveBits() <= Bits &&
           (NegMask | NegKnown.Zero).countTrailingOnes() >= Bits) {
         Neg = Neg.getOperand(0);
         MaskLoBits = Bits;
       }
     }
   }
 
   // Neg must now look like (sub NegC, NegOp1) with NegC constant.
   if (Neg.getOpcode() != ISD::SUB)
     return false;
   ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
   if (!NegC)
     return false;
   SDValue NegOp1 = Neg.getOperand(1);
 
   // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
   // Pos'.  The truncation is redundant for the purpose of the equality.
   if (MaskLoBits && Pos.getOpcode() == ISD::AND) {
     if (ConstantSDNode *PosMaskC = isConstOrConstSplat(Pos.getOperand(1))) {
       KnownBits PosKnown;
       DAG.computeKnownBits(Pos.getOperand(0), PosKnown);
       const APInt &PosMask = PosMaskC->getAPIntValue();
       if (PosMask.getActiveBits() <= MaskLoBits &&
           (PosMask | PosKnown.Zero).countTrailingOnes() >= MaskLoBits)
         Pos = Pos.getOperand(0);
     }
   }
 
   // What remains to be shown is
   //
   //     (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
   //
   // "x & Mask" is a truncation and distributes through addition and
   // subtraction, so in both supported shapes of Pos this collapses to
   //
   //              EltSize & Mask == Width & Mask
   //
   // for a constant Width determined below.
   APInt Width;
   if (Pos == NegOp1) {
     // Pos is NegOp1 itself:  EltSize & Mask == NegC & Mask.
     Width = NegC->getAPIntValue();
   } else {
     // Pos is (add NegOp1, PosC):
     //
     //     (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask
     //                NegC & Mask == (EltSize - PosC) & Mask
     //             EltSize & Mask == (NegC + PosC) & Mask
     if (Pos.getOpcode() != ISD::ADD || Pos.getOperand(0) != NegOp1)
       return false;
     ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1));
     if (!PosC)
       return false;
     Width = PosC->getAPIntValue() + NegC->getAPIntValue();
   }
 
   // Now we just need to check that EltSize & Mask == Width & Mask.
   if (!MaskLoBits)
     return Width == EltSize;
   // EltSize & Mask is 0 since Mask is EltSize - 1.
   return Width.getLoBits(MaskLoBits) == 0;
 }
 
 // A subroutine of MatchRotate used once we have found an OR of two opposite
 // shifts of Shifted.  If Neg == <operand size> - Pos then the OR reduces
 // to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the
 // former being preferred if supported.  InnerPos and InnerNeg are Pos and
 // Neg with outer conversions stripped away.
 //
 // Returns the rotate node, or nullptr if the relation between the two shift
 // amounts could not be proven.
 SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
                                        SDValue Neg, SDValue InnerPos,
                                        SDValue InnerNeg, unsigned PosOpcode,
                                        unsigned NegOpcode, const SDLoc &DL) {
   // fold (or (shl x, (*ext y)),
   //          (srl x, (*ext (sub 32, y)))) ->
   //   (rotl x, y) or (rotr x, (sub 32, y))
   //
   // fold (or (shl x, (*ext (sub 32, y))),
   //          (srl x, (*ext y))) ->
   //   (rotr x, y) or (rotl x, (sub 32, y))
   EVT VT = Shifted.getValueType();
   if (!matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG))
     return nullptr;
 
   // Prefer the rotate that consumes Pos; otherwise use the opposite
   // direction with Neg.
   if (TLI.isOperationLegalOrCustom(PosOpcode, VT))
     return DAG.getNode(PosOpcode, DL, VT, Shifted, Pos).getNode();
   return DAG.getNode(NegOpcode, DL, VT, Shifted, Neg).getNode();
 }
 
 // MatchRotate - Handle an 'or' of two operands.  If this is one of the many
 // idioms for rotate, and if the target supports rotation instructions, generate
 // a rot[lr].
 //
 // LHS and RHS are the two operands of the OR and DL is the location used for
 // the nodes that get created.  Returns the node computing the rotate (wrapped
 // in a TRUNCATE or an AND where the matched idiom needs one), or nullptr if
 // no rotate idiom was recognized.
 SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
   // Must be a legal type.  Expanded 'n promoted things won't work with rotates.
   EVT VT = LHS.getValueType();
   if (!TLI.isTypeLegal(VT)) return nullptr;
 
   // The target must have at least one rotate flavor.
   bool HasROTL = hasOperation(ISD::ROTL, VT);
   bool HasROTR = hasOperation(ISD::ROTR, VT);
   if (!HasROTL && !HasROTR) return nullptr;
 
   // Check for truncated rotate.
   // If both operands truncate values of one common type, try to match the
   // rotate on the untruncated values and truncate its result instead.
   if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
       LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
     assert(LHS.getValueType() == RHS.getValueType());
     if (SDNode *Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
       return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(),
                          SDValue(Rot, 0)).getNode();
     }
   }
 
   // Match "(X shl/srl V1) & V2" where V2 may not be present.
   // The boolean results are not needed: an unmatched side simply leaves its
   // Shift value empty.
   SDValue LHSShift;   // The shift.
   SDValue LHSMask;    // AND value if any.
   matchRotateHalf(DAG, LHS, LHSShift, LHSMask);
 
   SDValue RHSShift;   // The shift.
   SDValue RHSMask;    // AND value if any.
   matchRotateHalf(DAG, RHS, RHSShift, RHSMask);
 
   // If neither side matched a rotate half, bail
   if (!LHSShift && !RHSShift)
     return nullptr;
 
   // InstCombine may have combined a constant shl, srl, mul, or udiv with one
   // side of the rotate, so try to handle that here. In all cases we need to
   // pass the matched shift from the opposite side to compute the opcode and
   // needed shift amount to extract.  We still want to do this if both sides
   // matched a rotate half because one half may be a potential overshift that
   // can be broken down (ie if InstCombine merged two shl or srl ops into a
   // single one).
 
   // Have LHS side of the rotate, try to extract the needed shift from the RHS.
   if (LHSShift)
     if (SDValue NewRHSShift =
             extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL))
       RHSShift = NewRHSShift;
   // Have RHS side of the rotate, try to extract the needed shift from the LHS.
   // (RHSShift may be the shift that was just extracted above.)
   if (RHSShift)
     if (SDValue NewLHSShift =
             extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL))
       LHSShift = NewLHSShift;
 
   // If a side is still missing, nothing else we can do.
   if (!RHSShift || !LHSShift)
     return nullptr;
 
   // At this point we've matched or extracted a shift op on each side.
 
   if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
     return nullptr;   // Not shifting the same value.
 
   if (LHSShift.getOpcode() == RHSShift.getOpcode())
     return nullptr;   // Shifts must disagree.
 
   // Canonicalize shl to left side in a shl/srl pair.
   // From here on the "LHS" names refer to the shl and the "RHS" names to the
   // srl.
   if (RHSShift.getOpcode() == ISD::SHL) {
     std::swap(LHS, RHS);
     std::swap(LHSShift, RHSShift);
     std::swap(LHSMask, RHSMask);
   }
 
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
   SDValue LHSShiftArg = LHSShift.getOperand(0);
   SDValue LHSShiftAmt = LHSShift.getOperand(1);
   SDValue RHSShiftArg = RHSShift.getOperand(0);
   SDValue RHSShiftAmt = RHSShift.getOperand(1);
 
   // Constant shift amounts that add up to the element size:
   // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
   // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
   auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
                                         ConstantSDNode *RHS) {
     return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
   };
   if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
     // ROTL is used whenever the target has it, ROTR otherwise.
     SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
                               LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);
 
     // If there is an AND of either shifted operand, apply it to the result.
     if (LHSMask.getNode() || RHSMask.getNode()) {
       SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
       SDValue Mask = AllOnes;
 
       // A mask on one half must not clear the bits contributed by the other
       // half, so the other half's bit positions (all-ones shifted the same
       // way as that half) are OR'ed into the mask before it is applied.
       if (LHSMask.getNode()) {
         SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt);
         Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
                            DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits));
       }
       if (RHSMask.getNode()) {
         SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt);
         Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
                            DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits));
       }
 
       Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask);
     }
 
     return Rot.getNode();
   }
 
   // If there is a mask here, and we have a variable shift, we can't be sure
   // that we're masking out the right stuff.
   if (LHSMask.getNode() || RHSMask.getNode())
     return nullptr;
 
   // If both shift amounts are sign/zero/any-extended or truncated, peel the
   // conversion off each of them.  The two conversions need not be the same
   // kind; if only one side is converted, nothing is peeled.
   SDValue LExtOp0 = LHSShiftAmt;
   SDValue RExtOp0 = RHSShiftAmt;
   if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
        LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
        LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
        LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
       (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
        RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
        RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
        RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
     LExtOp0 = LHSShiftAmt.getOperand(0);
     RExtOp0 = RHSShiftAmt.getOperand(0);
   }
 
   // Variable amounts: first try with the shl amount as Pos and the srl amount
   // as Neg (ROTL preferred), then with the roles exchanged (ROTR preferred).
   SDNode *TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
                                    LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL);
   if (TryL)
     return TryL;
 
   SDNode *TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
                                    RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL);
   if (TryR)
     return TryR;
 
   return nullptr;
 }
 
 namespace {
 
 /// Describes where a single byte in a load combine pattern comes from: it is
 /// either a constant zero or a particular byte of a value loaded from memory.
 struct ByteProvider {
   /// The load that produces the byte, or nullptr when the byte is a constant
   /// zero.
   LoadSDNode *Load = nullptr;
   /// Offset of the byte within the value produced by Load.
   unsigned ByteOffset = 0;
 
   ByteProvider() = default;
 
   /// Provider for the byte at \p ByteOffset of the value loaded by \p Load.
   static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
     return ByteProvider(Load, ByteOffset);
   }
 
   /// Provider for a byte that is known to be zero.
   static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
 
   bool isConstantZero() const { return Load == nullptr; }
   bool isMemory() const { return Load != nullptr; }
 
   /// Two providers are equal when they name the same load and the same
   /// offset.
   bool operator==(const ByteProvider &Other) const {
     return Load == Other.Load && ByteOffset == Other.ByteOffset;
   }
 
 private:
   // Instances other than the default one are created through the named
   // factories above.
   ByteProvider(LoadSDNode *Load, unsigned ByteOffset)
       : Load(Load), ByteOffset(ByteOffset) {}
 };
 
 } // end anonymous namespace
 
 /// Works out which source supplies byte number \p Index of the value \p Op by
 /// walking the expression rooted at \p Op. Byte 0 is the least significant
 /// byte. The result is either a constant zero byte or a specific byte of a
 /// load result; None is returned whenever the origin cannot be established.
 ///
 /// Every value other than the root (\p Root == true) must have exactly one
 /// use, otherwise None is returned. So when a provider is returned, the values
 /// contributing to that byte are not used outside of this expression.
 ///
 /// Since no part of the expression may have more than one use, the walk covers
 /// a tree rather than a DAG and never visits the same node twice.
 ///
 /// \p Depth is the current recursion depth; the walk gives up at depth 10.
 static const Optional<ByteProvider>
 calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
                       bool Root = false) {
   // Bound the recursion. A typical i64 by i8 pattern needs up to 8 nested
   // calls.
   if (Depth == 10)
     return None;
 
   // Interior values must feed this expression only; the root is exempt.
   if (!(Root || Op.hasOneUse()))
     return None;
 
   assert(Op.getValueType().isScalarInteger() && "can't handle other types");
   const unsigned NumBits = Op.getValueSizeInBits();
   if (NumBits % 8 != 0)
     return None;
   const unsigned NumBytes = NumBits / 8;
   assert(Index < NumBytes && "invalid index requested");
   (void)NumBytes;
 
   const unsigned Opcode = Op.getOpcode();
   switch (Opcode) {
   default:
     break;
 
   case ISD::OR: {
     Optional<ByteProvider> FromLHS =
         calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
     if (!FromLHS)
       return None;
     Optional<ByteProvider> FromRHS =
         calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
     if (!FromRHS)
       return None;
 
     // The byte is only attributable when one side contributes zero.
     if (FromLHS->isConstantZero())
       return FromRHS;
     if (FromRHS->isConstantZero())
       return FromLHS;
     return None;
   }
 
   case ISD::SHL: {
     // Only shifts by a constant whole number of bytes are understood.
     auto *AmountC = dyn_cast<ConstantSDNode>(Op->getOperand(1));
     if (!AmountC)
       return None;
 
     const uint64_t ShiftedBits = AmountC->getZExtValue();
     if (ShiftedBits % 8 != 0)
       return None;
     const uint64_t ShiftedBytes = ShiftedBits / 8;
 
     // Bytes below the shift amount are filled with zeros.
     if (Index < ShiftedBytes)
       return ByteProvider::getConstantZero();
     return calculateByteProvider(Op->getOperand(0), Index - ShiftedBytes,
                                  Depth + 1);
   }
 
   case ISD::ANY_EXTEND:
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND: {
     SDValue Inner = Op->getOperand(0);
     const unsigned InnerBits = Inner.getScalarValueSizeInBits();
     if (InnerBits % 8 != 0)
       return None;
     const uint64_t InnerBytes = InnerBits / 8;
 
     // Bytes inside the narrow value come from the extended operand.
     if (Index < InnerBytes)
       return calculateByteProvider(Inner, Index, Depth + 1);
 
     // Bytes added by the extension are only known for a zero extension.
     if (Opcode == ISD::ZERO_EXTEND)
       return ByteProvider::getConstantZero();
     return None;
   }
 
   case ISD::BSWAP:
     // A byte swap mirrors the byte index.
     return calculateByteProvider(Op->getOperand(0), NumBytes - Index - 1,
                                  Depth + 1);
 
   case ISD::LOAD: {
     auto *Ld = cast<LoadSDNode>(Op.getNode());
     if (Ld->isVolatile() || Ld->isIndexed())
       return None;
 
     const unsigned MemBits = Ld->getMemoryVT().getSizeInBits();
     if (MemBits % 8 != 0)
       return None;
     const uint64_t MemBytes = MemBits / 8;
 
     // Bytes within the memory type are read from memory.
     if (Index < MemBytes)
       return ByteProvider::getMemory(Ld, Index);
 
     // Bytes beyond the memory type are only known for zero-extending loads.
     if (Ld->getExtensionType() == ISD::ZEXTLOAD)
       return ByteProvider::getConstantZero();
     return None;
   }
   }
 
   return None;
 }
 
 /// Match a pattern where a wide type scalar value is loaded by several narrow
 /// loads and combined by shifts and ors. Fold it into a single load or a load
 /// and a BSWAP if the targets supports it.
 ///
 /// Assuming little endian target:
 ///  i8 *a = ...
 ///  i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
 /// =>
 ///  i32 val = *((i32)a)
 ///
 ///  i8 *a = ...
 ///  i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
 /// =>
 ///  i32 val = BSWAP(*((i32)a))
 ///
 /// \p N must be an ISD::OR node. Only i16, i32 and i64 results are handled.
 /// Returns the replacement value (the new load, or a BSWAP of it), or an empty
 /// SDValue when the pattern does not match or the wide load is not allowed and
 /// fast on the target. On success the chain users of the narrow loads are
 /// redirected to the chain result of the new load.
 ///
 /// TODO: This rule matches complex patterns with OR node roots and doesn't
 /// interact well with the worklist mechanism. When a part of the pattern is
 /// updated (e.g. one of the loads) its direct users are put into the worklist,
 /// but the root node of the pattern which triggers the load combine is not
 /// necessarily a direct user of the changed node. For example, once the address
 /// of t28 load is reassociated load combine won't be triggered:
 ///             t25: i32 = add t4, Constant:i32<2>
 ///           t26: i64 = sign_extend t25
 ///        t27: i64 = add t2, t26
 ///       t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
 ///     t29: i32 = zero_extend t28
 ///   t32: i32 = shl t29, Constant:i8<8>
 /// t33: i32 = or t23, t32
 /// As a possible fix visitLoad can check if the load can be a part of a load
 /// combine pattern and add corresponding OR roots to the worklist.
 SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
   assert(N->getOpcode() == ISD::OR &&
          "Can only match load combining against OR nodes");
 
   // Handles simple types only
   EVT VT = N->getValueType(0);
   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
     return SDValue();
   unsigned ByteWidth = VT.getSizeInBits() / 8;
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   // Before legalize we can introduce too wide illegal loads which will be later
   // split into legal sized loads. This enables us to combine i64 load by i8
   // patterns to a couple of i32 loads on 32 bit targets.
   if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT))
     return SDValue();
 
   // Map the index of a byte within a value (0 = least significant) to its
   // offset in memory for a value that is BW bytes wide. These helpers are only
   // called locally, so plain closures are used instead of std::function to
   // avoid type erasure and an indirect call per byte.
   auto LittleEndianByteAt = [](unsigned BW, unsigned i) { return i; };
   auto BigEndianByteAt = [](unsigned BW, unsigned i) { return BW - i - 1; };
 
   bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
   // Offset in memory, relative to the address of the providing load, of the
   // byte described by a memory byte provider on the current target.
   auto MemoryByteOffset = [&] (ByteProvider P) {
     assert(P.isMemory() && "Must be a memory byte provider");
     unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
     assert(LoadBitWidth % 8 == 0 &&
            "can only analyze providers for individual bytes not bit");
     unsigned LoadByteWidth = LoadBitWidth / 8;
     return IsBigEndianTarget
             ? BigEndianByteAt(LoadByteWidth, P.ByteOffset)
             : LittleEndianByteAt(LoadByteWidth, P.ByteOffset);
   };
 
   Optional<BaseIndexOffset> Base;
   SDValue Chain;
 
   SmallPtrSet<LoadSDNode *, 8> Loads;
   Optional<ByteProvider> FirstByteProvider;
   int64_t FirstOffset = INT64_MAX;
 
   // Check if all the bytes of the OR we are looking at are loaded from the same
   // base address. Collect bytes offsets from Base address in ByteOffsets.
   SmallVector<int64_t, 4> ByteOffsets(ByteWidth);
   for (unsigned i = 0; i < ByteWidth; i++) {
     auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
     if (!P || !P->isMemory()) // All the bytes must be loaded from memory
       return SDValue();
 
     LoadSDNode *L = P->Load;
     assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() &&
            "Must be enforced by calculateByteProvider");
     assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
 
     // All loads must share the same chain
     SDValue LChain = L->getChain();
     if (!Chain)
       Chain = LChain;
     else if (Chain != LChain)
       return SDValue();
 
     // Loads must share the same base address
     BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
     int64_t ByteOffsetFromBase = 0;
     if (!Base)
       Base = Ptr;
     else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
       return SDValue();
 
     // Calculate the offset of the current byte from the base address
     ByteOffsetFromBase += MemoryByteOffset(*P);
     ByteOffsets[i] = ByteOffsetFromBase;
 
     // Remember the first byte load
     if (ByteOffsetFromBase < FirstOffset) {
       FirstByteProvider = P;
       FirstOffset = ByteOffsetFromBase;
     }
 
     Loads.insert(L);
   }
   assert(!Loads.empty() && "All the bytes of the value must be loaded from "
          "memory, so there must be at least one load which produces the value");
   assert(Base && "Base address of the accessed memory location must be set");
   assert(FirstOffset != INT64_MAX && "First byte offset must be set");
 
   // Check if the bytes of the OR we are looking at match with either big or
   // little endian value load
   bool BigEndian = true, LittleEndian = true;
   for (unsigned i = 0; i < ByteWidth; i++) {
     int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
     LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i);
     BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i);
     if (!BigEndian && !LittleEndian)
       return SDValue();
   }
   assert((BigEndian != LittleEndian) && "should be either or");
   assert(FirstByteProvider && "must be set");
 
   // Ensure that the first byte is loaded from zero offset of the first load.
   // So the combined value can be loaded from the first load address.
   if (MemoryByteOffset(*FirstByteProvider) != 0)
     return SDValue();
   LoadSDNode *FirstLoad = FirstByteProvider->Load;
 
   // The node we are looking at matches with the pattern, check if we can
   // replace it with a single load and bswap if needed.
 
   // If the load needs byte swap check if the target supports it
   bool NeedsBswap = IsBigEndianTarget != BigEndian;
 
   // Before legalize we can introduce illegal bswaps which will be later
   // converted to an explicit bswap sequence. This way we end up with a single
   // load and byte shuffling instead of several loads and byte shuffling.
   if (NeedsBswap && LegalOperations && !TLI.isOperationLegal(ISD::BSWAP, VT))
     return SDValue();
 
   // Check that a load of the wide type is both allowed and fast on the target
   bool Fast = false;
   bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
                                         VT, FirstLoad->getAddressSpace(),
                                         FirstLoad->getAlignment(), &Fast);
   if (!Allowed || !Fast)
     return SDValue();
 
   // The wide load reuses the address, pointer info and alignment of the load
   // that provides the lowest addressed byte.
   SDValue NewLoad =
       DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
                   FirstLoad->getPointerInfo(), FirstLoad->getAlignment());
 
   // Transfer chain users from old loads to the new load.
   for (LoadSDNode *L : Loads)
     DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));
 
   return NeedsBswap ? DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad) : NewLoad;
 }
 
 // Unfolds a masked merge when the target has andn, bsl, or a similar
 // bit-select instruction. The canonical form being matched is:
 //   |        A  |  |B|
 //   ((x ^ y) & m) ^ y
 //    |  D  |
 // and it is rewritten as:
 //   (x & m) | (y & ~m)
 // When y is a constant and the target's 'andn' does not accept immediates,
 // a different form is produced instead:
 //   ~(~x & m) & (m | y)
 // NOTE: the pattern is left alone if an 'xor' is really a 'not', because at
 //       the very least that breaks andnpd / andnps patterns, and because those
 //       patterns are simplified in IR and shouldn't be created in the DAG
 // N must be an XOR node. Returns an empty SDValue when nothing was unfolded.
 SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
   assert(N->getOpcode() == ISD::XOR);
 
   // The outer xor must not be a 'not' (i.e. y = -1).
   if (isAllOnesConstantOrAllOnesSplatConstant(N->getOperand(1)))
     return SDValue();
 
   EVT VT = N->getValueType(0);
 
   // The pattern contains 3 commutable operators, which gives 8 variants of
   // the basic shape. The helper below covers two of them per call (the inner
   // xor may have its operands in either order); the caller tries both 'and'
   // operand positions and both outer xor operand orders.
   SDValue X, Y, M;
   auto tryMatch = [&](SDValue AndOp, unsigned InnerIdx,
                       SDValue OuterY) -> bool {
     const bool IsOneUseAnd = AndOp.getOpcode() == ISD::AND && AndOp.hasOneUse();
     if (!IsOneUseAnd)
       return false;
 
     SDValue Inner = AndOp.getOperand(InnerIdx);
     const bool IsOneUseXor = Inner.getOpcode() == ISD::XOR && Inner.hasOneUse();
     if (!IsOneUseXor)
       return false;
 
     SDValue InnerX = Inner.getOperand(0);
     SDValue InnerY = Inner.getOperand(1);
     // The inner xor must not be a 'not' either.
     if (isAllOnesConstantOrAllOnesSplatConstant(InnerY))
       return false;
 
     // Arrange for the operand shared with the outer xor to be InnerY.
     if (InnerX == OuterY)
       std::swap(InnerX, InnerY);
     if (InnerY != OuterY)
       return false;
 
     // Publish the match only once every check has passed.
     X = InnerX;
     Y = InnerY;
     M = AndOp.getOperand(InnerIdx == 0 ? 1 : 0);
     return true;
   };
 
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   const bool Matched = tryMatch(Op0, 0, Op1) || tryMatch(Op0, 1, Op1) ||
                        tryMatch(Op1, 0, Op0) || tryMatch(Op1, 1, Op0);
   if (!Matched)
     return SDValue();
 
   // A constant mask is not handled. This should not be reachable:
   // InstCombine should have already unfolded this pattern, and DAGCombiner
   // probably shouldn't produce it, too.
   if (isa<ConstantSDNode>(M.getNode()))
     return SDValue();
 
   // The rewrite requires AndNot support for the mask.
   if (!TLI.hasAndNot(M))
     return SDValue();
 
   SDLoc DL(N);
 
   if (TLI.hasAndNot(Y)) {
     // (x & m) | (y & ~m)
     SDValue MaskedX = DAG.getNode(ISD::AND, DL, VT, X, M);
     SDValue InvM = DAG.getNOT(DL, M, VT);
     SDValue MaskedY = DAG.getNode(ISD::AND, DL, VT, Y, InvM);
     return DAG.getNode(ISD::OR, DL, VT, MaskedX, MaskedY);
   }
 
   // Y is a constant and 'andn' does not work with immediates, so a bit more
   // work is needed to make sure andn is still used: ~(~x & m) & (m | y)
   assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable.");
   SDValue InvX = DAG.getNOT(DL, X, VT);
   SDValue InvXAndM = DAG.getNode(ISD::AND, DL, VT, InvX, M);
   SDValue InvInner = DAG.getNOT(DL, InvXAndM, VT);
   SDValue MOrY = DAG.getNode(ISD::OR, DL, VT, M, Y);
   return DAG.getNode(ISD::AND, DL, VT, InvInner, MOrY);
 }
 
 /// Tries to simplify the XOR node \p N by attempting a fixed sequence of folds
 /// in the order written below; the first fold that applies wins.
 ///
 /// Returns the replacement value when a fold applies, SDValue(N, 0) when
 /// SimplifyDemandedBits reports a change, and an empty SDValue when no fold
 /// applies.
 SDValue DAGCombiner::visitXOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
 
   // fold vector ops
   if (VT.isVector()) {
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
     // fold (xor x, 0) -> x, vector edition
     if (ISD::isBuildVectorAllZeros(N0.getNode()))
       return N1;
     if (ISD::isBuildVectorAllZeros(N1.getNode()))
       return N0;
   }
 
   // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
   if (N0.isUndef() && N1.isUndef())
     return DAG.getConstant(0, SDLoc(N), VT);
   // fold (xor x, undef) -> undef
   if (N0.isUndef())
     return N0;
   if (N1.isUndef())
     return N1;
   // fold (xor c1, c2) -> c1^c2
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
   ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::XOR, SDLoc(N), VT, N0C, N1C);
   // canonicalize constant to RHS
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
     return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0);
   // fold (xor x, 0) -> x
   if (isNullConstant(N1))
     return N0;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // reassociate xor
   if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1))
     return RXOR;
 
   // fold !(x cc y) -> (x !cc y)
   SDValue LHS, RHS, CC;
   if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) {
     bool isInt = LHS.getValueType().isInteger();
     ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
                                                isInt);
 
     // Once only legal operations may be created, the inverted condition code
     // has to be legal for the compared type.
     if (!LegalOperations ||
         TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
       switch (N0.getOpcode()) {
       default:
         llvm_unreachable("Unhandled SetCC Equivalent!");
       case ISD::SETCC:
         return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
       case ISD::SELECT_CC:
         return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
                                N0.getOperand(3), NotCC);
       }
     }
   }
 
   // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
   if (isOneConstant(N1) && N0.getOpcode() == ISD::ZERO_EXTEND &&
       N0.getNode()->hasOneUse() &&
       isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
     SDValue V = N0.getOperand(0);
     SDLoc DL(N0);
     // The 'not' is expressed as an xor with 1 in the narrow type.
     V = DAG.getNode(ISD::XOR, DL, V.getValueType(), V,
                     DAG.getConstant(1, DL, V.getValueType()));
     AddToWorklist(V.getNode());
     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V);
   }
 
   // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
   // This i1 variant also handles (not (and x, y)) -> (or (not x), (not y)):
   // the opcode is flipped between AND and OR and both operands are inverted.
   if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
       (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
     SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
     if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) {
       unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
       LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
       RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
       AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
       return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
     }
   }
   // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
   // Same rewrite as above (AND and OR are swapped), here for an all-ones N1
   // and an operand that is a ConstantSDNode.
   if (isAllOnesConstant(N1) && N0.hasOneUse() &&
       (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
     SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
     if (isa<ConstantSDNode>(RHS) || isa<ConstantSDNode>(LHS)) {
       unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
       LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
       RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
       AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
       return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
     }
   }
   // fold (xor (and x, y), y) -> (and (not x), y)
   if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
       N0->getOperand(1) == N1) {
     SDValue X = N0->getOperand(0);
     SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
     AddToWorklist(NotX.getNode());
     return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1);
   }
 
   // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
   if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
     // A is the operand expected to be the ADD and S the one expected to be
     // the SRA; the xor operands may appear in either order.
     SDValue A = N0.getOpcode() == ISD::ADD ? N0 : N1;
     SDValue S = N0.getOpcode() == ISD::SRA ? N0 : N1;
     if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) {
       SDValue A0 = A.getOperand(0), A1 = A.getOperand(1);
       SDValue S0 = S.getOperand(0);
       // The add must combine the shifted value S0 with the shift result S.
       if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0)) {
         unsigned OpSizeInBits = VT.getScalarSizeInBits();
         if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1)))
           if (C->getAPIntValue() == (OpSizeInBits - 1))
             return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
       }
     }
   }
 
   // fold (xor x, x) -> 0
   if (N0 == N1)
     return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
 
   // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
   // Here is a concrete example of this equivalence:
   // i16   x ==  14
   // i16 shl ==   1 << 14  == 16384 == 0b0100000000000000
   // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111
   //
   // =>
   //
   // i16     ~1      == 0b1111111111111110
   // i16 rol(~1, 14) == 0b1011111111111111
   //
   // Some additional tips to help conceptualize this transform:
   // - Try to see the operation as placing a single zero in a value of all ones.
   // - There exists no value for x which would allow the result to contain zero.
   // - Values of x larger than the bitwidth are undefined and do not require a
   //   consistent result.
   // - Pushing the zero left requires shifting one bits in from the right.
   // A rotate left of ~1 is a nice way of achieving the desired result.
   if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0.getOpcode() == ISD::SHL
       && isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
     SDLoc DL(N);
     return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT),
                        N0.getOperand(1));
   }
 
   // Simplify: xor (op x...), (op y...)  -> (op (xor x, y))
   if (N0.getOpcode() == N1.getOpcode())
     if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
       return Tmp;
 
   // Unfold  ((x ^ y) & m) ^ y  into  (x & m) | (y & ~m)  if profitable
   if (SDValue MM = unfoldMaskedMerge(N))
     return MM;
 
   // Simplify the expression using non-local knowledge.
   // NOTE(review): returning N itself presumably signals that the node was
   // updated in place by SimplifyDemandedBits -- confirm against its contract.
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   return SDValue();
 }
 
 /// Handle transforms common to the three shifts, when the shift amount is a
 /// constant.
 ///
 /// The rewrite performed here pulls a binary operator with a constant right
 /// operand through the shift:
 ///   (shift (binop X, C1), C2) -> (binop (shift X, C2), (shift C1, C2))
 /// where binop is one of OR, XOR, AND, or ADD (ADD only when the shift is SHL).
 ///
 /// \p N is the shift node. \p Amt is not referenced in the lines of this
 /// function; the shift amount is read from operand 1 of \p N instead.
 /// Returns the new binop, or an empty SDValue when the rewrite is not applied.
 SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) {
   // Only a shifted value with a single use is considered.
   SDNode *LHS = N->getOperand(0).getNode();
   if (!LHS->hasOneUse()) return SDValue();
 
   // We want to pull some binops through shifts, so that we have (and (shift))
   // instead of (shift (and)), likewise for add, or, xor, etc.  This sort of
   // thing happens with address calculations, so it's important to canonicalize
   // it.
   bool HighBitSet = false;  // Can we transform this if the high bit is set?
 
   switch (LHS->getOpcode()) {
   default: return SDValue();
   case ISD::OR:
   case ISD::XOR:
     HighBitSet = false; // We can only transform sra if the high bit is clear.
     break;
   case ISD::AND:
     HighBitSet = true;  // We can only transform sra if the high bit is set.
     break;
   case ISD::ADD:
     if (N->getOpcode() != ISD::SHL)
       return SDValue(); // only shl(add) not sr[al](add).
     HighBitSet = false; // We can only transform sra if the high bit is clear.
     break;
   }
 
   // We require the RHS of the binop to be a constant and not opaque as well.
   ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS->getOperand(1));
   if (!BinOpCst) return SDValue();
 
   // FIXME: disable this unless the input to the binop is a shift by a constant
   // or is copy/select. Enable this in other cases once we figure out when it is
   // actually profitable.
   SDNode *BinOpLHSVal = LHS->getOperand(0).getNode();
   bool isShift = BinOpLHSVal->getOpcode() == ISD::SHL ||
                  BinOpLHSVal->getOpcode() == ISD::SRA ||
                  BinOpLHSVal->getOpcode() == ISD::SRL;
   bool isCopyOrSelect = BinOpLHSVal->getOpcode() == ISD::CopyFromReg ||
                         BinOpLHSVal->getOpcode() == ISD::SELECT;
 
   // Bail out unless the binop input is a shift by a constant or a
   // CopyFromReg / SELECT node.
   if ((!isShift || !isa<ConstantSDNode>(BinOpLHSVal->getOperand(1))) &&
       !isCopyOrSelect)
     return SDValue();
 
   // For a CopyFromReg / SELECT input the rewrite is skipped when the shift
   // itself has a single use.
   if (isCopyOrSelect && N->hasOneUse())
     return SDValue();
 
   EVT VT = N->getValueType(0);
 
   // If this is a signed shift right, and the high bit is modified by the
   // logical operation, do not perform the transformation. The highBitSet
   // boolean indicates the value of the high bit of the constant which would
   // cause it to be modified for this operation.
   if (N->getOpcode() == ISD::SRA) {
     bool BinOpRHSSignSet = BinOpCst->getAPIntValue().isNegative();
     if (BinOpRHSSignSet != HighBitSet)
       return SDValue();
   }
 
-  if (!TLI.isDesirableToCommuteWithShift(LHS))
+  if (!TLI.isDesirableToCommuteWithShift(N, Level))
     return SDValue();
 
   // Fold the constants, shifting the binop RHS by the shift amount.
   SDValue NewRHS = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(1)),
                                N->getValueType(0),
                                LHS->getOperand(1), N->getOperand(1));
   assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!");
 
   // Create the new shift.
   SDValue NewShift = DAG.getNode(N->getOpcode(),
                                  SDLoc(LHS->getOperand(0)),
                                  VT, LHS->getOperand(0), N->getOperand(1));
 
   // Create the new binop.
   return DAG.getNode(LHS->getOpcode(), SDLoc(N), VT, NewShift, NewRHS);
 }
 
 /// Pushes a truncate through an AND with a constant mask:
 ///   (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC)
 ///
 /// \p N must be a TRUNCATE node whose operand is an AND. The rewrite is only
 /// done when both the truncate and the AND have a single use and the mask is a
 /// non-opaque constant or constant vector. Returns the new AND, or an empty
 /// SDValue when the rewrite is not applied.
 SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
   assert(N->getOpcode() == ISD::TRUNCATE);
   assert(N->getOperand(0).getOpcode() == ISD::AND);
 
   SDValue AndOp = N->getOperand(0);
   if (!N->hasOneUse() || !AndOp.hasOneUse())
     return SDValue();
 
   SDValue Mask = AndOp.getOperand(1);
   if (!isConstantOrConstantVector(Mask, /* NoOpaques */ true))
     return SDValue();
 
   SDLoc DL(N);
   EVT TruncVT = N->getValueType(0);
   SDValue Src = AndOp.getOperand(0);
 
   // Truncate both AND operands separately and queue the new nodes so that
   // they get combined as well.
   SDValue NarrowSrc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src);
   SDValue NarrowMask = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mask);
   AddToWorklist(NarrowSrc.getNode());
   AddToWorklist(NarrowMask.getNode());
   return DAG.getNode(ISD::AND, DL, TruncVT, NarrowSrc, NarrowMask);
 }
 
 /// Tries to simplify the rotate node \p N (the folds below refer to ROTL and
 /// ROTR). Returns the replacement value, or an empty SDValue when no fold
 /// applies.
 SDValue DAGCombiner::visitRotate(SDNode *N) {
   SDLoc dl(N);
   SDValue Val = N->getOperand(0);
   SDValue Amt = N->getOperand(1);
   EVT VT = N->getValueType(0);
   unsigned Bitsize = VT.getScalarSizeInBits();
   const unsigned RotOpc = N->getOpcode();
 
   // fold (rot x, 0) -> x
   if (isNullConstantOrNullSplatConstant(Amt))
     return Val;
 
   // fold (rot x, c) -> (rot x, c % BitSize)
   if (ConstantSDNode *AmtC = isConstOrConstSplat(Amt)) {
     const APInt &AmtVal = AmtC->getAPIntValue();
     if (AmtVal.uge(Bitsize)) {
       uint64_t Reduced = AmtVal.urem(Bitsize);
       return DAG.getNode(RotOpc, dl, VT, Val,
                          DAG.getConstant(Reduced, dl, Amt.getValueType()));
     }
   }
 
   // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
   const bool AmtIsTruncOfAnd = Amt.getOpcode() == ISD::TRUNCATE &&
                                Amt.getOperand(0).getOpcode() == ISD::AND;
   if (AmtIsTruncOfAnd) {
     if (SDValue NewAmt = distributeTruncateThroughAnd(Amt.getNode()))
       return DAG.getNode(RotOpc, dl, VT, Val, NewAmt);
   }
 
   // fold (rot* (rot* x, c2), c1) -> (rot* x, c1 +- c2 % bitsize)
   const unsigned InnerOpc = Val.getOpcode();
   if (InnerOpc != ISD::ROTL && InnerOpc != ISD::ROTR)
     return SDValue();
 
   SDNode *OuterC = DAG.isConstantIntBuildVectorOrConstantInt(Amt);
   SDNode *InnerC =
       DAG.isConstantIntBuildVectorOrConstantInt(Val.getOperand(1));
   // Both amounts must be constants of the same type.
   if (!OuterC || !InnerC ||
       OuterC->getValueType(0) != InnerC->getValueType(0))
     return SDValue();
 
   EVT ShiftVT = OuterC->getValueType(0);
   // Rotations in the same direction add up, opposite directions subtract.
   const unsigned CombineOp = (RotOpc == InnerOpc) ? ISD::ADD : ISD::SUB;
   SDValue Combined =
       DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, OuterC, InnerC);
   if (!Combined)
     return SDValue();
 
   // Bring the combined amount back into range with a signed remainder.
   SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
   SDValue Normalized = DAG.FoldConstantArithmetic(
       ISD::SREM, dl, ShiftVT, Combined.getNode(), BitsizeC.getNode());
   return DAG.getNode(RotOpc, dl, VT, Val->getOperand(0), Normalized);
 }
 
 SDValue DAGCombiner::visitSHL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
   // fold vector ops
   if (VT.isVector()) {
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
     BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
     // If setcc produces all-one true value then:
     // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
     if (N1CV && N1CV->isConstant()) {
       if (N0.getOpcode() == ISD::AND) {
         SDValue N00 = N0->getOperand(0);
         SDValue N01 = N0->getOperand(1);
         BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);
 
         if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
             TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
                 TargetLowering::ZeroOrNegativeOneBooleanContent) {
           if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT,
                                                      N01CV, N1CV))
             return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
         }
       }
     }
   }
 
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
 
   // fold (shl c1, c2) -> c1<<c2
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
   if (N0C && N1C && !N1C->isOpaque())
     return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C);
   // fold (shl 0, x) -> 0
   if (isNullConstantOrNullSplatConstant(N0))
     return N0;
   // fold (shl x, c >= size(x)) -> undef
   // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
   auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
     return Val->getAPIntValue().uge(OpSizeInBits);
   };
   if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
     return DAG.getUNDEF(VT);
   // fold (shl x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
   // fold (shl undef, x) -> 0
   if (N0.isUndef())
     return DAG.getConstant(0, SDLoc(N), VT);
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // if (shl x, c) is known to be zero, return 0
   if (DAG.MaskedValueIsZero(SDValue(N, 0),
                             APInt::getAllOnesValue(OpSizeInBits)))
     return DAG.getConstant(0, SDLoc(N), VT);
   // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
   if (N1.getOpcode() == ISD::TRUNCATE &&
       N1.getOperand(0).getOpcode() == ISD::AND) {
     if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
       return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
   }
 
   if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
   if (N0.getOpcode() == ISD::SHL) {
     auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
                                           ConstantSDNode *RHS) {
       APInt c1 = LHS->getAPIntValue();
       APInt c2 = RHS->getAPIntValue();
       zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
       return (c1 + c2).uge(OpSizeInBits);
     };
     if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
       return DAG.getConstant(0, SDLoc(N), VT);
 
     auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
                                        ConstantSDNode *RHS) {
       APInt c1 = LHS->getAPIntValue();
       APInt c2 = RHS->getAPIntValue();
       zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
       return (c1 + c2).ult(OpSizeInBits);
     };
     if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
       SDLoc DL(N);
       EVT ShiftVT = N1.getValueType();
       SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
       return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
     }
   }
 
   // fold (shl (ext (shl x, c1)), c2) -> (ext (shl x, (add c1, c2)))
   // For this to be valid, the second form must not preserve any of the bits
   // that are shifted out by the inner shift in the first form.  This means
   // the outer shift size must be >= the number of bits added by the ext.
   // As a corollary, we don't care what kind of ext it is.
   if (N1C && (N0.getOpcode() == ISD::ZERO_EXTEND ||
               N0.getOpcode() == ISD::ANY_EXTEND ||
               N0.getOpcode() == ISD::SIGN_EXTEND) &&
       N0.getOperand(0).getOpcode() == ISD::SHL) {
     SDValue N0Op0 = N0.getOperand(0);
     if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) {
       APInt c1 = N0Op0C1->getAPIntValue();
       APInt c2 = N1C->getAPIntValue();
       zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
 
       EVT InnerShiftVT = N0Op0.getValueType();
       uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
       if (c2.uge(OpSizeInBits - InnerShiftSize)) {
         SDLoc DL(N0);
         APInt Sum = c1 + c2;
         if (Sum.uge(OpSizeInBits))
           return DAG.getConstant(0, DL, VT);
 
         return DAG.getNode(
             ISD::SHL, DL, VT,
             DAG.getNode(N0.getOpcode(), DL, VT, N0Op0->getOperand(0)),
             DAG.getConstant(Sum.getZExtValue(), DL, N1.getValueType()));
       }
     }
   }
 
   // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
   // Only fold this if the inner zext has no other uses to avoid increasing
   // the total number of instructions.
   if (N1C && N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
       N0.getOperand(0).getOpcode() == ISD::SRL) {
     SDValue N0Op0 = N0.getOperand(0);
     if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) {
       if (N0Op0C1->getAPIntValue().ult(VT.getScalarSizeInBits())) {
         uint64_t c1 = N0Op0C1->getZExtValue();
         uint64_t c2 = N1C->getZExtValue();
         if (c1 == c2) {
           SDValue NewOp0 = N0.getOperand(0);
           EVT CountVT = NewOp0.getOperand(1).getValueType();
           SDLoc DL(N);
           SDValue NewSHL = DAG.getNode(ISD::SHL, DL, NewOp0.getValueType(),
                                        NewOp0,
                                        DAG.getConstant(c2, DL, CountVT));
           AddToWorklist(NewSHL.getNode());
           return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
         }
       }
     }
   }
 
   // fold (shl (sr[la] exact X,  C1), C2) -> (shl    X, (C2-C1)) if C1 <= C2
   // fold (shl (sr[la] exact X,  C1), C2) -> (sr[la] X, (C2-C1)) if C1  > C2
   if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) &&
       N0->getFlags().hasExact()) {
     if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
       uint64_t C1 = N0C1->getZExtValue();
       uint64_t C2 = N1C->getZExtValue();
       SDLoc DL(N);
       if (C1 <= C2)
         return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
                            DAG.getConstant(C2 - C1, DL, N1.getValueType()));
       return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0),
                          DAG.getConstant(C1 - C2, DL, N1.getValueType()));
     }
   }
 
   // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
   //                               (and (srl x, (sub c1, c2), MASK)
   // Only fold this if the inner shift has no other uses -- if it does, folding
   // this will increase the total number of instructions.
   if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
     if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
       uint64_t c1 = N0C1->getZExtValue();
       if (c1 < OpSizeInBits) {
         uint64_t c2 = N1C->getZExtValue();
         APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1);
         SDValue Shift;
         if (c2 > c1) {
           Mask <<= c2 - c1;
           SDLoc DL(N);
           Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
                               DAG.getConstant(c2 - c1, DL, N1.getValueType()));
         } else {
           Mask.lshrInPlace(c1 - c2);
           SDLoc DL(N);
           Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
                               DAG.getConstant(c1 - c2, DL, N1.getValueType()));
         }
         SDLoc DL(N0);
         return DAG.getNode(ISD::AND, DL, VT, Shift,
                            DAG.getConstant(Mask, DL, VT));
       }
     }
   }
 
   // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
   if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
       isConstantOrConstantVector(N1, /* No Opaques */ true)) {
     SDLoc DL(N);
     SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
     SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
     return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
   }
 
   // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
   // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
   // Variant of version done on multiply, except mul by a power of 2 is turned
   // into a shift.
   if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
       N0.getNode()->hasOneUse() &&
       isConstantOrConstantVector(N1, /* No Opaques */ true) &&
-      isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
+      isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) &&
+      TLI.isDesirableToCommuteWithShift(N, Level)) {
     SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
     SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
     AddToWorklist(Shl0.getNode());
     AddToWorklist(Shl1.getNode());
     return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
   }
 
   // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
   if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() &&
       isConstantOrConstantVector(N1, /* No Opaques */ true) &&
       isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
     SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
     if (isConstantOrConstantVector(Shl))
       return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
   }
 
   if (N1C && !N1C->isOpaque())
     if (SDValue NewSHL = visitShiftByConstant(N, N1C))
       return NewSHL;
 
   return SDValue();
 }
 
 /// Try the DAG-combine folds for the arithmetic-shift-right node \p N.
 /// Operand 0 is the value being shifted and operand 1 is the shift amount.
 ///
 /// \returns the replacement value of the first fold that applies,
 /// SDValue(N, 0) when SimplifyDemandedBits reports a change, or an empty
 /// SDValue when no fold applies. Folds are attempted in the order written.
 SDValue DAGCombiner::visitSRA(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
   // Arithmetic shifting an all-sign-bit value is a no-op.
   // fold (sra 0, x) -> 0
   // fold (sra -1, x) -> -1
   if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
     return N0;
 
   // fold vector ops
   if (VT.isVector())
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
   // Shift amount as a scalar constant or a splatted vector constant, if any.
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
 
   // fold (sra c1, c2) -> c1 >>s c2
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
   if (N0C && N1C && !N1C->isOpaque())
     return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
   // fold (sra x, c >= size(x)) -> undef
   // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
   auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
     return Val->getAPIntValue().uge(OpSizeInBits);
   };
   if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
     return DAG.getUNDEF(VT);
   // fold (sra x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
   // sext_inreg.
   if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
     // Number of low bits of x that survive the shl/sra pair.
     unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
     EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
     if (VT.isVector())
       ExtVT = EVT::getVectorVT(*DAG.getContext(),
                                ExtVT, VT.getVectorNumElements());
     if ((!LegalOperations ||
          TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT)))
       return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                          N0.getOperand(0), DAG.getValueType(ExtVT));
   }
 
   // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
   // When c1 + c2 is out of range the amount is clamped to size - 1 instead.
   if (N0.getOpcode() == ISD::SRA) {
     SDLoc DL(N);
     EVT ShiftVT = N1.getValueType();
 
     // True when the combined shift amount is >= the scalar bit width. The
     // operands are widened by one bit first so the addition cannot wrap.
     auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
                                           ConstantSDNode *RHS) {
       APInt c1 = LHS->getAPIntValue();
       APInt c2 = RHS->getAPIntValue();
       zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
       return (c1 + c2).uge(OpSizeInBits);
     };
     if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
       return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0),
                          DAG.getConstant(OpSizeInBits - 1, DL, ShiftVT));
 
     // True when the combined shift amount is still < the scalar bit width.
     auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
                                        ConstantSDNode *RHS) {
       APInt c1 = LHS->getAPIntValue();
       APInt c2 = RHS->getAPIntValue();
       zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
       return (c1 + c2).ult(OpSizeInBits);
     };
     if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
       SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
       return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), Sum);
     }
   }
 
   // fold (sra (shl X, c1), c2)
   // -> (sign_extend (trunc (srl X, (sub c2, c1)))) for c2 > c1, where the
   // trunc narrows to (result_size - c2) bits.
   // If truncate is free for the target sext(trunc) is likely to result in
   // better code.
   if (N0.getOpcode() == ISD::SHL && N1C) {
     // Get the two constants of the shifts: N01C is the inner shl amount and
     // N1C is the outer sra amount.
     const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1));
     if (N01C) {
       LLVMContext &Ctx = *DAG.getContext();
       // Determine what the truncate's result bitsize and type would be.
       EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue());
 
       if (VT.isVector())
         TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());
 
       // Determine the residual right-shift amount.
       int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
 
       // If the shift is not a no-op (in which case this should be just a sign
       // extend already), the truncated to type is legal, sign_extend is legal
       // on that type, and the truncate to that type is both legal and free,
       // perform the transform.
       if ((ShiftAmt > 0) &&
           TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
           TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
           TLI.isTruncateFree(VT, TruncVT)) {
         SDLoc DL(N);
         SDValue Amt = DAG.getConstant(ShiftAmt, DL,
             getShiftAmountTy(N0.getOperand(0).getValueType()));
         SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
                                     N0.getOperand(0), Amt);
         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
                                     Shift);
         return DAG.getNode(ISD::SIGN_EXTEND, DL,
                            N->getValueType(0), Trunc);
       }
     }
   }
 
   // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
   if (N1.getOpcode() == ISD::TRUNCATE &&
       N1.getOperand(0).getOpcode() == ISD::AND) {
     if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
       return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
   }
 
   // fold (sra (trunc (sr[la] x, c1)), c2) -> (trunc (sra x, c1 + c2))
   //      if c1 is equal to the number of bits the trunc removes
   // Both the wide shift and its shift-amount operand must have a single use.
   if (N0.getOpcode() == ISD::TRUNCATE &&
       (N0.getOperand(0).getOpcode() == ISD::SRL ||
        N0.getOperand(0).getOpcode() == ISD::SRA) &&
       N0.getOperand(0).hasOneUse() &&
       N0.getOperand(0).getOperand(1).hasOneUse() &&
       N1C) {
     SDValue N0Op0 = N0.getOperand(0);
     if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) {
       unsigned LargeShiftVal = LargeShift->getZExtValue();
       EVT LargeVT = N0Op0.getValueType();
 
       if (LargeVT.getScalarSizeInBits() - OpSizeInBits == LargeShiftVal) {
         SDLoc DL(N);
         SDValue Amt =
           DAG.getConstant(LargeShiftVal + N1C->getZExtValue(), DL,
                           getShiftAmountTy(N0Op0.getOperand(0).getValueType()));
         SDValue SRA = DAG.getNode(ISD::SRA, DL, LargeVT,
                                   N0Op0.getOperand(0), Amt);
         return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA);
       }
     }
   }
 
   // Simplify, based on bits shifted out of the LHS.
   if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   // If the sign bit is known to be zero, switch this to a SRL.
   if (DAG.SignBitIsZero(N0))
     return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
 
   // Generic shift-by-constant folds; skipped for opaque constants.
   if (N1C && !N1C->isOpaque())
     if (SDValue NewSRA = visitShiftByConstant(N, N1C))
       return NewSRA;
 
   return SDValue();
 }
 
 /// Try the DAG-combine folds for the logical-shift-right node \p N.
 /// Operand 0 is the value being shifted and operand 1 is the shift amount.
 ///
 /// \returns the replacement value of the first fold that applies,
 /// SDValue(N, 0) when SimplifyDemandedBits reports a change, or an empty
 /// SDValue when no fold applies. Folds are attempted in the order written.
 SDValue DAGCombiner::visitSRL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
   // fold vector ops
   if (VT.isVector())
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
   // Shift amount as a scalar constant or a splatted vector constant, if any.
   ConstantSDNode *N1C = isConstOrConstSplat(N1);
 
   // fold (srl c1, c2) -> c1 >>u c2
   ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
   if (N0C && N1C && !N1C->isOpaque())
     return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C);
   // fold (srl 0, x) -> 0
   if (isNullConstantOrNullSplatConstant(N0))
     return N0;
   // fold (srl x, c >= size(x)) -> undef
   // NOTE: ALL vector elements must be too big to avoid partial UNDEFs.
   auto MatchShiftTooBig = [OpSizeInBits](ConstantSDNode *Val) {
     return Val->getAPIntValue().uge(OpSizeInBits);
   };
   if (ISD::matchUnaryPredicate(N1, MatchShiftTooBig))
     return DAG.getUNDEF(VT);
   // fold (srl x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // if (srl x, c) is known to be zero, return 0
   if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
                                    APInt::getAllOnesValue(OpSizeInBits)))
     return DAG.getConstant(0, SDLoc(N), VT);
 
   // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
   if (N0.getOpcode() == ISD::SRL) {
     // True when the combined shift amount is >= the scalar bit width. The
     // operands are widened by one bit first so the addition cannot wrap.
     auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
                                           ConstantSDNode *RHS) {
       APInt c1 = LHS->getAPIntValue();
       APInt c2 = RHS->getAPIntValue();
       zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
       return (c1 + c2).uge(OpSizeInBits);
     };
     if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
       return DAG.getConstant(0, SDLoc(N), VT);
 
     // True when the combined shift amount is still < the scalar bit width.
     auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
                                        ConstantSDNode *RHS) {
       APInt c1 = LHS->getAPIntValue();
       APInt c2 = RHS->getAPIntValue();
       zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
       return (c1 + c2).ult(OpSizeInBits);
     };
     if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
       SDLoc DL(N);
       EVT ShiftVT = N1.getValueType();
       SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
       return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
     }
   }
 
   // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2)))
   if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
       N0.getOperand(0).getOpcode() == ISD::SRL) {
     if (auto N001C = isConstOrConstSplat(N0.getOperand(0).getOperand(1))) {
       uint64_t c1 = N001C->getZExtValue();
       uint64_t c2 = N1C->getZExtValue();
       EVT InnerShiftVT = N0.getOperand(0).getValueType();
       EVT ShiftCountVT = N0.getOperand(0).getOperand(1).getValueType();
       uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
       // This is only valid if the OpSizeInBits + c1 = size of inner shift.
       if (c1 + OpSizeInBits == InnerShiftSize) {
         SDLoc DL(N0);
         if (c1 + c2 >= InnerShiftSize)
           return DAG.getConstant(0, DL, VT);
         return DAG.getNode(ISD::TRUNCATE, DL, VT,
                            DAG.getNode(ISD::SRL, DL, InnerShiftVT,
                                        N0.getOperand(0).getOperand(0),
                                        DAG.getConstant(c1 + c2, DL,
                                                        ShiftCountVT)));
       }
     }
   }
 
   // fold (srl (shl x, c), c) -> (and x, cst2)
   // cst2 is built as (srl -1, c) and left for later combining.
   if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
       isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
     SDLoc DL(N);
     SDValue Mask =
         DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
     AddToWorklist(Mask.getNode());
     return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
   }
 
   // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
   if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
     // Shifting in all undef bits?
     EVT SmallVT = N0.getOperand(0).getValueType();
     unsigned BitSize = SmallVT.getScalarSizeInBits();
     if (N1C->getZExtValue() >= BitSize)
       return DAG.getUNDEF(VT);
 
     if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
       uint64_t ShiftAmt = N1C->getZExtValue();
       SDLoc DL0(N0);
       SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
                                        N0.getOperand(0),
                           DAG.getConstant(ShiftAmt, DL0,
                                           getShiftAmountTy(SmallVT)));
       AddToWorklist(SmallShift.getNode());
       // Keep only the bits a logical shift of the wide value could produce.
       APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
       SDLoc DL(N);
       return DAG.getNode(ISD::AND, DL, VT,
                          DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
                          DAG.getConstant(Mask, DL, VT));
     }
   }
 
   // fold (srl (sra X, Y), 31) -> (srl X, 31).  This srl only looks at the sign
   // bit, which is unmodified by sra.
   if (N1C && N1C->getZExtValue() + 1 == OpSizeInBits) {
     if (N0.getOpcode() == ISD::SRA)
       return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
   }
 
   // fold (srl (ctlz x), "5") -> x  iff x has one bit set (the low bit).
   // "5" stands for log2 of the bit width. The fold relies on the shifted
   // ctlz being 1 exactly when ctlz equals the bit width (x == 0), which only
   // holds for power-of-two widths: Log2_32 rounds down, so for e.g. a 24-bit
   // type every ctlz result >= 16 would shift to 1. Require a power-of-two
   // width before matching.
   if (N1C && N0.getOpcode() == ISD::CTLZ && isPowerOf2_32(OpSizeInBits) &&
       N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
     KnownBits Known;
     DAG.computeKnownBits(N0.getOperand(0), Known);
 
     // If any of the input bits are KnownOne, then the input couldn't be all
     // zeros, thus the result of the srl will always be zero.
     if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);
 
     // If all of the bits input to the ctlz node are known to be zero, then
     // the result of the ctlz is "32" and the result of the shift is one.
     APInt UnknownBits = ~Known.Zero;
     if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);
 
     // Otherwise, check to see if there is exactly one bit input to the ctlz.
     if (UnknownBits.isPowerOf2()) {
       // Okay, we know that only the single bit specified by UnknownBits
       // could be set on input to the CTLZ node. If this bit is set, the SRL
       // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
       // to an SRL/XOR pair, which is likely to simplify more.
       unsigned ShAmt = UnknownBits.countTrailingZeros();
       SDValue Op = N0.getOperand(0);
 
       // Move the one possibly-set bit down to bit 0.
       if (ShAmt) {
         SDLoc DL(N0);
         Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                   DAG.getConstant(ShAmt, DL,
                                   getShiftAmountTy(Op.getValueType())));
         AddToWorklist(Op.getNode());
       }
 
       SDLoc DL(N);
       return DAG.getNode(ISD::XOR, DL, VT,
                          Op, DAG.getConstant(1, DL, VT));
     }
   }
 
   // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
   if (N1.getOpcode() == ISD::TRUNCATE &&
       N1.getOperand(0).getOpcode() == ISD::AND) {
     if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
       return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
   }
 
   // fold operands of srl based on knowledge that the low bits are not
   // demanded.
   if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   // Generic shift-by-constant folds; skipped for opaque constants.
   if (N1C && !N1C->isOpaque())
     if (SDValue NewSRL = visitShiftByConstant(N, N1C))
       return NewSRL;
 
   // Attempt to convert a srl of a load into a narrower zero-extending load.
   if (SDValue NarrowLoad = ReduceLoadWidth(N))
     return NarrowLoad;
 
   // Here is a common situation. We want to optimize:
   //
   //   %a = ...
   //   %b = and i32 %a, 2
   //   %c = srl i32 %b, 1
   //   brcond i32 %c ...
   //
   // into
   //
   //   %a = ...
   //   %b = and %a, 2
   //   %c = setcc eq %b, 0
   //   brcond %c ...
   //
   // However, after the source operand of SRL is optimized into AND, the SRL
   // itself may not be optimized further. Look for it and add the BRCOND into
   // the worklist.
   if (N->hasOneUse()) {
     SDNode *Use = *N->use_begin();
     if (Use->getOpcode() == ISD::BRCOND)
       AddToWorklist(Use);
     else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
       // Also look past the truncate.
       Use = *Use->use_begin();
       if (Use->getOpcode() == ISD::BRCOND)
         AddToWorklist(Use);
     }
   }
 
   return SDValue();
 }
 
 /// Combine an ABS node: fold constant operands, collapse nested abs, and drop
 /// the abs when the operand's sign bit is known to be zero. Returns an empty
 /// SDValue when none of these apply.
 SDValue DAGCombiner::visitABS(SDNode *N) {
   SDValue Src = N->getOperand(0);
   const EVT ResultVT = N->getValueType(0);
 
   // fold (abs c1) -> c2
   if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
     return DAG.getNode(ISD::ABS, SDLoc(N), ResultVT, Src);
 
   // fold (abs (abs x)) -> (abs x)
   // fold (abs x) -> x iff not-negative
   // Either way the operand itself is the result.
   if (Src.getOpcode() == ISD::ABS || DAG.SignBitIsZero(Src))
     return Src;
 
   return SDValue();
 }
 
 /// Combine a BSWAP node: fold constant operands and cancel a double swap.
 /// Returns an empty SDValue when neither applies.
 SDValue DAGCombiner::visitBSWAP(SDNode *N) {
   SDValue Src = N->getOperand(0);
   const EVT ResultVT = N->getValueType(0);
 
   // fold (bswap c1) -> c2
   if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
     return DAG.getNode(ISD::BSWAP, SDLoc(N), ResultVT, Src);
 
   // fold (bswap (bswap x)) -> x
   const bool IsDoubleSwap = Src.getOpcode() == ISD::BSWAP;
   return IsDoubleSwap ? Src.getOperand(0) : SDValue();
 }
 
 /// Combine a BITREVERSE node: fold constant operands and cancel a double
 /// reversal. Returns an empty SDValue when neither applies.
 SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
   SDValue Src = N->getOperand(0);
   const EVT ResultVT = N->getValueType(0);
 
   // fold (bitreverse c1) -> c2
   if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
     return DAG.getNode(ISD::BITREVERSE, SDLoc(N), ResultVT, Src);
 
   // fold (bitreverse (bitreverse x)) -> x
   const bool IsDoubleReverse = Src.getOpcode() == ISD::BITREVERSE;
   return IsDoubleReverse ? Src.getOperand(0) : SDValue();
 }
 
 /// Combine a CTLZ node: fold constant operands, and switch to
 /// CTLZ_ZERO_UNDEF when the operand is known to be non-zero. Returns an empty
 /// SDValue when neither applies.
 SDValue DAGCombiner::visitCTLZ(SDNode *N) {
   SDValue Src = N->getOperand(0);
   const EVT ResultVT = N->getValueType(0);
 
   // fold (ctlz c1) -> c2
   if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
     return DAG.getNode(ISD::CTLZ, SDLoc(N), ResultVT, Src);
 
   // If the value is known never to be zero, switch to the undef version.
   // Once operations are being legalized, the undef form must itself be legal.
   if ((!LegalOperations ||
        TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, ResultVT)) &&
       DAG.isKnownNeverZero(Src))
     return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), ResultVT, Src);
 
   return SDValue();
 }
 
 /// Combine a CTLZ_ZERO_UNDEF node; only constant operands are folded.
 SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
   SDValue Src = N->getOperand(0);
   const EVT ResultVT = N->getValueType(0);
 
   // Nothing to do unless the operand is a constant.
   if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
     return SDValue();
 
   // fold (ctlz_zero_undef c1) -> c2
   return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), ResultVT, Src);
 }
 
 /// Combine a CTTZ node: fold constant operands, and switch to
 /// CTTZ_ZERO_UNDEF when the operand is known to be non-zero. Returns an empty
 /// SDValue when neither applies.
 SDValue DAGCombiner::visitCTTZ(SDNode *N) {
   SDValue Src = N->getOperand(0);
   const EVT ResultVT = N->getValueType(0);
 
   // fold (cttz c1) -> c2
   if (DAG.isConstantIntBuildVectorOrConstantInt(Src))
     return DAG.getNode(ISD::CTTZ, SDLoc(N), ResultVT, Src);
 
   // If the value is known never to be zero, switch to the undef version.
   // Once operations are being legalized, the undef form must itself be legal.
   if ((!LegalOperations ||
        TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, ResultVT)) &&
       DAG.isKnownNeverZero(Src))
     return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), ResultVT, Src);
 
   return SDValue();
 }
 
 /// Combine a CTTZ_ZERO_UNDEF node; only constant operands are folded.
 SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
   SDValue Src = N->getOperand(0);
   const EVT ResultVT = N->getValueType(0);
 
   // Nothing to do unless the operand is a constant.
   if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
     return SDValue();
 
   // fold (cttz_zero_undef c1) -> c2
   return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), ResultVT, Src);
 }
 
 /// Combine a CTPOP node; only constant operands are folded.
 SDValue DAGCombiner::visitCTPOP(SDNode *N) {
   SDValue Src = N->getOperand(0);
   const EVT ResultVT = N->getValueType(0);
 
   // Nothing to do unless the operand is a constant.
   if (!DAG.isConstantIntBuildVectorOrConstantInt(Src))
     return SDValue();
 
   // fold (ctpop c1) -> c2
   return DAG.getNode(ISD::CTPOP, SDLoc(N), ResultVT, Src);
 }
 
 /// Generate an FMINNUM/FMAXNUM node from a compare-and-pick pattern.
 ///
 /// \p LHS and \p RHS are the compared values and \p CC the comparison;
 /// \p True and \p False are presumably the select's two arms (confirm against
 /// the caller). Returns an empty SDValue when the arms are not exactly
 /// LHS and RHS (in either order), when \p CC is not one of the ordering
 /// predicates handled below, or when the chosen opcode is not legal for \p VT.
 static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
                                    SDValue RHS, SDValue True, SDValue False,
                                    ISD::CondCode CC, const TargetLowering &TLI,
                                    SelectionDAG &DAG) {
   const bool ArmsAreOperands =
       (LHS == True && RHS == False) || (LHS == False && RHS == True);
   if (!ArmsAreOperands)
     return SDValue();
 
   // Classify the predicate as a less-than or a greater-than flavour.
   bool IsLessThan = false;
   switch (CC) {
   case ISD::SETOLT:
   case ISD::SETOLE:
   case ISD::SETLT:
   case ISD::SETLE:
   case ISD::SETULT:
   case ISD::SETULE:
     IsLessThan = true;
     break;
   case ISD::SETOGT:
   case ISD::SETOGE:
   case ISD::SETGT:
   case ISD::SETGE:
   case ISD::SETUGT:
   case ISD::SETUGE:
     IsLessThan = false;
     break;
   default:
     return SDValue();
   }
 
   // A less-than compare that picks LHS on true yields the minimum, as does a
   // greater-than compare that picks RHS on true; the other two combinations
   // yield the maximum.
   const bool PicksMinimum = IsLessThan == (LHS == True);
   const unsigned Opcode = PicksMinimum ? ISD::FMINNUM : ISD::FMAXNUM;
   if (!TLI.isOperationLegal(Opcode, VT))
     return SDValue();
   return DAG.getNode(Opcode, DL, VT, LHS, RHS);
 }
 
 /// Fold a select whose two value operands are scalar integer constants into
 /// logic, an extension, or an add on the condition.
 ///
 /// Returns an empty SDValue when the result type is not an integer type, when
 /// either value operand is not a ConstantSDNode, or when no pattern matches.
 SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
   SDValue Cond = N->getOperand(0);
   SDValue TrueVal = N->getOperand(1);
   SDValue FalseVal = N->getOperand(2);
   EVT VT = N->getValueType(0);
   EVT CondVT = Cond.getValueType();
   SDLoc DL(N);
 
   if (!VT.isInteger())
     return SDValue();
 
   auto *TrueC = dyn_cast<ConstantSDNode>(TrueVal);
   auto *FalseC = dyn_cast<ConstantSDNode>(FalseVal);
   if (!(TrueC && FalseC))
     return SDValue();
 
   // Only do this before legalization to avoid conflicting with target-specific
   // transforms in the other direction (create a select from a zext/sext). There
   // is also a target-independent combine here in DAGCombiner in the other
   // direction for (select Cond, -1, 0) when the condition is not i1.
   if (CondVT == MVT::i1 && !LegalOperations) {
     // Widen an i1 value to VT with the given extension opcode; when VT is
     // already i1 the value is returned untouched.
     auto ExtendToVT = [&](unsigned ExtOpcode, SDValue V) -> SDValue {
       if (VT == MVT::i1)
         return V;
       return DAG.getNode(ExtOpcode, DL, VT, V);
     };
 
     if (TrueC->isNullValue()) {
       // select Cond, 0, 1 --> zext (!Cond)
       if (FalseC->isOne())
         return ExtendToVT(ISD::ZERO_EXTEND, DAG.getNOT(DL, Cond, MVT::i1));
       // select Cond, 0, -1 --> sext (!Cond)
       if (FalseC->isAllOnesValue())
         return ExtendToVT(ISD::SIGN_EXTEND, DAG.getNOT(DL, Cond, MVT::i1));
     }
     if (FalseC->isNullValue()) {
       // select Cond, 1, 0 --> zext (Cond)
       if (TrueC->isOne())
         return ExtendToVT(ISD::ZERO_EXTEND, Cond);
       // select Cond, -1, 0 --> sext (Cond)
       if (TrueC->isAllOnesValue())
         return ExtendToVT(ISD::SIGN_EXTEND, Cond);
     }
 
     // For any constants that differ by 1, we can transform the select into an
     // extend and add. Use a target hook because some targets may prefer to
     // transform in the other direction.
     if (TLI.convertSelectOfConstantsToMath(VT)) {
       const APInt &TrueBits = TrueC->getAPIntValue();
       const APInt &FalseBits = FalseC->getAPIntValue();
       // select Cond, C1, C1-1 --> add (zext Cond), C1-1
       if (TrueBits - 1 == FalseBits)
         return DAG.getNode(ISD::ADD, DL, VT,
                            ExtendToVT(ISD::ZERO_EXTEND, Cond), FalseVal);
       // select Cond, C1, C1+1 --> add (sext Cond), C1+1
       if (TrueBits + 1 == FalseBits)
         return DAG.getNode(ISD::ADD, DL, VT,
                            ExtendToVT(ISD::SIGN_EXTEND, Cond), FalseVal);
     }
 
     return SDValue();
   }
 
   // fold (select Cond, 0, 1) -> (xor Cond, 1)
   // We can't do this reliably if integer based booleans have different contents
   // to floating point based booleans. This is because we can't tell whether we
   // have an integer-based boolean or a floating-point-based boolean unless we
   // can find the SETCC that produced it and inspect its operands. This is
   // fairly easy if C is the SETCC node, but it can potentially be
   // undiscoverable (or not reasonably discoverable). For example, it could be
   // in another basic block or it could require searching a complicated
   // expression.
   const bool CanFlipWithXor =
       CondVT.isInteger() &&
       TLI.getBooleanContents(/*isVec*/false, /*isFloat*/true) ==
           TargetLowering::ZeroOrOneBooleanContent &&
       TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) ==
           TargetLowering::ZeroOrOneBooleanContent &&
       TrueC->isNullValue() && FalseC->isOne();
   if (!CanFlipWithXor)
     return SDValue();
 
   SDValue One = DAG.getConstant(1, DL, CondVT);
   SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond, One);
   // Resize only when the condition and result widths differ.
   return VT.bitsEq(CondVT) ? NotCond : DAG.getZExtOrTrunc(NotCond, DL, VT);
 }
 
 SDValue DAGCombiner::visitSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue N2 = N->getOperand(2);
   EVT VT = N->getValueType(0);
   EVT VT0 = N0.getValueType();
   SDLoc DL(N);
 
   // fold (select C, X, X) -> X
   if (N1 == N2)
     return N1;
 
   if (const ConstantSDNode *N0C = dyn_cast<const ConstantSDNode>(N0)) {
     // fold (select true, X, Y) -> X
     // fold (select false, X, Y) -> Y
     return !N0C->isNullValue() ? N1 : N2;
   }
 
   // fold (select X, X, Y) -> (or X, Y)
   // fold (select X, 1, Y) -> (or C, Y)
   if (VT == VT0 && VT == MVT::i1 && (N0 == N1 || isOneConstant(N1)))
     return DAG.getNode(ISD::OR, DL, VT, N0, N2);
 
   if (SDValue V = foldSelectOfConstants(N))
     return V;
 
   // fold (select C, 0, X) -> (and (not C), X)
   if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) {
     SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
     AddToWorklist(NOTNode.getNode());
     return DAG.getNode(ISD::AND, DL, VT, NOTNode, N2);
   }
   // fold (select C, X, 1) -> (or (not C), X)
   if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) {
     SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
     AddToWorklist(NOTNode.getNode());
     return DAG.getNode(ISD::OR, DL, VT, NOTNode, N1);
   }
   // fold (select X, Y, X) -> (and X, Y)
   // fold (select X, Y, 0) -> (and X, Y)
   if (VT == VT0 && VT == MVT::i1 && (N0 == N2 || isNullConstant(N2)))
     return DAG.getNode(ISD::AND, DL, VT, N0, N1);
 
   // If we can fold this based on the true/false value, do so.
   if (SimplifySelectOps(N, N1, N2))
     return SDValue(N, 0); // Don't revisit N.
 
   if (VT0 == MVT::i1) {
     // The code in this block deals with the following 2 equivalences:
     //    select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
     //    select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
     // The target can specify its preferred form with the
     // shouldNormalizeToSelectSequence() callback. However we always transform
     // to the right anyway if we find the inner select exists in the DAG anyway
     // and we always transform to the left side if we know that we can further
     // optimize the combination of the conditions.
     bool normalizeToSequence =
         TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
     // select (and Cond0, Cond1), X, Y
     //   -> select Cond0, (select Cond1, X, Y), Y
     if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
       SDValue Cond0 = N0->getOperand(0);
       SDValue Cond1 = N0->getOperand(1);
       SDValue InnerSelect =
           DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2);
       if (normalizeToSequence || !InnerSelect.use_empty())
         return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0,
                            InnerSelect, N2);
     }
     // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
     if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
       SDValue Cond0 = N0->getOperand(0);
       SDValue Cond1 = N0->getOperand(1);
       SDValue InnerSelect =
           DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2);
       if (normalizeToSequence || !InnerSelect.use_empty())
         return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1,
                            InnerSelect);
     }
 
     // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
     if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
       SDValue N1_0 = N1->getOperand(0);
       SDValue N1_1 = N1->getOperand(1);
       SDValue N1_2 = N1->getOperand(2);
       if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
         // Create the actual and node if we can generate good code for it.
         if (!normalizeToSequence) {
           SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
           return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1, N2);
         }
         // Otherwise see if we can optimize the "and" to a better pattern.
         if (SDValue Combined = visitANDLike(N0, N1_0, N))
           return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1,
                              N2);
       }
     }
     // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
     if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
       SDValue N2_0 = N2->getOperand(0);
       SDValue N2_1 = N2->getOperand(1);
       SDValue N2_2 = N2->getOperand(2);
       if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
         // Create the actual or node if we can generate good code for it.
         if (!normalizeToSequence) {
           SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
           return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1, N2_2);
         }
         // Otherwise see if we can optimize to a better pattern.
         if (SDValue Combined = visitORLike(N0, N2_0, N))
           return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1,
                              N2_2);
       }
     }
   }
 
   if (VT0 == MVT::i1) {
     // select (not Cond), N1, N2 -> select Cond, N2, N1
     if (isBitwiseNot(N0))
       return DAG.getNode(ISD::SELECT, DL, VT, N0->getOperand(0), N2, N1);
   }
 
   // fold selects based on a setcc into other things, such as min/max/abs
   if (N0.getOpcode() == ISD::SETCC) {
     // select x, y (fcmp lt x, y) -> fminnum x, y
     // select x, y (fcmp gt x, y) -> fmaxnum x, y
     //
     // This is OK if we don't care about what happens if either operand is a
     // NaN.
     //
 
     // FIXME: Instead of testing for UnsafeFPMath, this should be checking for
     // no signed zeros as well as no nans.
     const TargetOptions &Options = DAG.getTarget().Options;
     if (Options.UnsafeFPMath && VT.isFloatingPoint() && N0.hasOneUse() &&
         DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) {
       ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
 
       if (SDValue FMinMax = combineMinNumMaxNum(
               DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG))
         return FMinMax;
     }
 
     if ((!LegalOperations &&
          TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) ||
         TLI.isOperationLegal(ISD::SELECT_CC, VT))
       return DAG.getNode(ISD::SELECT_CC, DL, VT, N0.getOperand(0),
                          N0.getOperand(1), N1, N2, N0.getOperand(2));
     return SimplifySelect(DL, N0, N1, N2);
   }
 
   return SDValue();
 }
 
 /// Split the three-operand vector node \p N (used here for SETCC masks) into
 /// a low and a high half. Operands 0 and 1 are split with
 /// SplitVectorOperand; operand 2 is forwarded unchanged to both halves. The
 /// halves are built with N's opcode and the types returned by
 /// GetSplitDestVTs for N's result type.
 ///
 /// \returns the pair (low half, high half).
 static std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N,
                                                SelectionDAG &DAG) {
   SDLoc DL(N);
 
   // Result types for the two halves.
   std::pair<EVT, EVT> HalfVTs = DAG.GetSplitDestVTs(N->getValueType(0));
 
   // Split both compared inputs.
   std::pair<SDValue, SDValue> LHSHalves = DAG.SplitVectorOperand(N, 0);
   std::pair<SDValue, SDValue> RHSHalves = DAG.SplitVectorOperand(N, 1);
 
   SDValue LoHalf = DAG.getNode(N->getOpcode(), DL, HalfVTs.first,
                                LHSHalves.first, RHSHalves.first,
                                N->getOperand(2));
   SDValue HiHalf = DAG.getNode(N->getOpcode(), DL, HalfVTs.second,
                                LHSHalves.second, RHSHalves.second,
                                N->getOperand(2));
 
   return std::make_pair(LoHalf, HiHalf);
 }
 
 // Turn a vselect whose value operands are both binary CONCAT_VECTORS and
 // whose condition is a BUILD_VECTOR of ConstantSDNodes (or undefs) into a
 // single CONCAT_VECTORS that picks each half from one of the two inputs.
 //
 // The caller must guarantee the operand opcodes (this is asserted) and that
 // neither half of the condition consists solely of undefs.
 // Returns an empty SDValue when the transform does not apply.
 static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
   SDValue Cond = N->getOperand(0);
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
   EVT VT = N->getValueType(0);
   int NumElems = VT.getVectorNumElements();
   assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
          RHS.getOpcode() == ISD::CONCAT_VECTORS &&
          Cond.getOpcode() == ISD::BUILD_VECTOR);
 
   // CONCAT_VECTORS may have any number of operands; only the two-operand
   // form is handled.
   if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2)
     return SDValue();
 
   // Scan condition elements [Begin, End), skipping undefs. On success,
   // Uniform is the single constant node shared by every non-undef element
   // (or null if all were undef). Returns false as soon as two different
   // constant nodes are seen.
   auto FindUniformSelector = [&Cond](int Begin, int End,
                                      ConstantSDNode *&Uniform) {
     Uniform = nullptr;
     for (int Idx = Begin; Idx < End; ++Idx) {
       SDValue Elt = Cond->getOperand(Idx);
       if (Elt->isUndef())
         continue;
 
       if (!Uniform)
         Uniform = cast<ConstantSDNode>(Elt);
       else if (Elt.getNode() != Uniform)
         return false;
     }
     return true;
   };
 
   // The element count is even because the vselect operands are two-operand
   // concat_vectors, so the condition divides cleanly into two halves.
   const int Half = NumElems / 2;
 
   ConstantSDNode *BottomHalf = nullptr;
   if (!FindUniformSelector(0, Half, BottomHalf))
     return SDValue();
 
   ConstantSDNode *TopHalf = nullptr;
   if (!FindUniformSelector(Half, NumElems, TopHalf))
     return SDValue();
 
   assert(TopHalf && BottomHalf &&
          "One half of the selector was all UNDEFs and the other was all the "
          "same value. This should have been addressed before this function.");
 
   // A zero selector picks the RHS (false) half, anything else the LHS half.
   SDValue LoPart =
       BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0);
   SDValue HiPart =
       TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1);
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoPart, HiPart);
 }
 
 /// Combine a masked scatter whose mask comes from a SETCC and whose data
 /// type must be split: split the mask, data and index up front and emit two
 /// scatters joined by a TokenFactor.
 ///
 /// \returns the TokenFactor of the two new scatters, or an empty SDValue if
 /// the combine does not apply.
 SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
   // Nothing to do once the combiner level is AfterLegalizeTypes or later.
   if (Level >= AfterLegalizeTypes)
     return SDValue();
 
   MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
   SDValue Mask = MSC->getMask();
   SDValue Data  = MSC->getValue();
   SDLoc DL(N);
 
   // If the MSCATTER data type requires splitting and the mask is provided by a
   // SETCC, then split both nodes and its operands before legalization. This
   // prevents the type legalizer from unrolling SETCC into scalar comparisons
   // and enables future optimizations (e.g. min/max pattern matching on X86).
   if (Mask.getOpcode() != ISD::SETCC)
     return SDValue();
 
   // Check if any splitting is required.
   if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
       TargetLowering::TypeSplitVector)
     return SDValue();
   SDValue MaskLo, MaskHi, Lo, Hi;
   std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
 
   SDValue Chain = MSC->getChain();
 
   EVT MemoryVT = MSC->getMemoryVT();
   unsigned Alignment = MSC->getOriginalAlignment();
 
   EVT LoMemVT, HiMemVT;
   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
 
   SDValue DataLo, DataHi;
   std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
 
   SDValue Scale = MSC->getScale();
   SDValue BasePtr = MSC->getBasePtr();
   SDValue IndexLo, IndexHi;
   std::tie(IndexLo, IndexHi) = DAG.SplitVector(MSC->getIndex(), DL);
 
   // NOTE(review): a single memory operand, sized for the low half, is shared
   // by both new scatters - confirm this is intended.
   MachineMemOperand *MMO = DAG.getMachineFunction().
     getMachineMemOperand(MSC->getPointerInfo(),
                           MachineMemOperand::MOStore,  LoMemVT.getStoreSize(),
                           Alignment, MSC->getAAInfo(), MSC->getRanges());
 
   SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo, Scale };
   Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
                             DL, OpsLo, MMO);
 
   SDValue OpsHi[] = { Chain, DataHi, MaskHi, BasePtr, IndexHi, Scale };
   Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
                             DL, OpsHi, MMO);
 
   AddToWorklist(Lo.getNode());
   AddToWorklist(Hi.getNode());
 
   // Both halves hang off the original chain; merge their output chains.
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
 }
 
 /// Combine a masked store whose mask comes from a SETCC and whose data type
 /// must be split: split the mask and data up front and emit two masked
 /// stores joined by a TokenFactor.
 ///
 /// \returns the TokenFactor of the two new stores, or an empty SDValue if
 /// the combine does not apply.
 SDValue DAGCombiner::visitMSTORE(SDNode *N) {
   // Nothing to do once the combiner level is AfterLegalizeTypes or later.
   if (Level >= AfterLegalizeTypes)
     return SDValue();
 
   // The node is dereferenced unconditionally below, so use the asserting
   // cast<> instead of a dyn_cast<> whose null result would go unchecked.
   MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
   SDValue Mask = MST->getMask();
   SDValue Data  = MST->getValue();
   EVT VT = Data.getValueType();
   SDLoc DL(N);
 
   // If the MSTORE data type requires splitting and the mask is provided by a
   // SETCC, then split both nodes and its operands before legalization. This
   // prevents the type legalizer from unrolling SETCC into scalar comparisons
   // and enables future optimizations (e.g. min/max pattern matching on X86).
   if (Mask.getOpcode() != ISD::SETCC)
     return SDValue();
 
   // Check if any splitting is required.
   if (TLI.getTypeAction(*DAG.getContext(), VT) !=
       TargetLowering::TypeSplitVector)
     return SDValue();
 
   SDValue MaskLo, MaskHi, Lo, Hi;
   std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
 
   SDValue Chain = MST->getChain();
   SDValue Ptr   = MST->getBasePtr();
 
   EVT MemoryVT = MST->getMemoryVT();
   unsigned Alignment = MST->getOriginalAlignment();
 
   // If Alignment is equal to the vector size, take half of it for the
   // second part.
   unsigned SecondHalfAlignment =
     (Alignment == VT.getSizeInBits() / 8) ? Alignment / 2 : Alignment;
 
   EVT LoMemVT, HiMemVT;
   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
 
   SDValue DataLo, DataHi;
   std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
 
   // Low half: stored at the original address with the original alignment.
   MachineMemOperand *MMO = DAG.getMachineFunction().
     getMachineMemOperand(MST->getPointerInfo(),
                          MachineMemOperand::MOStore,  LoMemVT.getStoreSize(),
                          Alignment, MST->getAAInfo(), MST->getRanges());
 
   Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
                           MST->isTruncatingStore(),
                           MST->isCompressingStore());
 
   // High half: advance the pointer past the low half and describe the
   // access at an offset of the low half's store size.
   Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
                                    MST->isCompressingStore());
   unsigned HiOffset = LoMemVT.getStoreSize();
 
   MMO = DAG.getMachineFunction().getMachineMemOperand(
       MST->getPointerInfo().getWithOffset(HiOffset),
       MachineMemOperand::MOStore, HiMemVT.getStoreSize(), SecondHalfAlignment,
       MST->getAAInfo(), MST->getRanges());
 
   Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
                           MST->isTruncatingStore(),
                           MST->isCompressingStore());
 
   AddToWorklist(Lo.getNode());
   AddToWorklist(Hi.getNode());
 
   // Both halves hang off the original chain; merge their output chains.
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
 }
 
 /// Combine a masked gather whose mask comes from a SETCC and whose result
 /// type must be split: split the mask, the getValue() operand and the index
 /// up front, emit two gathers, and concatenate their results.
 ///
 /// \returns merged values (concatenated result, new chain), or an empty
 /// SDValue if the combine does not apply.
 SDValue DAGCombiner::visitMGATHER(SDNode *N) {
   if (Level >= AfterLegalizeTypes)
     return SDValue();
 
   MaskedGatherSDNode *Gather = cast<MaskedGatherSDNode>(N);
   SDValue Mask = Gather->getMask();
   SDLoc DL(N);
 
   // Splitting early keeps the type legalizer from unrolling the SETCC into
   // scalar comparisons and enables later optimizations (e.g. min/max pattern
   // matching on X86). Only SETCC-produced masks are handled.
   if (Mask.getOpcode() != ISD::SETCC)
     return SDValue();
 
   // Only act when the result type actually has to be split.
   EVT ResVT = N->getValueType(0);
   if (TLI.getTypeAction(*DAG.getContext(), ResVT) !=
       TargetLowering::TypeSplitVector)
     return SDValue();
 
   std::pair<SDValue, SDValue> MaskHalves = SplitVSETCC(Mask.getNode(), DAG);
   std::pair<SDValue, SDValue> Src0Halves =
       DAG.SplitVector(Gather->getValue(), DL);
 
   std::pair<EVT, EVT> ResHalfVTs = DAG.GetSplitDestVTs(ResVT);
   std::pair<EVT, EVT> MemHalfVTs =
       DAG.GetSplitDestVTs(Gather->getMemoryVT());
 
   SDValue InChain = Gather->getChain();
   SDValue Scale = Gather->getScale();
   SDValue BasePtr = Gather->getBasePtr();
   std::pair<SDValue, SDValue> IndexHalves =
       DAG.SplitVector(Gather->getIndex(), DL);
 
   // One memory operand, sized for the low half, is used for both gathers.
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       Gather->getPointerInfo(), MachineMemOperand::MOLoad,
       MemHalfVTs.first.getStoreSize(), Gather->getOriginalAlignment(),
       Gather->getAAInfo(), Gather->getRanges());
 
   SDValue LoOps[] = { InChain,          Src0Halves.first, MaskHalves.first,
                       BasePtr,          IndexHalves.first, Scale };
   SDValue LoGather =
       DAG.getMaskedGather(DAG.getVTList(ResHalfVTs.first, MVT::Other),
                           ResHalfVTs.first, DL, LoOps, MMO);
 
   SDValue HiOps[] = { InChain,          Src0Halves.second, MaskHalves.second,
                       BasePtr,          IndexHalves.second, Scale };
   SDValue HiGather =
       DAG.getMaskedGather(DAG.getVTList(ResHalfVTs.second, MVT::Other),
                           ResHalfVTs.second, DL, HiOps, MMO);
 
   AddToWorklist(LoGather.getNode());
   AddToWorklist(HiGather.getNode());
 
   // The two loads are independent of each other: join their chains with a
   // token factor.
   SDValue OutChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                  LoGather.getValue(1), HiGather.getValue(1));
 
   // Redirect every user of the old chain result to the new chain.
   DAG.ReplaceAllUsesOfValueWith(SDValue(Gather, 1), OutChain);
 
   SDValue Joined =
       DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, LoGather, HiGather);
 
   SDValue Results[] = { Joined, OutChain };
   return DAG.getMergeValues(Results, DL);
 }
 
 /// Combine a masked load whose mask comes from a SETCC and whose result type
 /// must be split: split the mask and the getSrc0() operand up front, emit
 /// two masked loads, and concatenate their results.
 ///
 /// \returns merged values (concatenated result, new chain), or an empty
 /// SDValue if the combine does not apply.
 SDValue DAGCombiner::visitMLOAD(SDNode *N) {
   // Nothing to do once the combiner level is AfterLegalizeTypes or later.
   if (Level >= AfterLegalizeTypes)
     return SDValue();
 
   // The node is dereferenced unconditionally below, so use the asserting
   // cast<> instead of a dyn_cast<> whose null result would go unchecked.
   MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
   SDValue Mask = MLD->getMask();
   SDLoc DL(N);
 
   // If the MLOAD result requires splitting and the mask is provided by a
   // SETCC, then split both nodes and its operands before legalization. This
   // prevents the type legalizer from unrolling SETCC into scalar comparisons
   // and enables future optimizations (e.g. min/max pattern matching on X86).
   if (Mask.getOpcode() != ISD::SETCC)
     return SDValue();
 
   EVT VT = N->getValueType(0);
 
   // Check if any splitting is required.
   if (TLI.getTypeAction(*DAG.getContext(), VT) !=
       TargetLowering::TypeSplitVector)
     return SDValue();
 
   SDValue MaskLo, MaskHi, Lo, Hi;
   std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
 
   SDValue Src0 = MLD->getSrc0();
   SDValue Src0Lo, Src0Hi;
   std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL);
 
   EVT LoVT, HiVT;
   std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
 
   SDValue Chain = MLD->getChain();
   SDValue Ptr   = MLD->getBasePtr();
   EVT MemoryVT = MLD->getMemoryVT();
   unsigned Alignment = MLD->getOriginalAlignment();
 
   // If Alignment is equal to the vector size, take half of it for the
   // second part.
   unsigned SecondHalfAlignment =
     (Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
        Alignment/2 : Alignment;
 
   EVT LoMemVT, HiMemVT;
   std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
 
   // Low half: loaded from the original address with the original alignment.
   MachineMemOperand *MMO = DAG.getMachineFunction().
   getMachineMemOperand(MLD->getPointerInfo(),
                        MachineMemOperand::MOLoad,  LoMemVT.getStoreSize(),
                        Alignment, MLD->getAAInfo(), MLD->getRanges());
 
   Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
                          ISD::NON_EXTLOAD, MLD->isExpandingLoad());
 
   // High half: advance the pointer past the low half and describe the
   // access at an offset of the low half's store size.
   Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
                                    MLD->isExpandingLoad());
   unsigned HiOffset = LoMemVT.getStoreSize();
 
   MMO = DAG.getMachineFunction().getMachineMemOperand(
       MLD->getPointerInfo().getWithOffset(HiOffset),
       MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), SecondHalfAlignment,
       MLD->getAAInfo(), MLD->getRanges());
 
   Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
                          ISD::NON_EXTLOAD, MLD->isExpandingLoad());
 
   AddToWorklist(Lo.getNode());
   AddToWorklist(Hi.getNode());
 
   // Build a factor node to remember that this load is independent of the
   // other one.
   Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
                       Hi.getValue(1));
 
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
   DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);
 
   SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
 
   SDValue RetOps[] = { LoadRes, Chain };
   return DAG.getMergeValues(RetOps, DL);
 }
 
 /// Try to replace a vector select between two constant build vectors with
 /// arithmetic on the extended condition, avoiding a variable select and
 /// possibly a constant load.
 ///
 /// Applies only when the condition has a single use and 1-bit elements, the
 /// target opts in via convertSelectOfConstantsToMath, and both value
 /// operands are build vectors of constants.
 ///
 /// \returns the replacement node, or an empty SDValue if no fold applies.
 SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
   SDValue Cond = N->getOperand(0);
   SDValue TrueVec = N->getOperand(1);
   SDValue FalseVec = N->getOperand(2);
   EVT VT = N->getValueType(0);
 
   if (!Cond.hasOneUse())
     return SDValue();
   if (Cond.getScalarValueSizeInBits() != 1)
     return SDValue();
   if (!TLI.convertSelectOfConstantsToMath(VT))
     return SDValue();
   if (!ISD::isBuildVectorOfConstantSDNodes(TrueVec.getNode()))
     return SDValue();
   if (!ISD::isBuildVectorOfConstantSDNodes(FalseVec.getNode()))
     return SDValue();
 
   // Determine whether every defined lane of the true vector is exactly one
   // more (or exactly one less) than the matching lane of the false vector.
   // In that case the select is an add of the extended condition, which
   // drops one constant load/materialization.
   bool IsIncrement = true;
   bool IsDecrement = true;
   for (unsigned Lane = 0, NumLanes = VT.getVectorNumElements();
        Lane != NumLanes; ++Lane) {
     SDValue TrueElt = TrueVec.getOperand(Lane);
     SDValue FalseElt = FalseVec.getOperand(Lane);
     // Undef lanes place no constraint on the pattern.
     if (TrueElt.isUndef() || FalseElt.isUndef())
       continue;
 
     const APInt &TrueC = cast<ConstantSDNode>(TrueElt)->getAPIntValue();
     const APInt &FalseC = cast<ConstantSDNode>(FalseElt)->getAPIntValue();
     IsIncrement &= (TrueC == FalseC + 1);
     IsDecrement &= (TrueC == FalseC - 1);
   }
 
   // The general select-of-constants form,
   //   vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
   // only pays off if a vselect is slower than two logic ops, so it is left
   // to a machine-specific pass.
   if (!IsIncrement && !IsDecrement)
     return SDValue();
 
   // Further simplifications for the extra-special cases where the constants
   // are all 0 or all -1 should be implemented as folds of these patterns.
   //   vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
   //   vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
   SDLoc DL(N);
   auto ExtOpc = IsIncrement ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
   SDValue WideCond = DAG.getNode(ExtOpc, DL, VT, Cond);
   return DAG.getNode(ISD::ADD, DL, VT, WideCond, FalseVec);
 }
 
 /// Combine a VSELECT node. Operand 0 is the condition, operands 1 and 2 are
 /// the values selected when the condition is true and false respectively.
 ///
 /// The folds are tried in order:
 ///  - identical true/false operands,
 ///  - integer-abs patterns guarded by a SETCC against zero / all-ones,
 ///  - widening a narrow SETCC-of-load against zero to the select width,
 ///  - SimplifySelectOps,
 ///  - all-ones / all-zeros constant conditions,
 ///  - selects between CONCAT_VECTORS with a constant condition,
 ///  - selects between constant vectors (foldVSelectOfConstants).
 ///
 /// \returns the replacement value, SDValue(N, 0) if N was updated in place
 /// by SimplifySelectOps, or an empty SDValue if nothing changed.
 SDValue DAGCombiner::visitVSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue N2 = N->getOperand(2);
   SDLoc DL(N);
 
   // fold (vselect C, X, X) -> X
   if (N1 == N2)
     return N1;
 
   // Canonicalize integer abs.
   // vselect (setg[te] X,  0),  X, -X ->
   // vselect (setgt    X, -1),  X, -X ->
   // vselect (setl[te] X,  0), -X,  X ->
   // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
   if (N0.getOpcode() == ISD::SETCC) {
     SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
     ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
     bool isAbs = false;
     bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
 
     // "X >= 0 ? X : 0 - X" form: the true operand is the compared value and
     // the false operand is a subtraction of it; isAbs is set only if that
     // subtraction is from an all-zeros vector.
     if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
          (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) &&
         N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1))
       isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode());
     // "X <= 0 ? 0 - X : X" form: the same pattern with the select operands
     // swapped.
     else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) &&
              N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1))
       isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
 
     if (isAbs) {
       EVT VT = LHS.getValueType();
       // Prefer the dedicated ABS node when the target handles it.
       if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
         return DAG.getNode(ISD::ABS, DL, VT, LHS);
 
       // Otherwise expand to: Y = sra(X, bits-1); xor(add(X, Y), Y).
       SDValue Shift = DAG.getNode(
           ISD::SRA, DL, VT, LHS,
           DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
       SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
       AddToWorklist(Shift.getNode());
       AddToWorklist(Add.getNode());
       return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
     }
 
     // If this select has a condition (setcc) with narrower operands than the
     // select, try to widen the compare to match the select width.
     // TODO: This should be extended to handle any constant.
     // TODO: This could be extended to handle non-loading patterns, but that
     //       requires thorough testing to avoid regressions.
     if (isNullConstantOrNullSplatConstant(RHS)) {
       EVT NarrowVT = LHS.getValueType();
       EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
       EVT SetCCVT = getSetCCResultType(LHS.getValueType());
       unsigned SetCCWidth = SetCCVT.getScalarSizeInBits();
       unsigned WideWidth = WideVT.getScalarSizeInBits();
       // The signedness of the compare picks the matching extension kind.
       bool IsSigned = isSignedIntSetCC(CC);
       auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
       if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() &&
           SetCCWidth != 1 && SetCCWidth < WideWidth &&
           TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) &&
           TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) {
         // Both compare operands can be widened for free. The LHS can use an
         // extended load, and the RHS is a constant:
         //   vselect (ext (setcc load(X), C)), N1, N2 -->
         //   vselect (setcc extload(X), C'), N1, N2
         auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
         SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS);
         SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS);
         EVT WideSetCCVT = getSetCCResultType(WideVT);
         SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC);
         return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
       }
     }
   }
 
   // If we can fold this based on the true/false value, do so.
   if (SimplifySelectOps(N, N1, N2))
     return SDValue(N, 0);  // Don't revisit N.
 
   // Fold (vselect (build_vector all_ones), N1, N2) -> N1
   if (ISD::isBuildVectorAllOnes(N0.getNode()))
     return N1;
   // Fold (vselect (build_vector all_zeros), N1, N2) -> N2
   if (ISD::isBuildVectorAllZeros(N0.getNode()))
     return N2;
 
   // The ConvertSelectToConcatVector function is assuming both the above
   // checks for (vselect (build_vector all{ones,zeros}) ...) have been made
   // and addressed.
   if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
       N2.getOpcode() == ISD::CONCAT_VECTORS &&
       ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
     if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
       return CV;
   }
 
   // Finally, try turning a select between constant vectors into math.
   if (SDValue V = foldVSelectOfConstants(N))
     return V;
 
   return SDValue();
 }
 
 /// Combine a SELECT_CC node. Operands 0 and 1 are the compared values,
 /// operands 2 and 3 the true and false results, and operand 4 the condition
 /// code.
 ///
 /// \returns the replacement value, SDValue(N, 0) if N was updated in place
 /// by SimplifySelectOps, or whatever SimplifySelectCC produces.
 SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue N2 = N->getOperand(2);
   SDValue N3 = N->getOperand(3);
   SDValue N4 = N->getOperand(4);
   ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();
   SDLoc DL(N);
 
   // fold select_cc lhs, rhs, x, x, cc -> x
   if (N2 == N3)
     return N2;
 
   // See whether the comparison itself simplifies.
   EVT SetCCVT = getSetCCResultType(N0.getValueType());
   if (SDValue SCC = SimplifySetCC(SetCCVT, N0, N1, CC, DL, false)) {
     AddToWorklist(SCC.getNode());
 
     // Constant condition: non-zero selects the true value, zero the false
     // value.
     if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode()))
       return SCCC->isNullValue() ? N3 : N2;
 
     // When the condition is UNDEF, just return the first operand. This is
     // coherent with the DAG creation: no setcc node is created in this case.
     if (SCC->isUndef())
       return N2;
 
     // The comparison became a simpler setcc: rebuild the select_cc on it.
     if (SCC.getOpcode() == ISD::SETCC)
       return DAG.getNode(ISD::SELECT_CC, DL, N2.getValueType(),
                          SCC.getOperand(0), SCC.getOperand(1), N2, N3,
                          SCC.getOperand(2));
   }
 
   // If we can fold this based on the true/false value, do so.
   if (SimplifySelectOps(N, N2, N3))
     return SDValue(N, 0);  // Don't revisit N.
 
   // fold select_cc into other things, such as min/max/abs
   return SimplifySelectCC(DL, N0, N1, N2, N3, CC);
 }
 
 /// Combine a SETCC node via SimplifySetCC. When the node's only user is a
 /// BRCOND, the result is kept in (or rebuilt into) setcc form if possible.
 ///
 /// \returns the simplified value, or an empty SDValue if nothing changed.
 SDValue DAGCombiner::visitSETCC(SDNode *N) {
   // A setcc feeding a brcond takes part in many other combines, so in that
   // situation we want the condition to stay a setcc as much as possible.
   const bool KeepAsSetCC =
       N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BRCOND;
 
   ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
   SDValue Simplified =
       SimplifySetCC(N->getValueType(0), N->getOperand(0), N->getOperand(1),
                     Cond, SDLoc(N), !KeepAsSetCC);
 
   if (!Simplified)
     return SDValue();
 
   // Either any form is acceptable, or we already have the setcc we wanted.
   if (!KeepAsSetCC || Simplified.getOpcode() != ISD::SETCC) {
     if (!KeepAsSetCC)
       return Simplified;
 
     // We wanted a setcc but got something else: try to recreate one.
     SDValue Rebuilt = rebuildSetCC(Simplified);
 
     // Rebuilding produced the node we started from: nothing to combine.
     if (Rebuilt.getNode() == N)
       return SDValue();
 
     if (Rebuilt)
       return Rebuilt;
   }
 
   return Simplified;
 }
 
 /// Combine a SETCCCARRY node (operands: LHS, RHS, carry, condition).
 ///
 /// \returns a plain SETCC when the carry operand is the constant zero,
 /// otherwise an empty SDValue.
 SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
   SDValue CarryIn = N->getOperand(2);
 
   // Only a known-zero carry lets us drop down to a regular SETCC.
   if (!isNullConstant(CarryIn))
     return SDValue();
 
   SDValue CmpLHS = N->getOperand(0);
   SDValue CmpRHS = N->getOperand(1);
   SDValue CondCode = N->getOperand(3);
   return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), CmpLHS, CmpRHS,
                      CondCode);
 }
 
 /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
 /// a build_vector of constants.
 /// This function is called by the DAGCombiner when visiting sext/zext/aext
 /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
 /// Vector extends are not folded if operations are legal; this is to
 /// avoid introducing illegal build_vector dag nodes.
 ///
 /// \returns the folded node, or nullptr if \p N cannot be folded.
 static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
                                          SelectionDAG &DAG, bool LegalTypes,
                                          bool LegalOperations) {
   unsigned Opcode = N->getOpcode();
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
   assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
          Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
          Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
          && "Expected EXTEND dag node in input!");
 
   // Scalar constant operand: re-creating the node lets getNode fold
   //   (sext c1), (zext c1), (aext c1)
   // into a constant.
   if (isa<ConstantSDNode>(N0))
     return DAG.getNode(Opcode, SDLoc(N), VT, N0).getNode();
 
   // Vector case: fold
   //   ({s|z|a}ext (build_vector AllConstants)) -> (build_vector AllConstants)
   // subject to the legality restrictions below.
   EVT SVT = VT.getScalarType();
   if (!VT.isVector())
     return nullptr;
   if (LegalTypes && (LegalOperations || !TLI.isTypeLegal(SVT)))
     return nullptr;
   if (!ISD::isBuildVectorOfConstantSDNodes(N0.getNode()))
     return nullptr;
 
   // We can fold this node into a build_vector.
   const unsigned DstBits = SVT.getSizeInBits();
   const unsigned SrcBits = N0->getValueType(0).getScalarSizeInBits();
   const bool IsSignExtend = Opcode == ISD::SIGN_EXTEND ||
                             Opcode == ISD::SIGN_EXTEND_VECTOR_INREG;
   const unsigned NumElts = VT.getVectorNumElements();
   SmallVector<SDValue, 8> Elts;
   SDLoc DL(N);
 
   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
     SDValue Op = N0->getOperand(Idx);
     if (Op->isUndef()) {
       Elts.push_back(DAG.getUNDEF(SVT));
       continue;
     }
 
     // Each folded element carries the location of its source operand.
     SDLoc EltDL(Op);
     // Get the constant value and if needed trunc it to the size of the type.
     // Nodes like build_vector might have constants wider than the scalar type.
     APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(SrcBits);
     APInt Extended = IsSignExtend ? C.sext(DstBits) : C.zext(DstBits);
     Elts.push_back(DAG.getConstant(Extended, EltDL, SVT));
   }
 
   return DAG.getBuildVector(VT, DL, Elts).getNode();
 }
 
 // ExtendUsesToFormExtLoad - Inspect the other users of the load N0 to decide
 // whether this transformation is possible and profitable:
 // "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
 //
 // N is the extend node being combined and is skipped while scanning users.
 // SETCC users that compare the load against a constant are appended to
 // ExtendNodes so the caller can extend them as well.
 // Returns true if the transformation should go ahead.
 static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0,
                                     unsigned ExtOpc,
                                     SmallVectorImpl<SDNode *> &ExtendNodes,
                                     const TargetLowering &TLI) {
   const bool TruncIsFree = TLI.isTruncateFree(VT, N0.getValueType());
   bool LoadIsLiveOut = false;
 
   SDNode *Load = N0.getNode();
   for (SDNode::use_iterator UI = Load->use_begin(), UE = Load->use_end();
        UI != UE; ++UI) {
     SDNode *User = *UI;
     // Ignore the extend itself and uses of any other result of the load.
     if (User == N || UI.getUse().getResNo() != N0.getResNo())
       continue;
 
     // FIXME: Only extend SETCC N, N and SETCC N, c for now.
     if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
       ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
       // Sign bits will be lost after a zext.
       if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
         return false;
 
       // Every operand that is not the load must be a constant. The SETCC
       // only needs rewriting if at least one such operand exists.
       bool NeedsExtend = false;
       for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx) {
         SDValue UseOp = User->getOperand(OpIdx);
         if (UseOp == N0)
           continue;
         if (!isa<ConstantSDNode>(UseOp))
           return false;
         NeedsExtend = true;
       }
       if (NeedsExtend)
         ExtendNodes.push_back(User);
       continue;
     }
 
     // If truncates aren't free and there are users we can't
     // extend, it isn't worthwhile.
     if (!TruncIsFree)
       return false;
 
     // Remember if this value is live-out.
     if (User->getOpcode() == ISD::CopyToReg)
       LoadIsLiveOut = true;
   }
 
   if (!LoadIsLiveOut)
     return true;
 
   // The unextended value is live out. If result 0 of the extend is live out
   // too, there had better be a good reason for the transformation: require
   // at least one SETCC user that benefits.
   for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
        UI != UE; ++UI) {
     SDUse &Use = UI.getUse();
     if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg)
       return !ExtendNodes.empty();
   }
   return true;
 }
 
 /// Rewrite each SETCC in \p SetCCs so that it compares in the type of
 /// \p ExtLoad: an operand equal to \p OrigLoad is replaced by \p ExtLoad,
 /// and any other operand is wrapped in an \p ExtType node of that type.
 /// Each rewritten SETCC is installed with CombineTo.
 void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
                                   SDValue OrigLoad, SDValue ExtLoad,
                                   ISD::NodeType ExtType) {
   SDLoc DL(ExtLoad);
   for (SDNode *SetCC : SetCCs) {
     SmallVector<SDValue, 4> NewOps;
 
     // Widen the two compared operands.
     for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx) {
       SDValue OldOp = SetCC->getOperand(OpIdx);
       if (OldOp == OrigLoad) {
         NewOps.push_back(ExtLoad);
         continue;
       }
       NewOps.push_back(
           DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), OldOp));
     }
 
     // The third operand is carried over unchanged.
     NewOps.push_back(SetCC->getOperand(2));
 
     SDValue WideSetCC =
         DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), NewOps);
     CombineTo(SetCC, WideSetCC);
   }
 }
 
 // FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
 /// Try to split a sign/zero extension of a vector load into several narrower
 /// extending loads whose results are concatenated.
 ///
 /// \param N a SIGN_EXTEND or ZERO_EXTEND node (asserted).
 /// \returns SDValue(N, 0) when the DAG was rewritten, or an empty SDValue when
 /// the combine does not apply.
 SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT DstVT = N->getValueType(0);
   EVT SrcVT = N0.getValueType();
 
   assert((N->getOpcode() == ISD::SIGN_EXTEND ||
           N->getOpcode() == ISD::ZERO_EXTEND) &&
          "Unexpected node type (not an extend)!");
 
   // Read the extension opcode once, up front. N is passed to CombineTo below,
   // and CombineTo may delete a node that is left without users, so N must not
   // be dereferenced after that point.
   const ISD::NodeType ExtOpc = static_cast<ISD::NodeType>(N->getOpcode());
 
   // fold (sext (load x)) to multiple smaller sextloads; same for zext.
   // For example, on a target with legal v4i32, but illegal v8i32, turn:
   //   (v8i32 (sext (v8i16 (load x))))
   // into:
   //   (v8i32 (concat_vectors (v4i32 (sextload x)),
   //                          (v4i32 (sextload (x + 16)))))
   // Where uses of the original load, i.e.:
   //   (v8i16 (load x))
   // are replaced with:
   //   (v8i16 (truncate
   //     (v8i32 (concat_vectors (v4i32 (sextload x)),
   //                            (v4i32 (sextload (x + 16)))))))
   //
   // This combine is only applicable to illegal, but splittable, vectors.
   // All legal types, and illegal non-vector types, are handled elsewhere.
   // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
   //
   if (N0->getOpcode() != ISD::LOAD)
     return SDValue();
 
   LoadSDNode *LN0 = cast<LoadSDNode>(N0);
 
   // Require a plain, unindexed, non-volatile load whose only user is N, and a
   // power-of-two vector result that the target wants to see as an ext-load.
   if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
       !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() ||
       !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
     return SDValue();
 
   SmallVector<SDNode *, 4> SetCCs;
   if (!ExtendUsesToFormExtLoad(DstVT, N, N0, ExtOpc, SetCCs, TLI))
     return SDValue();
 
   ISD::LoadExtType ExtType =
       ExtOpc == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
 
   // Try to split the vector types to get down to legal types.
   EVT SplitSrcVT = SrcVT;
   EVT SplitDstVT = DstVT;
   while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
          SplitSrcVT.getVectorNumElements() > 1) {
     SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
     SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
   }
 
   // Splitting bottomed out without reaching a usable extending load.
   if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
     return SDValue();
 
   SDLoc DL(N);
   const unsigned NumSplits =
       DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
   // Byte distance between consecutive pieces in memory.
   const unsigned Stride = SplitSrcVT.getStoreSize();
   SmallVector<SDValue, 4> Loads;
   SmallVector<SDValue, 4> Chains;
 
   SDValue BasePtr = LN0->getBasePtr();
   for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
     const unsigned Offset = Idx * Stride;
     // Each piece gets an alignment derived from the original load's alignment
     // and the piece's byte offset.
     const unsigned Align = MinAlign(LN0->getAlignment(), Offset);
 
     SDValue SplitLoad = DAG.getExtLoad(
         ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr,
         LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
         LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
 
     // Advance the address for the next piece.
     BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
                           DAG.getConstant(Stride, DL, BasePtr.getValueType()));
 
     Loads.push_back(SplitLoad.getValue(0));
     Chains.push_back(SplitLoad.getValue(1));
   }
 
   // Merge the per-piece chains and glue the pieces back into one vector.
   SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
   SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
 
   // Simplify TF.
   AddToWorklist(NewChain.getNode());
 
   CombineTo(N, NewValue);
 
   // Replace uses of the original load (before extension)
   // with a truncate of the concatenated sextloaded vectors.
   SDValue Trunc =
       DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
   ExtendSetCCUses(SetCCs, N0, NewValue, ExtOpc);
   CombineTo(N0.getNode(), Trunc, NewChain);
   return SDValue(N, 0); // Return N so it doesn't get rechecked!
 }
 
 // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
 //      (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
 //
 // Returns SDValue(N, 0) once the DAG has been rewritten, or an empty SDValue
 // when the pattern does not match or the rewrite is rejected.
 SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {
   assert(N->getOpcode() == ISD::ZERO_EXTEND);
   EVT VT = N->getValueType(0);
 
   // Outermost operand: and/or/xor with a constant right-hand side.
   SDValue LogicOp = N->getOperand(0);
   const unsigned LogicOpc = LogicOp.getOpcode();
   if (LogicOpc != ISD::AND && LogicOpc != ISD::OR && LogicOpc != ISD::XOR)
     return SDValue();
   if (LogicOp.getOperand(1).getOpcode() != ISD::Constant)
     return SDValue();
   if (LegalOperations && !TLI.isOperationLegal(LogicOpc, VT))
     return SDValue();
 
   // Next level down: shl/srl by a constant amount.
   SDValue ShiftOp = LogicOp.getOperand(0);
   const unsigned ShiftOpc = ShiftOp.getOpcode();
   if (ShiftOpc != ISD::SHL && ShiftOpc != ISD::SRL)
     return SDValue();
   if (ShiftOp.getOperand(1).getOpcode() != ISD::Constant)
     return SDValue();
   if (LegalOperations && !TLI.isOperationLegal(ShiftOpc, VT))
     return SDValue();
 
   // Innermost: a load that may legally become a zero-extending load to VT.
   SDValue LoadVal = ShiftOp.getOperand(0);
   if (!isa<LoadSDNode>(LoadVal))
     return SDValue();
   LoadSDNode *Load = cast<LoadSDNode>(LoadVal);
   EVT MemVT = Load->getMemoryVT();
   if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))
     return SDValue();
   if (Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed())
     return SDValue();
 
   // With SHL only AND is acceptable as the logic op; for the other logic ops
   // the result would be wrong.
   if (ShiftOpc == ISD::SHL && LogicOpc != ISD::AND)
     return SDValue();
 
   // Both intermediate nodes must feed nothing but this chain.
   if (!LogicOp.hasOneUse() || !ShiftOp.hasOneUse())
     return SDValue();
 
   SmallVector<SDNode *, 4> SetCCs;
   if (!ExtendUsesToFormExtLoad(VT, ShiftOp.getNode(), LoadVal,
                                ISD::ZERO_EXTEND, SetCCs, TLI))
     return SDValue();
 
   // All checks passed: rebuild the chain on top of a zero-extending load.
   SDValue ExtLoad =
       DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT, Load->getChain(),
                      Load->getBasePtr(), MemVT, Load->getMemOperand());
 
   SDLoc ShiftDL(ShiftOp);
   SDValue WideShift =
       DAG.getNode(ShiftOpc, ShiftDL, VT, ExtLoad, ShiftOp.getOperand(1));
 
   // The logic constant is zero-extended to the width of VT.
   const APInt WideMask = cast<ConstantSDNode>(LogicOp.getOperand(1))
                              ->getAPIntValue()
                              .zext(VT.getSizeInBits());
   SDLoc LogicDL(LogicOp);
   SDValue WideLogic = DAG.getNode(LogicOpc, LogicDL, VT, WideShift,
                                   DAG.getConstant(WideMask, LogicDL, VT));
 
   ExtendSetCCUses(SetCCs, LoadVal, ExtLoad, ISD::ZERO_EXTEND);
   CombineTo(N, WideLogic);
 
   // The use count of the old load is sampled only now, after N was replaced.
   if (!SDValue(Load, 0).hasOneUse()) {
     // Other users of the narrow value remain; feed them a truncate of the
     // wide load and move the chain along with it.
     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load),
                                 Load->getValueType(0), ExtLoad);
     CombineTo(Load, Trunc, ExtLoad.getValue(1));
   } else {
     // Only the chain result needs to be redirected.
     DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1));
   }
   return SDValue(N, 0); // Return N so it doesn't get rechecked!
 }
 
 /// Push a cast through a vector select when doing so makes the select's value
 /// operands the same total size as the setcc that produces its condition:
 ///   cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
 /// Matching sizes for the condition and the selected values should be more
 /// efficient. Returns the new VSELECT, or an empty SDValue if nothing changed.
 SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
   unsigned CastOpcode = Cast->getOpcode();
   assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
           CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
           CastOpcode == ISD::FP_ROUND) &&
          "Unexpected opcode for vector select narrowing/widening");
 
   // Restricted to the phase before operation legalization, since afterwards
   // the pattern may be hidden behind target-specific nodes.
   EVT VT = Cast->getValueType(0);
   if (LegalOperations)
     return SDValue();
   // Never introduce a select the target cannot handle.
   if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
     return SDValue();
 
   // Look for a single-use vselect whose condition comes from a setcc.
   SDValue VSel = Cast->getOperand(0);
   if (VSel.getOpcode() != ISD::VSELECT)
     return SDValue();
   if (!VSel.hasOneUse())
     return SDValue();
   SDValue SetCC = VSel.getOperand(0);
   if (SetCC.getOpcode() != ISD::SETCC)
     return SDValue();
 
   // The setcc result type for the compared operands must be exactly as wide
   // as the type produced by the cast.
   EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
   if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
     return SDValue();
 
   SDLoc DL(Cast);
   // Apply the cast to one select arm. FP_ROUND (fptrunc) carries an extra flag
   // operand that has to be forwarded.
   auto CastArm = [&](SDValue Arm) -> SDValue {
     if (CastOpcode == ISD::FP_ROUND)
       return DAG.getNode(CastOpcode, DL, VT, Arm, Cast->getOperand(1));
     return DAG.getNode(CastOpcode, DL, VT, Arm);
   };
 
   SDValue CastA = CastArm(VSel.getOperand(1));
   SDValue CastB = CastArm(VSel.getOperand(2));
   return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
 }
 
 // Fold an extension of an already-extending load into a single extending
 // load that directly produces VT:
 //   ([s|z]ext ([s|z]extload x)) and ([s|z]ext (extload x))
 // are replaced by an ExtLoadType load of the same memory type. N0 must be an
 // unindexed load with a single use. Returns SDValue(N, 0) on success and an
 // empty SDValue otherwise.
 static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner,
                                      const TargetLowering &TLI, EVT VT,
                                      bool LegalOperations, SDNode *N,
                                      SDValue N0, ISD::LoadExtType ExtLoadType) {
   SDNode *LoadNode = N0.getNode();
 
   // The inner load must extend the same way as the outer node, or be an
   // any-extending load.
   bool SameKind;
   if (ExtLoadType == ISD::SEXTLOAD)
     SameKind = ISD::isSEXTLoad(LoadNode);
   else
     SameKind = ISD::isZEXTLoad(LoadNode);
   if (!SameKind && !ISD::isEXTLoad(LoadNode))
     return SDValue();
   if (!ISD::isUNINDEXEDLoad(LoadNode))
     return SDValue();
   if (!N0.hasOneUse())
     return SDValue();
 
   LoadSDNode *InnerLoad = cast<LoadSDNode>(N0);
   EVT MemVT = InnerLoad->getMemoryVT();
   // After operation legalization, and always for volatile loads, the wider
   // extending load has to be legal on the target.
   const bool NeedsLegalExtLoad = LegalOperations || InnerLoad->isVolatile();
   if (NeedsLegalExtLoad && !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
     return SDValue();
 
   SDValue WideLoad = DAG.getExtLoad(ExtLoadType, SDLoc(InnerLoad), VT,
                                     InnerLoad->getChain(),
                                     InnerLoad->getBasePtr(), MemVT,
                                     InnerLoad->getMemOperand());
   Combiner.CombineTo(N, WideLoad);
   // Redirect users of the old load's chain result to the new load's chain.
   DAG.ReplaceAllUsesOfValueWith(SDValue(InnerLoad, 1), WideLoad.getValue(1));
   return SDValue(N, 0); // Return N so it doesn't get rechecked!
 }
 
 // Fold ([s|z]ext (load x)) into an extending load that produces VT. Remaining
 // users of the narrow load value are fed a truncate of the extending load.
 // Vector extloads are only generated when 1) they're legal, and 2) they are
 // deemed desirable by the target.
 // Returns SDValue(N, 0) on success and an empty SDValue otherwise.
 static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner,
                                   const TargetLowering &TLI, EVT VT,
                                   bool LegalOperations, SDNode *N, SDValue N0,
                                   ISD::LoadExtType ExtLoadType,
                                   ISD::NodeType ExtOpc) {
   SDNode *LoadNode = N0.getNode();
   if (!ISD::isNON_EXTLoad(LoadNode) || !ISD::isUNINDEXEDLoad(LoadNode))
     return SDValue();
 
   LoadSDNode *PlainLoad = cast<LoadSDNode>(N0);
   EVT NarrowVT = N0.getValueType();
 
   // Legality of the extending load is mandatory after operation legalization,
   // for vector types, and for volatile loads.
   const bool NeedsLegalExtLoad =
       LegalOperations || VT.isVector() || PlainLoad->isVolatile();
   if (NeedsLegalExtLoad && !TLI.isLoadExtLegal(ExtLoadType, VT, NarrowVT))
     return SDValue();
 
   // With several users of the load, the other users decide whether the
   // rewrite may go ahead; SETCC users to be widened are collected in SetCCs.
   SmallVector<SDNode *, 4> SetCCs;
   const bool UsersOK =
       N0.hasOneUse() ||
       ExtendUsesToFormExtLoad(VT, N, N0, ExtOpc, SetCCs, TLI);
   // For vectors the target has the final say.
   const bool TargetOK =
       !VT.isVector() || TLI.isVectorLoadExtDesirable(SDValue(N, 0));
   if (!UsersOK || !TargetOK)
     return SDValue();
 
   SDValue WideLoad = DAG.getExtLoad(ExtLoadType, SDLoc(PlainLoad), VT,
                                     PlainLoad->getChain(),
                                     PlainLoad->getBasePtr(), NarrowVT,
                                     PlainLoad->getMemOperand());
   Combiner.ExtendSetCCUses(SetCCs, N0, WideLoad, ExtOpc);
 
   // Sample the use count before N is replaced: if N is the only user of the
   // loaded value, no truncate is needed.
   const bool OnlyUsedByN = SDValue(PlainLoad, 0).hasOneUse();
   Combiner.CombineTo(N, WideLoad);
   if (!OnlyUsedByN) {
     SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), NarrowVT, WideLoad);
     Combiner.CombineTo(PlainLoad, Trunc, WideLoad.getValue(1));
   } else {
     DAG.ReplaceAllUsesOfValueWith(SDValue(PlainLoad, 1), WideLoad.getValue(1));
   }
   return SDValue(N, 0); // Return N so it doesn't get rechecked!
 }
 
 /// Turn an extended "X > -1" test into a shift of the inverted value:
 ///   sext i1 (setgt iN X, -1) --> sra (not X), (N - 1)
 ///   zext i1 (setgt iN X, -1) --> srl (not X), (N - 1)
 /// Applies only before operation legalization, to a single-use i1 setcc whose
 /// compared type equals the extended type. Returns an empty SDValue otherwise.
 static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
                                        bool LegalOperations) {
   assert((N->getOpcode() == ISD::SIGN_EXTEND ||
           N->getOpcode() == ISD::ZERO_EXTEND) && "Expected sext or zext");
 
   if (LegalOperations)
     return SDValue();
 
   SDValue Cmp = N->getOperand(0);
   if (Cmp.getOpcode() != ISD::SETCC || !Cmp.hasOneUse())
     return SDValue();
   if (Cmp.getValueType() != MVT::i1)
     return SDValue();
 
   SDValue X = Cmp.getOperand(0);
   SDValue RHS = Cmp.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
   EVT VT = N->getValueType(0);
   EVT XVT = X.getValueType();
 
   // setge X, C is canonicalized to setgt, so only setgt needs matching. The
   // setlt sibling is folded in SimplifySelectCC() because it does not require
   // the 'not' op.
   if (CC != ISD::SETGT)
     return SDValue();
   if (!isAllOnesConstant(RHS))
     return SDValue();
   if (VT != XVT)
     return SDValue();
 
   // Invert X, then smear (sext) or shift down (zext) its sign bit.
   SDLoc DL(N);
   SDValue NotX = DAG.getNOT(DL, X, VT);
   const unsigned SignBitIndex = VT.getSizeInBits() - 1;
   SDValue ShiftAmount = DAG.getConstant(SignBitIndex, DL, VT);
   const bool IsSExt = N->getOpcode() == ISD::SIGN_EXTEND;
   if (IsSExt)
     return DAG.getNode(ISD::SRA, DL, VT, NotX, ShiftAmount);
   return DAG.getNode(ISD::SRL, DL, VT, NotX, ShiftAmount);
 }
 
 /// Combine a SIGN_EXTEND node. The folds below are attempted in order and the
 /// first one that applies wins.
 ///
 /// \returns the replacement value, SDValue(N, 0) when the DAG was already
 /// updated in place through CombineTo, or an empty SDValue when no fold
 /// applied.
 SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
   // Extension of a constant operand is delegated to a shared helper.
   if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
                                               LegalOperations))
     return SDValue(Res, 0);
 
   // fold (sext (sext x)) -> (sext x)
   // fold (sext (aext x)) -> (sext x)
   if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
     return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));
 
   if (N0.getOpcode() == ISD::TRUNCATE) {
     // fold (sext (truncate (load x))) -> (sext (smaller load x))
     // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
     if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
       // Remember the node under the truncate before the truncate is replaced.
       SDNode *oye = N0.getOperand(0).getNode();
       if (NarrowLoad.getNode() != N0.getNode()) {
         CombineTo(N0.getNode(), NarrowLoad);
         // CombineTo deleted the truncate, if needed, but not what's under it.
         AddToWorklist(oye);
       }
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
 
     // See if the value being truncated is already sign extended.  If so, just
     // eliminate the trunc/sext pair.
     SDValue Op = N0.getOperand(0);
     // Scalar bit widths of the pre-truncate value, the truncated value and the
     // extension result.
     unsigned OpBits   = Op.getScalarValueSizeInBits();
     unsigned MidBits  = N0.getScalarValueSizeInBits();
     unsigned DestBits = VT.getScalarSizeInBits();
     unsigned NumSignBits = DAG.ComputeNumSignBits(Op);
 
     if (OpBits == DestBits) {
       // Op is i32, Mid is i8, and Dest is i32.  If Op has more than 24 sign
       // bits, it is already ready.
       if (NumSignBits > DestBits-MidBits)
         return Op;
     } else if (OpBits < DestBits) {
       // Op is i32, Mid is i8, and Dest is i64.  If Op has more than 24 sign
       // bits, just sext from i32.
       if (NumSignBits > OpBits-MidBits)
         return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
     } else {
       // Op is i64, Mid is i8, and Dest is i32.  If Op has more than 56 sign
       // bits, just truncate to i32.
       if (NumSignBits > OpBits-MidBits)
         return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
     }
 
     // fold (sext (truncate x)) -> (sextinreg x).
     if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
                                                  N0.getValueType())) {
       // First bring Op to the destination type, then sign-extend in register
       // from the truncated type.
       if (OpBits < DestBits)
         Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
       else if (OpBits > DestBits)
         Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
       return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
                          DAG.getValueType(N0.getValueType()));
     }
   }
 
   // Try to simplify (sext (load x)).
   if (SDValue foldedExt =
           tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
                              ISD::SEXTLOAD, ISD::SIGN_EXTEND))
     return foldedExt;
 
   // fold (sext (load x)) to multiple smaller sextloads.
   // Only on illegal but splittable vectors.
   if (SDValue ExtLoad = CombineExtLoad(N))
     return ExtLoad;
 
   // Try to simplify (sext (sextload x)).
   if (SDValue foldedExt = tryToFoldExtOfExtload(
           DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD))
     return foldedExt;
 
   // fold (sext (and/or/xor (load x), cst)) ->
   //      (and/or/xor (sextload x), (sext cst))
   // Note: this fold only runs before operation legalization, and only when
   // the logic op is legal for VT.
   if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
        N0.getOpcode() == ISD::XOR) &&
       isa<LoadSDNode>(N0.getOperand(0)) &&
       N0.getOperand(1).getOpcode() == ISD::Constant &&
       (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
     LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
     EVT MemVT = LN00->getMemoryVT();
     // Zero-extending and indexed loads are rejected here.
     if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) &&
       LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) {
       SmallVector<SDNode*, 4> SetCCs;
       bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
                                              ISD::SIGN_EXTEND, SetCCs, TLI);
       if (DoXform) {
         SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN00), VT,
                                          LN00->getChain(), LN00->getBasePtr(),
                                          LN00->getMemoryVT(),
                                          LN00->getMemOperand());
         // The logic constant is sign-extended to the width of VT.
         APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
         Mask = Mask.sext(VT.getSizeInBits());
         SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
                                   ExtLoad, DAG.getConstant(Mask, DL, VT));
         ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND);
         // Both use-count flags are sampled here, before CombineTo(N, And)
         // rewrites the DAG.
         // NOTE: despite its name, NoReplaceTruncAnd is true when N0 has other
         // users, i.e. when N0 does need to be replaced by a truncate below.
         bool NoReplaceTruncAnd = !N0.hasOneUse();
         // True when the loaded value has exactly one use, so only the chain
         // result has to be redirected.
         bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
         CombineTo(N, And);
         // If N0 has multiple uses, change other uses as well.
         if (NoReplaceTruncAnd) {
           SDValue TruncAnd =
               DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
           CombineTo(N0.getNode(), TruncAnd);
         }
         if (NoReplaceTrunc) {
           DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
         } else {
           // Remaining users of the narrow load get a truncate of the wide one.
           SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
                                       LN00->getValueType(0), ExtLoad);
           CombineTo(LN00, Trunc, ExtLoad.getValue(1));
         }
         return SDValue(N,0); // Return N so it doesn't get rechecked!
       }
     }
   }
 
   // sext/zext i1 (setgt X, -1) --> shift of (not X); handled by a helper
   // shared with zext.
   if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
     return V;
 
   if (N0.getOpcode() == ISD::SETCC) {
     SDValue N00 = N0.getOperand(0);
     SDValue N01 = N0.getOperand(1);
     ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
     EVT N00VT = N0.getOperand(0).getValueType();
 
     // sext(setcc) -> sext_in_reg(vsetcc) for vectors.
     // Only do this before legalize for now.
     if (VT.isVector() && !LegalOperations &&
         TLI.getBooleanContents(N00VT) ==
             TargetLowering::ZeroOrNegativeOneBooleanContent) {
       // On some architectures (such as SSE/NEON/etc) the SETCC result type is
       // of the same size as the compared operands. Only optimize sext(setcc())
       // if this is the case.
       EVT SVT = getSetCCResultType(N00VT);
 
       // We know that the # elements of the results is the same as the
       // # elements of the compare (and the # elements of the compare result
       // for that matter).  Check to see that they are the same size.  If so,
       // we know that the element size of the sext'd result matches the
       // element size of the compare operands.
       if (VT.getSizeInBits() == SVT.getSizeInBits())
         return DAG.getSetCC(DL, VT, N00, N01, CC);
 
       // If the desired elements are smaller or larger than the source
       // elements, we can use a matching integer vector type and then
       // truncate/sign extend.
       EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
       if (SVT == MatchingVecType) {
         SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
         return DAG.getSExtOrTrunc(VsetCC, DL, VT);
       }
     }
 
     // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0)
     // Here, T can be 1 or -1, depending on the type of the setcc and
     // getBooleanContents().
     unsigned SetCCWidth = N0.getScalarValueSizeInBits();
 
     // To determine the "true" side of the select, we need to know the high bit
     // of the value returned by the setcc if it evaluates to true.
     // If the type of the setcc is i1, then the true case of the select is just
     // sext(i1 1), that is, -1.
     // If the type of the setcc is larger (say, i8) then the value of the high
     // bit depends on getBooleanContents(), so ask TLI for a real "true" value
     // of the appropriate width.
     SDValue ExtTrueVal = (SetCCWidth == 1)
                              ? DAG.getAllOnesConstant(DL, VT)
                              : DAG.getBoolConstant(true, DL, VT, N00VT);
     SDValue Zero = DAG.getConstant(0, DL, VT);
     // Give SimplifySelectCC the first chance at the select form.
     if (SDValue SCC =
             SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
       return SCC;
 
     // Scalar-only fallback: emit an explicit select, unless the target asks
     // for selects of constants to be converted to math.
     if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) {
       EVT SetCCVT = getSetCCResultType(N00VT);
       // Don't do this transform for i1 because there's a select transform
       // that would reverse it.
       // TODO: We should not do this transform at all without a target hook
       // because a sext is likely cheaper than a select?
       if (SetCCVT.getScalarSizeInBits() != 1 &&
           (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) {
         SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
         return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
       }
     }
   }
 
   // fold (sext x) -> (zext x) if the sign bit is known zero.
   if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
       DAG.SignBitIsZero(N0))
     return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);
 
   // sext (vselect (setcc ...), A, B) -> vselect (setcc ...), (sext A), (sext B)
   // when that makes the select operands match the setcc size.
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
 
   return SDValue();
 }
 
 // isTruncateOf - Decide whether N behaves like a truncate of another value.
 // Two forms are accepted: a real TRUNCATE node, and an i1 "setne X, 0" (zero
 // on either side) where every bit of X except bit 0 is known to be zero.
 // On success the truncated value is stored in Op. Known receives the known
 // bits of Op, computed here so the caller does not have to call
 // computeKnownBits a second time. Op and Known may already have been written
 // on some of the paths that return false.
 static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
                          KnownBits &Known) {
   const unsigned Opcode = N->getOpcode();
 
   // Form 1: an explicit truncate.
   if (Opcode == ISD::TRUNCATE) {
     Op = N->getOperand(0);
     DAG.computeKnownBits(Op, Known);
     return true;
   }
 
   // Form 2: i1 (setne X, 0).
   if (Opcode != ISD::SETCC)
     return false;
   if (N->getValueType(0) != MVT::i1)
     return false;
   if (cast<CondCodeSDNode>(N->getOperand(2))->get() != ISD::SETNE)
     return false;
 
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   assert(Op0.getValueType() == Op1.getValueType());
 
   // Exactly one side has to be the zero constant; the other side is the
   // candidate value. A zero on the left is checked first.
   const bool ZeroOnLeft = isNullConstant(Op0);
   if (!ZeroOnLeft && !isNullConstant(Op1))
     return false;
   Op = ZeroOnLeft ? Op1 : Op0;
 
   DAG.computeKnownBits(Op, Known);
 
   // Accept only if all bits other than bit 0 are known zero.
   return (Known.Zero | 1).isAllOnesValue();
 }
 
 SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
   if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
                                               LegalOperations))
     return SDValue(Res, 0);
 
   // fold (zext (zext x)) -> (zext x)
   // fold (zext (aext x)) -> (zext x)
   if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT,
                        N0.getOperand(0));
 
   // fold (zext (truncate x)) -> (zext x) or
   //      (zext (truncate x)) -> (truncate x)
   // This is valid when the truncated bits of x are already zero.
   // FIXME: We should extend this to work for vectors too.
   SDValue Op;
   KnownBits Known;
   if (!VT.isVector() && isTruncateOf(DAG, N0, Op, Known)) {
     APInt TruncatedBits =
       (Op.getValueSizeInBits() == N0.getValueSizeInBits()) ?
       APInt(Op.getValueSizeInBits(), 0) :
       APInt::getBitsSet(Op.getValueSizeInBits(),
                         N0.getValueSizeInBits(),
                         std::min(Op.getValueSizeInBits(),
                                  VT.getSizeInBits()));
     if (TruncatedBits.isSubsetOf(Known.Zero))
       return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
   }
 
   // fold (zext (truncate x)) -> (and x, mask)
   if (N0.getOpcode() == ISD::TRUNCATE) {
     // fold (zext (truncate (load x))) -> (zext (smaller load x))
     // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
     if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
       SDNode *oye = N0.getOperand(0).getNode();
       if (NarrowLoad.getNode() != N0.getNode()) {
         CombineTo(N0.getNode(), NarrowLoad);
         // CombineTo deleted the truncate, if needed, but not what's under it.
         AddToWorklist(oye);
       }
       return SDValue(N, 0); // Return N so it doesn't get rechecked!
     }
 
     EVT SrcVT = N0.getOperand(0).getValueType();
     EVT MinVT = N0.getValueType();
 
     // Try to mask before the extension to avoid having to generate a larger mask,
     // possibly over several sub-vectors.
     if (SrcVT.bitsLT(VT) && VT.isVector()) {
       if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
                                TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
         SDValue Op = N0.getOperand(0);
         Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
         AddToWorklist(Op.getNode());
         SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
         // Transfer the debug info; the new node is equivalent to N0.
         DAG.transferDbgValues(N0, ZExtOrTrunc);
         return ZExtOrTrunc;
       }
     }
 
     if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
       SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
       AddToWorklist(Op.getNode());
       SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
       // We may safely transfer the debug info describing the truncate node over
       // to the equivalent and operation.
       DAG.transferDbgValues(N0, And);
       return And;
     }
   }
 
   // Fold (zext (and (trunc x), cst)) -> (and x, cst),
   // if either of the casts is not free.
   if (N0.getOpcode() == ISD::AND &&
       N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
       N0.getOperand(1).getOpcode() == ISD::Constant &&
       (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
                            N0.getValueType()) ||
        !TLI.isZExtFree(N0.getValueType(), VT))) {
     SDValue X = N0.getOperand(0).getOperand(0);
     X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
     APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
     Mask = Mask.zext(VT.getSizeInBits());
     SDLoc DL(N);
     return DAG.getNode(ISD::AND, DL, VT,
                        X, DAG.getConstant(Mask, DL, VT));
   }
 
   // Try to simplify (zext (load x)).
   if (SDValue foldedExt =
           tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
                              ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
     return foldedExt;
 
   // fold (zext (load x)) to multiple smaller zextloads.
   // Only on illegal but splittable vectors.
   if (SDValue ExtLoad = CombineExtLoad(N))
     return ExtLoad;
 
   // fold (zext (and/or/xor (load x), cst)) ->
   //      (and/or/xor (zextload x), (zext cst))
   // Unless (and (load x) cst) will match as a zextload already and has
   // additional users.
   if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
        N0.getOpcode() == ISD::XOR) &&
       isa<LoadSDNode>(N0.getOperand(0)) &&
       N0.getOperand(1).getOpcode() == ISD::Constant &&
       (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
     LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
     EVT MemVT = LN00->getMemoryVT();
     if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) &&
         LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) {
       bool DoXform = true;
       SmallVector<SDNode*, 4> SetCCs;
       if (!N0.hasOneUse()) {
         if (N0.getOpcode() == ISD::AND) {
           auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
           EVT LoadResultTy = AndC->getValueType(0);
           EVT ExtVT;
           if (isAndLoadExtLoad(AndC, LN00, LoadResultTy, ExtVT))
             DoXform = false;
         }
       }
       if (DoXform)
         DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
                                           ISD::ZERO_EXTEND, SetCCs, TLI);
       if (DoXform) {
         SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN00), VT,
                                          LN00->getChain(), LN00->getBasePtr(),
                                          LN00->getMemoryVT(),
                                          LN00->getMemOperand());
         APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
         Mask = Mask.zext(VT.getSizeInBits());
         SDLoc DL(N);
         SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
                                   ExtLoad, DAG.getConstant(Mask, DL, VT));
         ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
         bool NoReplaceTruncAnd = !N0.hasOneUse();
         bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
         CombineTo(N, And);
         // If N0 has multiple uses, change other uses as well.
         if (NoReplaceTruncAnd) {
           SDValue TruncAnd =
               DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
           CombineTo(N0.getNode(), TruncAnd);
         }
         if (NoReplaceTrunc) {
           DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
         } else {
           SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
                                       LN00->getValueType(0), ExtLoad);
           CombineTo(LN00, Trunc, ExtLoad.getValue(1));
         }
         return SDValue(N,0); // Return N so it doesn't get rechecked!
       }
     }
   }
 
   // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
   //      (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
   if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N))
     return ZExtLoad;
 
   // Try to simplify (zext (zextload x)).
   if (SDValue foldedExt = tryToFoldExtOfExtload(
           DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD))
     return foldedExt;
 
   if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
     return V;
 
   if (N0.getOpcode() == ISD::SETCC) {
     // Only do this before legalize for now.
     if (!LegalOperations && VT.isVector() &&
         N0.getValueType().getVectorElementType() == MVT::i1) {
       EVT N00VT = N0.getOperand(0).getValueType();
       if (getSetCCResultType(N00VT) == N0.getValueType())
         return SDValue();
 
       // We know that the # elements of the results is the same as the #
       // elements of the compare (and the # elements of the compare result for
       // that matter). Check to see that they are the same size. If so, we know
       // that the element size of the sext'd result matches the element size of
       // the compare operands.
       SDLoc DL(N);
       SDValue VecOnes = DAG.getConstant(1, DL, VT);
       if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
         // zext(setcc) -> (and (vsetcc), (1, 1, ...) for vectors.
         SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
                                      N0.getOperand(1), N0.getOperand(2));
         return DAG.getNode(ISD::AND, DL, VT, VSetCC, VecOnes);
       }
 
       // If the desired elements are smaller or larger than the source
       // elements we can use a matching integer vector type and then
       // truncate/sign extend.
       EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
       SDValue VsetCC =
           DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
                       N0.getOperand(1), N0.getOperand(2));
       return DAG.getNode(ISD::AND, DL, VT, DAG.getSExtOrTrunc(VsetCC, DL, VT),
                          VecOnes);
     }
 
     // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
     SDLoc DL(N);
     if (SDValue SCC = SimplifySelectCC(
             DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
             DAG.getConstant(0, DL, VT),
             cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
       return SCC;
   }
 
   // (zext (shl (zext x), cst)) -> (shl (zext x), cst)
   if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
       isa<ConstantSDNode>(N0.getOperand(1)) &&
       N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
       N0.hasOneUse()) {
     SDValue ShAmt = N0.getOperand(1);
     unsigned ShAmtVal = cast<ConstantSDNode>(ShAmt)->getZExtValue();
     if (N0.getOpcode() == ISD::SHL) {
       SDValue InnerZExt = N0.getOperand(0);
       // If the original shl may be shifting out bits, do not perform this
       // transformation.
       unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() -
         InnerZExt.getOperand(0).getValueSizeInBits();
       if (ShAmtVal > KnownZeroBits)
         return SDValue();
     }
 
     SDLoc DL(N);
 
     // Ensure that the shift amount is wide enough for the shifted value.
     if (VT.getSizeInBits() >= 256)
       ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);
 
     return DAG.getNode(N0.getOpcode(), DL, VT,
                        DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)),
                        ShAmt);
   }
 
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
 
   return SDValue();
 }
 
 /// Combine an ISD::ANY_EXTEND node.
 ///
 /// The folds are attempted in this order: constant folding, collapsing a
 /// nested extend, narrowing a load underneath a truncate, cancelling an
 /// extend-of-truncate, hoisting an AND above a non-free truncate, forming an
 /// extending load from a plain or already-extending load, and rewriting an
 /// extended SETCC.
 ///
 /// \param N the ANY_EXTEND node being visited.
 /// \returns the replacement value when a fold produced one; SDValue(N, 0) when
 /// the DAG was already updated in place through CombineTo; or an empty SDValue
 /// when no fold applied.
 SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
   // Fold an extend of a constant operand, if the helper can do so.
   if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
                                               LegalOperations))
     return SDValue(Res, 0);
 
   // fold (aext (aext x)) -> (aext x)
   // fold (aext (zext x)) -> (zext x)
   // fold (aext (sext x)) -> (sext x)
   // The inner extend's opcode is kept and applied directly to x at type VT.
   if (N0.getOpcode() == ISD::ANY_EXTEND  ||
       N0.getOpcode() == ISD::ZERO_EXTEND ||
       N0.getOpcode() == ISD::SIGN_EXTEND)
     return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
 
   // fold (aext (truncate (load x))) -> (aext (smaller load x))
   // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
   if (N0.getOpcode() == ISD::TRUNCATE) {
     if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
       // Remember the truncate's input before the truncate is replaced, so it
       // can be revisited afterwards.
       SDNode *oye = N0.getOperand(0).getNode();
       if (NarrowLoad.getNode() != N0.getNode()) {
         CombineTo(N0.getNode(), NarrowLoad);
         // CombineTo deleted the truncate, if needed, but not what's under it.
         AddToWorklist(oye);
       }
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
 
   // fold (aext (truncate x))
   // The result is x any-extended or truncated to VT, whichever is needed.
   if (N0.getOpcode() == ISD::TRUNCATE)
     return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
 
   // Fold (aext (and (trunc x), cst)) -> (and x, cst)
   // if the trunc is not free.
   if (N0.getOpcode() == ISD::AND &&
       N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
       N0.getOperand(1).getOpcode() == ISD::Constant &&
       !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
                           N0.getValueType())) {
     SDLoc DL(N);
     SDValue X = N0.getOperand(0).getOperand(0);
     // Bring x to the result type, then redo the AND there with the mask
     // zero-extended to VT's width.
     X = DAG.getAnyExtOrTrunc(X, DL, VT);
     APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
     Mask = Mask.zext(VT.getSizeInBits());
     return DAG.getNode(ISD::AND, DL, VT,
                        X, DAG.getConstant(Mask, DL, VT));
   }
 
   // fold (aext (load x)) -> (aext (truncate (extload x)))
   // None of the supported targets knows how to perform load and any_ext
   // on vectors in one instruction.  We only perform this transformation on
   // scalars.
   if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
       ISD::isUNINDEXEDLoad(N0.getNode()) &&
       TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
     bool DoXform = true;
     SmallVector<SDNode*, 4> SetCCs;
     // With other users of the load, the helper decides whether the transform
     // is still worthwhile and collects SETCC users to be rewritten.
     if (!N0.hasOneUse())
       DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs,
                                         TLI);
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
                                        LN0->getBasePtr(), N0.getValueType(),
                                        LN0->getMemOperand());
       ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
       // If the load value is used only by N, replace it via CombineTo N.
       // NOTE(review): the use count is sampled before CombineTo(N, ...) —
       // presumably because replacing N can change N0's use list; confirm.
       bool NoReplaceTrunc = N0.hasOneUse();
       CombineTo(N, ExtLoad);
       if (NoReplaceTrunc) {
         // Only the chain result of the old load still needs redirecting.
         DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
       } else {
         // Remaining users of the old load value get a truncate of the new
         // extending load; the chain result is redirected as well.
         SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                     N0.getValueType(), ExtLoad);
         CombineTo(LN0, Trunc, ExtLoad.getValue(1));
       }
       return SDValue(N, 0); // Return N so it doesn't get rechecked!
     }
   }
 
   // fold (aext (zextload x)) -> (aext (truncate (zextload x)))
   // fold (aext (sextload x)) -> (aext (truncate (sextload x)))
   // fold (aext ( extload x)) -> (aext (truncate (extload  x)))
   // The existing extension kind and memory type are kept; only the result
   // type of the load is widened to VT.
   if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) &&
       ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     ISD::LoadExtType ExtType = LN0->getExtensionType();
     EVT MemVT = LN0->getMemoryVT();
     if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
       SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
                                        VT, LN0->getChain(), LN0->getBasePtr(),
                                        MemVT, LN0->getMemOperand());
       CombineTo(N, ExtLoad);
       DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
 
   if (N0.getOpcode() == ISD::SETCC) {
     // For vectors:
     // aext(setcc) -> vsetcc
     // aext(setcc) -> truncate(vsetcc)
     // aext(setcc) -> aext(vsetcc)
     // Only do this before legalize for now.
     if (VT.isVector() && !LegalOperations) {
       EVT N00VT = N0.getOperand(0).getValueType();
       // Give up when the SETCC already produces the result type reported by
       // getSetCCResultType for its operand type.
       if (getSetCCResultType(N00VT) == N0.getValueType())
         return SDValue();
 
       // We know that the # elements of the results is the same as the
       // # elements of the compare (and the # elements of the compare result
       // for that matter).  Check to see that they are the same size.  If so,
       // we know that the element size of the any-extended result matches the
       // element size of the compare operands.
       if (VT.getSizeInBits() == N00VT.getSizeInBits())
         return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
                              N0.getOperand(1),
                              cast<CondCodeSDNode>(N0.getOperand(2))->get());
       // If the desired elements are smaller or larger than the source
       // elements we can use a matching integer vector type and then
       // truncate/any extend
       else {
         EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
         SDValue VsetCC =
           DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
                         N0.getOperand(1),
                         cast<CondCodeSDNode>(N0.getOperand(2))->get());
         return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
       }
     }
 
     // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
     SDLoc DL(N);
     if (SDValue SCC = SimplifySelectCC(
             DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
             DAG.getConstant(0, DL, VT),
             cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
       return SCC;
   }
 
   // No fold applied.
   return SDValue();
 }
 
 /// Combine an assert-extension node (the `assert?ext` opcodes).
 ///
 /// Two simplifications are attempted:
 ///  * an assert stacked directly on an identical assert (same opcode, same
 ///    asserted type) is redundant, so the inner one is returned;
 ///  * an assert / single-use truncate / assert sandwich with matching opcodes
 ///    is collapsed into one assert on the wide value, using the narrower of
 ///    the two asserted types, followed by the truncate.
 ///
 /// \param N the assert node being visited.
 /// \returns the simplified value, or an empty SDValue when nothing applies.
 SDValue DAGCombiner::visitAssertExt(SDNode *N) {
   const unsigned AssertOpc = N->getOpcode();
   SDValue Src = N->getOperand(0);
   EVT OuterVT = cast<VTSDNode>(N->getOperand(1))->getVT();
 
   // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
   if (Src.getOpcode() == AssertOpc) {
     EVT InnerVT = cast<VTSDNode>(Src.getOperand(1))->getVT();
     if (OuterVT == InnerVT)
       return Src;
   }
 
   // Everything below requires assert(trunc(assert X)) where the truncate has
   // no other user and both asserts share the same opcode.
   const bool IsSandwich = Src.getOpcode() == ISD::TRUNCATE &&
                           Src.hasOneUse() &&
                           Src.getOperand(0).getOpcode() == AssertOpc;
   if (!IsSandwich)
     return SDValue();
 
   // Make one stronger assert on the wide source and truncate afterwards:
   // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
   // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
   SDValue WideAssert = Src.getOperand(0);
   EVT WideAssertVT = cast<VTSDNode>(WideAssert.getOperand(1))->getVT();
   assert(WideAssertVT.bitsLE(Src.getValueType()) &&
          "Asserting zero/sign-extended bits to a type larger than the "
          "truncated destination does not provide information");
 
   // Keep whichever asserted type is strictly narrower; on a tie the type
   // asserted on the wide value wins.
   EVT NarrowestVT = WideAssertVT;
   if (OuterVT.bitsLT(WideAssertVT))
     NarrowestVT = OuterVT;
 
   SDLoc DL(N);
   SDValue NarrowestVTVal = DAG.getValueType(NarrowestVT);
   SDValue Strengthened =
       DAG.getNode(AssertOpc, DL, WideAssert.getValueType(),
                   WideAssert.getOperand(0), NarrowestVTVal);
   return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Strengthened);
 }
 
 /// Try to replace a wide load whose value is only partially used with a
 /// narrower load.
 ///
 /// If the result of a wider load is shifted right by a constant number of
 /// bits and then narrowed, the load is rewritten as a narrower load from the
 /// original base pointer plus the byte offset of the bits that are kept
 /// (ShAmt / 8, with ShAmt re-based first on big-endian targets). The load is
 /// also narrowed if the result is masked with an AND whose constant is a
 /// low-bit mask, which effectively produces a smaller type. If the result is
 /// to be extended (SIGN_EXTEND_INREG, SRL or AND roots), the extension is
 /// folded to form an extending load. A single-use left shift directly on the
 /// load is re-applied to the narrowed value.
 ///
 /// On success the uses of the old load's chain result are redirected to the
 /// new load's chain result.
 ///
 /// \param N the node whose first operand leads to the load. SIGN_EXTEND_INREG,
 /// SRL and AND are special-cased; any other opcode is treated as a plain
 /// narrowing to N's result type.
 /// \returns the narrowed (and possibly shifted) value, or an empty SDValue if
 /// the transformation does not apply.
 SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
   unsigned Opc = N->getOpcode();
 
   ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
   // ExtVT is the type the new load reads from memory; by default it is the
   // result type itself.
   EVT ExtVT = VT;
 
   // This transformation isn't valid for vector loads.
   if (VT.isVector())
     return SDValue();
 
   // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
   // extended to VT.
   if (Opc == ISD::SIGN_EXTEND_INREG) {
     ExtType = ISD::SEXTLOAD;
     ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
   } else if (Opc == ISD::SRL) {
     // Another special-case: SRL is basically zero-extending a narrower value,
     // or it maybe shifting a higher subword, half or byte into the lowest
     // bits.
     ExtType = ISD::ZEXTLOAD;
     // Treat N itself as the shift so the SRL handling below picks it up.
     N0 = SDValue(N, 0);
 
     // Only a constant shift applied directly to a load is handled.
     auto *LN0 = dyn_cast<LoadSDNode>(N0.getOperand(0));
     auto *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
     if (!N01 || !LN0)
       return SDValue();
 
     // The bits that survive the shift determine the narrow type: count from
     // the memory width when the load is not sign-extending and the shift
     // stays inside it, otherwise from the result width.
     // NOTE(review): VT.getSizeInBits() - ShiftAmt is not guarded against
     // ShiftAmt >= VT's width here — confirm callers never pass such a shift.
     uint64_t ShiftAmt = N01->getZExtValue();
     uint64_t MemoryWidth = LN0->getMemoryVT().getSizeInBits();
     if (LN0->getExtensionType() != ISD::SEXTLOAD && MemoryWidth > ShiftAmt)
       ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShiftAmt);
     else
       ExtVT = EVT::getIntegerVT(*DAG.getContext(),
                                 VT.getSizeInBits() - ShiftAmt);
   } else if (Opc == ISD::AND) {
     // An AND with a constant mask is the same as a truncate + zero-extend.
     auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
     if (!AndC || !AndC->getAPIntValue().isMask())
       return SDValue();
 
     // The number of trailing ones in the mask is the width of the narrow
     // type.
     unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
     ExtType = ISD::ZEXTLOAD;
     ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
   }
 
   // Bit offset (from the right shift) of the narrow value inside the load.
   unsigned ShAmt = 0;
   if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
     SDValue SRL = N0;
     if (auto *ConstShift = dyn_cast<ConstantSDNode>(SRL.getOperand(1))) {
       ShAmt = ConstShift->getZExtValue();
       unsigned EVTBits = ExtVT.getSizeInBits();
       // Is the shift amount a multiple of size of VT?
       // NOTE(review): the bit-mask test is only a multiple-of check when
       // EVTBits is a power of two — confirm other widths are rejected later
       // (presumably by isLegalNarrowLdSt).
       if ((ShAmt & (EVTBits-1)) == 0) {
         N0 = N0.getOperand(0);
         // Is the load width a multiple of size of VT?
         if ((N0.getValueSizeInBits() & (EVTBits-1)) != 0)
           return SDValue();
       }
 
       // At this point, we must have a load or else we can't do the transform.
       if (!isa<LoadSDNode>(N0)) return SDValue();
 
       auto *LN0 = cast<LoadSDNode>(N0);
 
       // Because a SRL must be assumed to *need* to zero-extend the high bits
       // (as opposed to anyext the high bits), we can't combine the zextload
       // lowering of SRL and an sextload.
       if (LN0->getExtensionType() == ISD::SEXTLOAD)
         return SDValue();
 
       // If the shift amount is larger than the input type then we're not
       // accessing any of the loaded bytes.  If the load was a zextload/extload
       // then the result of the shift+trunc is zero/undef (handled elsewhere).
       if (ShAmt >= LN0->getMemoryVT().getSizeInBits())
         return SDValue();
 
       // If the SRL is only used by a masking AND, we may be able to adjust
       // the ExtVT to make the AND redundant.
       SDNode *Mask = *(SRL->use_begin());
       if (Mask->getOpcode() == ISD::AND &&
           isa<ConstantSDNode>(Mask->getOperand(1))) {
         const APInt &ShiftMask =
           cast<ConstantSDNode>(Mask->getOperand(1))->getAPIntValue();
         if (ShiftMask.isMask()) {
           EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
                                            ShiftMask.countTrailingOnes());
           // If the mask is smaller, recompute the type.
           if ((ExtVT.getSizeInBits() > MaskedVT.getSizeInBits()) &&
               TLI.isLoadExtLegal(ExtType, N0.getValueType(), MaskedVT))
             ExtVT = MaskedVT;
         }
       }
     }
   }
 
   // If the load is shifted left (and the result isn't shifted back right),
   // we can fold the truncate through the shift.
   unsigned ShLeftAmt = 0;
   if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
       ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
     if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
       // Remember the shift so it can be re-applied to the narrowed load.
       ShLeftAmt = N01->getZExtValue();
       N0 = N0.getOperand(0);
     }
   }
 
   // If we haven't found a load, we can't narrow it.
   if (!isa<LoadSDNode>(N0))
     return SDValue();
 
   // Ask the legality helper whether this load may be narrowed to ExtVT at
   // bit offset ShAmt.
   LoadSDNode *LN0 = cast<LoadSDNode>(N0);
   if (!isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))
     return SDValue();
 
   // For big endian targets, we need to adjust the offset to the pointer to
   // load the correct bytes.
   if (DAG.getDataLayout().isBigEndian()) {
     unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits();
     unsigned EVTStoreBits = ExtVT.getStoreSizeInBits();
     ShAmt = LVTStoreBits - EVTStoreBits - ShAmt;
   }
 
   // Convert the bit offset to bytes and derive the alignment of the new
   // address from the original alignment and that offset.
   EVT PtrType = N0.getOperand(1).getValueType();
   uint64_t PtrOff = ShAmt / 8;
   unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
   SDLoc DL(LN0);
   // The original load itself didn't wrap, so an offset within it doesn't.
   SDNodeFlags Flags;
   Flags.setNoUnsignedWrap(true);
   SDValue NewPtr = DAG.getNode(ISD::ADD, DL,
                                PtrType, LN0->getBasePtr(),
                                DAG.getConstant(PtrOff, DL, PtrType),
                                Flags);
   AddToWorklist(NewPtr.getNode());
 
   // Emit a plain load when no extension is involved, otherwise an extending
   // load of kind ExtType that reads ExtVT from memory. Both reuse the old
   // load's chain, memory-operand flags and AA info.
   SDValue Load;
   if (ExtType == ISD::NON_EXTLOAD)
     Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr,
                        LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
                        LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
   else
     Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(), NewPtr,
                           LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
                           NewAlign, LN0->getMemOperand()->getFlags(),
                           LN0->getAAInfo());
 
   // Replace the old load's chain with the new load's chain.
   WorklistRemover DeadNodes(*this);
   DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
 
   // Shift the result left, if we've swallowed a left shift.
   SDValue Result = Load;
   if (ShLeftAmt != 0) {
     // Use the target's shift-amount type unless it cannot represent the
     // amount, in which case fall back to VT.
     EVT ShImmTy = getShiftAmountTy(Result.getValueType());
     if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt))
       ShImmTy = VT;
     // If the shift amount is as large as the result size (but, presumably,
     // no larger than the source) then the useful bits of the result are
     // zero; we can't simply return the shortened shift, because the result
     // of that operation is undefined.
     SDLoc DL(N0);
     if (ShLeftAmt >= VT.getSizeInBits())
       Result = DAG.getConstant(0, DL, VT);
     else
       Result = DAG.getNode(ISD::SHL, DL, VT,
                           Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
   }
 
   // Return the new loaded value.
   return Result;
 }
 
 /// Combine an ISD::SIGN_EXTEND_INREG node.
 ///
 /// Operand 0 is the value, operand 1 is a VTSDNode naming the type whose sign
 /// bit is replicated into the upper bits of the result. The folds are tried
 /// in the order they appear below; the first one that applies wins.
 ///
 /// \param N the SIGN_EXTEND_INREG node being visited.
 /// \returns the replacement value when a fold produced one; SDValue(N, 0) when
 /// the DAG was already updated in place (SimplifyDemandedBits / CombineTo);
 /// or an empty SDValue when no fold applied.
 SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   // The type being extended from. Named ExtVT so that it does not shadow the
   // EVT type name for the rest of the function.
   EVT ExtVT = cast<VTSDNode>(N1)->getVT();
   unsigned VTBits = VT.getScalarSizeInBits();
   unsigned ExtVTBits = ExtVT.getScalarSizeInBits();
 
   if (N0.isUndef())
     return DAG.getUNDEF(VT);
 
   // fold (sext_in_reg c1) -> c1
   // Rebuilding the node on a constant operand lets getNode fold it.
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
     return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);
 
   // If the input is already sign extended, just drop the extension.
   if (DAG.ComputeNumSignBits(N0) >= VTBits-ExtVTBits+1)
     return N0;
 
   // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
   if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
       ExtVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
     return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                        N0.getOperand(0), N1);
 
   // fold (sext_in_reg (sext x)) -> (sext x)
   // fold (sext_in_reg (aext x)) -> (sext x)
   // if x is small enough.
   if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
     SDValue N00 = N0.getOperand(0);
     // SIGN_EXTEND is a unary node: build it from x alone. Passing the VT
     // operand N1 as well would create a malformed two-operand SIGN_EXTEND.
     if (N00.getScalarValueSizeInBits() <= ExtVTBits &&
         (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
       return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
   }
 
   // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x)
   if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
        N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
        N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
       N0.getOperand(0).getScalarValueSizeInBits() == ExtVTBits) {
     if (!LegalOperations ||
         TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
       return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT);
   }
 
   // fold (sext_in_reg (zext x)) -> (sext x)
   // iff we are extending the source sign bit.
   if (N0.getOpcode() == ISD::ZERO_EXTEND) {
     SDValue N00 = N0.getOperand(0);
     // As above, the replacement SIGN_EXTEND takes x as its only operand.
     if (N00.getScalarValueSizeInBits() == ExtVTBits &&
         (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
       return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
   }
 
   // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
   if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1)))
     return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT.getScalarType());
 
   // fold operands of sext_in_reg based on knowledge that the top bits are not
   // demanded.
   if (SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   // fold (sext_in_reg (load x)) -> (smaller sextload x)
   // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
   if (SDValue NarrowLoad = ReduceLoadWidth(N))
     return NarrowLoad;
 
   // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
   // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
   // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
   if (N0.getOpcode() == ISD::SRL) {
     if (ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
       if (ShAmt->getZExtValue()+ExtVTBits <= VTBits) {
         // We can turn this into an SRA iff the input to the SRL is already sign
         // extended enough.
         unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
         if (VTBits-(ShAmt->getZExtValue()+ExtVTBits) < InSignBits)
           return DAG.getNode(ISD::SRA, SDLoc(N), VT,
                              N0.getOperand(0), N0.getOperand(1));
       }
   }
 
   // fold (sext_inreg (extload x)) -> (sextload x)
   // If sextload is not supported by target, we can only do the combine when
   // load has one use. Doing otherwise can block folding the extload with other
   // extends that the target does support.
   if (ISD::isEXTLoad(N0.getNode()) &&
       ISD::isUNINDEXEDLoad(N0.getNode()) &&
       ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile() &&
         N0.hasOneUse()) ||
        TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
                                      LN0->getBasePtr(), ExtVT,
                                      LN0->getMemOperand());
     // Replace both N and the old load (value and chain) with the sextload.
     CombineTo(N, ExtLoad);
     CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
     AddToWorklist(ExtLoad.getNode());
     return SDValue(N, 0);   // Return N so it doesn't get rechecked!
   }
   // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
   if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
       N0.hasOneUse() &&
       ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
        TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
                                      LN0->getBasePtr(), ExtVT,
                                      LN0->getMemOperand());
     CombineTo(N, ExtLoad);
     CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
     return SDValue(N, 0);   // Return N so it doesn't get rechecked!
   }
 
   // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
   if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
     if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
                                            N0.getOperand(1), false))
       return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                          BSwap, N1);
   }
 
   // No fold applied.
   return SDValue();
 }
 
 /// Combine an ISD::SIGN_EXTEND_VECTOR_INREG node.
 ///
 /// \param N the node being visited.
 /// \returns UNDEF of the result type for an undef source, the folded node
 /// when tryToFoldExtendOfConstant succeeds, or an empty SDValue otherwise.
 SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
   EVT ResultVT = N->getValueType(0);
 
   // Extending an undef source yields undef.
   if (N->getOperand(0).isUndef())
     return DAG.getUNDEF(ResultVT);
 
   // Otherwise the only fold attempted here is the constant-operand one.
   SDNode *Folded =
       tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations);
   return Folded ? SDValue(Folded, 0) : SDValue();
 }
 
 /// Combine an ISD::ZERO_EXTEND_VECTOR_INREG node.
 ///
 /// \param N the node being visited.
 /// \returns UNDEF of the result type for an undef source, the folded node
 /// when tryToFoldExtendOfConstant succeeds, or an empty SDValue otherwise.
 SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
   EVT ResultVT = N->getValueType(0);
 
   // Extending an undef source yields undef.
   if (N->getOperand(0).isUndef())
     return DAG.getUNDEF(ResultVT);
 
   // Otherwise the only fold attempted here is the constant-operand one.
   SDNode *Folded =
       tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes, LegalOperations);
   return Folded ? SDValue(Folded, 0) : SDValue();
 }
 
 SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
   bool isLE = DAG.getDataLayout().isLittleEndian();
 
   // noop truncate
   if (N0.getValueType() == N->getValueType(0))
     return N0;
 
   // fold (truncate (truncate x)) -> (truncate x)
   if (N0.getOpcode() == ISD::TRUNCATE)
     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
 
   // fold (truncate c1) -> c1
   if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
     SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
     if (C.getNode() != N)
       return C;
   }
 
   // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
   if (N0.getOpcode() == ISD::ZERO_EXTEND ||
       N0.getOpcode() == ISD::SIGN_EXTEND ||
       N0.getOpcode() == ISD::ANY_EXTEND) {
     // if the source is smaller than the dest, we still need an extend.
     if (N0.getOperand(0).getValueType().bitsLT(VT))
       return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
     // if the source is larger than the dest, than we just need the truncate.
     if (N0.getOperand(0).getValueType().bitsGT(VT))
       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
     // if the source and dest are the same type, we can drop both the extend
     // and the truncate.
     return N0.getOperand(0);
   }
 
   // If this is anyext(trunc), don't fold it, allow ourselves to be folded.
   if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))
     return SDValue();
 
   // Fold extract-and-trunc into a narrow extract. For example:
   //   i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
   //   i32 y = TRUNCATE(i64 x)
   //        -- becomes --
   //   v16i8 b = BITCAST (v2i64 val)
   //   i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8)
   //
   // Note: We only run this optimization after type legalization (which often
   // creates this pattern) and before operation legalization after which
   // we need to be more careful about the vector instructions that we generate.
   if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
       LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
     EVT VecTy = N0.getOperand(0).getValueType();
     EVT ExTy = N0.getValueType();
     EVT TrTy = N->getValueType(0);
 
     unsigned NumElem = VecTy.getVectorNumElements();
     unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();
 
     EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem);
     assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");
 
     SDValue EltNo = N0->getOperand(1);
     if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
       int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
       EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
       int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
 
       SDLoc DL(N);
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
                          DAG.getBitcast(NVT, N0.getOperand(0)),
                          DAG.getConstant(Index, DL, IndexTy));
     }
   }
 
   // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
   if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
     EVT SrcVT = N0.getValueType();
     if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
         TLI.isTruncateFree(SrcVT, VT)) {
       SDLoc SL(N0);
       SDValue Cond = N0.getOperand(0);
       SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
       SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
       return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
     }
   }
 
   // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
   if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) &&
       TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
     SDValue Amt = N0.getOperand(1);
     KnownBits Known;
     DAG.computeKnownBits(Amt, Known);
     unsigned Size = VT.getScalarSizeInBits();
     if (Known.getBitWidth() - Known.countMinLeadingZeros() <= Log2_32(Size)) {
       SDLoc SL(N);
       EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
 
       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
       if (AmtVT != Amt.getValueType()) {
         Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
         AddToWorklist(Amt.getNode());
       }
       return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
     }
   }
 
   // Fold a series of buildvector, bitcast, and truncate if possible.
   // For example fold
   //   (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
   //   (2xi32 (buildvector x, y)).
   if (Level == AfterLegalizeVectorOps && VT.isVector() &&
       N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
       N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
       N0.getOperand(0).hasOneUse()) {
     SDValue BuildVect = N0.getOperand(0);
     EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
     EVT TruncVecEltTy = VT.getVectorElementType();
 
     // Check that the element types match.
     if (BuildVectEltTy == TruncVecEltTy) {
       // Now we only need to compute the offset of the truncated elements.
       unsigned BuildVecNumElts =  BuildVect.getNumOperands();
       unsigned TruncVecNumElts = VT.getVectorNumElements();
       unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts;
 
       assert((BuildVecNumElts % TruncVecNumElts) == 0 &&
              "Invalid number of elements");
 
       SmallVector<SDValue, 8> Opnds;
       for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
         Opnds.push_back(BuildVect.getOperand(i));
 
       return DAG.getBuildVector(VT, SDLoc(N), Opnds);
     }
   }
 
   // See if we can simplify the input to this truncate through knowledge that
   // only the low bits are being used.
   // For example "trunc (or (shl x, 8), y)" // -> trunc y
   // Currently we only perform this optimization on scalars because vectors
   // may have different active low bits.
   if (!VT.isVector()) {
     APInt Mask =
         APInt::getLowBitsSet(N0.getValueSizeInBits(), VT.getSizeInBits());
     if (SDValue Shorter = DAG.GetDemandedBits(N0, Mask))
       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
   }
 
   // fold (truncate (load x)) -> (smaller load x)
   // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
   if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
     if (SDValue Reduced = ReduceLoadWidth(N))
       return Reduced;
 
     // Handle the case where the load remains an extending load even
     // after truncation.
     if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       if (!LN0->isVolatile() &&
           LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) {
         SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
                                          VT, LN0->getChain(), LN0->getBasePtr(),
                                          LN0->getMemoryVT(),
                                          LN0->getMemOperand());
         DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));
         return NewLoad;
       }
     }
   }
 
   // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
   // where ... are all 'undef'.
   if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
     SmallVector<EVT, 8> VTs;
     SDValue V;
     unsigned Idx = 0;
     unsigned NumDefs = 0;
 
     for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
       SDValue X = N0.getOperand(i);
       if (!X.isUndef()) {
         V = X;
         Idx = i;
         NumDefs++;
       }
       // Stop if more than one members are non-undef.
       if (NumDefs > 1)
         break;
       VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
                                      VT.getVectorElementType(),
                                      X.getValueType().getVectorNumElements()));
     }
 
     if (NumDefs == 0)
       return DAG.getUNDEF(VT);
 
     if (NumDefs == 1) {
       assert(V.getNode() && "The single defined operand is empty!");
       SmallVector<SDValue, 8> Opnds;
       for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
         if (i != Idx) {
           Opnds.push_back(DAG.getUNDEF(VTs[i]));
           continue;
         }
         SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
         AddToWorklist(NV.getNode());
         Opnds.push_back(NV);
       }
       return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
     }
   }
 
   // Fold truncate of a bitcast of a vector to an extract of the low vector
   // element.
   //
   // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
   if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
     SDValue VecSrc = N0.getOperand(0);
     EVT SrcVT = VecSrc.getValueType();
     if (SrcVT.isVector() && SrcVT.getScalarType() == VT &&
         (!LegalOperations ||
          TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, SrcVT))) {
       SDLoc SL(N);
 
       EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
       unsigned Idx = isLE ? 0 : SrcVT.getVectorNumElements() - 1;
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT,
                          VecSrc, DAG.getConstant(Idx, SL, IdxVT));
     }
   }
 
   // Simplify the operands using demanded-bits information.
   if (!VT.isVector() &&
       SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
   // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
   // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry)
   // When the adde's carry is not used.
   if ((N0.getOpcode() == ISD::ADDE || N0.getOpcode() == ISD::ADDCARRY) &&
       N0.hasOneUse() && !N0.getNode()->hasAnyUseOfValue(1) &&
       (!LegalOperations || TLI.isOperationLegal(N0.getOpcode(), VT))) {
     SDLoc SL(N);
     auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
     auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
     auto VTs = DAG.getVTList(VT, N0->getValueType(1));
     return DAG.getNode(N0.getOpcode(), SL, VTs, X, Y, N0.getOperand(2));
   }
 
   // fold (truncate (extract_subvector(ext x))) ->
   //      (extract_subvector x)
   // TODO: This can be generalized to cover cases where the truncate and extract
   // do not fully cancel each other out.
   if (!LegalTypes && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
     SDValue N00 = N0.getOperand(0);
     if (N00.getOpcode() == ISD::SIGN_EXTEND ||
         N00.getOpcode() == ISD::ZERO_EXTEND ||
         N00.getOpcode() == ISD::ANY_EXTEND) {
       if (N00.getOperand(0)->getValueType(0).getVectorElementType() ==
           VT.getVectorElementType())
         return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT,
                            N00.getOperand(0), N0.getOperand(1));
     }
   }
 
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
 
   return SDValue();
 }
 
 /// Return the node that produces operand \p i of \p N. If that operand is a
 /// result of a MERGE_VALUES node, look through the merge and return the node
 /// behind the selected result instead.
 static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
   SDValue Operand = N->getOperand(i);
   // The result number selects which MERGE_VALUES operand is forwarded.
   if (Operand.getOpcode() == ISD::MERGE_VALUES)
     Operand = Operand.getOperand(Operand.getResNo());
   return Operand.getNode();
 }
 
 /// Fold build_pair (load, load) into a single load of type \p VT when the two
 /// loads read consecutive memory locations. Returns an empty SDValue when the
 /// fold does not apply.
 SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
   assert(N->getOpcode() == ISD::BUILD_PAIR);
 
   LoadSDNode *BaseLD = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
   LoadSDNode *NextLD = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
 
   // BUILD_PAIR keeps the least significant part in element 0 and the most
   // significant part in element 1, so on big-endian targets the roles of the
   // two loads are exchanged before looking for adjacency.
   if (DAG.getDataLayout().isBigEndian())
     std::swap(BaseLD, NextLD);
 
   // Both elements must be loads from the same address space.
   if (!BaseLD || !NextLD)
     return SDValue();
   // The base load must be a plain (non-extending) load with a single use.
   if (!ISD::isNON_EXTLoad(BaseLD) || !BaseLD->hasOneUse())
     return SDValue();
   if (BaseLD->getAddressSpace() != NextLD->getAddressSpace())
     return SDValue();
 
   const EVT BaseVT = BaseLD->getValueType(0);
   const unsigned BaseBytes = BaseVT.getStoreSize();
 
   // The second load has the same restrictions and must sit exactly one
   // element past the base load.
   if (!ISD::isNON_EXTLoad(NextLD) || !NextLD->hasOneUse())
     return SDValue();
   if (!DAG.areNonVolatileConsecutiveLoads(NextLD, BaseLD, BaseBytes, 1))
     return SDValue();
 
   // The merged load reuses the base load's alignment, so that alignment has to
   // satisfy the ABI alignment of the wider type.
   const unsigned BaseAlign = BaseLD->getAlignment();
   const unsigned WideABIAlign = DAG.getDataLayout().getABITypeAlignment(
       VT.getTypeForEVT(*DAG.getContext()));
   if (WideABIAlign > BaseAlign)
     return SDValue();
 
   if (LegalOperations && !TLI.isOperationLegal(ISD::LOAD, VT))
     return SDValue();
 
   return DAG.getLoad(VT, SDLoc(N), BaseLD->getChain(), BaseLD->getBasePtr(),
                      BaseLD->getPointerInfo(), BaseAlign);
 }
 
 /// Return the EXTRACT_ELEMENT index used to pick the Hi half of a ppcf128
 /// value after it has been bitcast to i128.
 static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
   // Bitcasting ppcf128 to i128 swaps the Hi and Lo parts on little-endian
   // machines but leaves them in place on big-endian machines.
   if (DAG.getDataLayout().isBigEndian())
     return 1;
   return 0;
 }
 
 /// Recognize integer sign-bit logic sandwiched between two bitcasts and
 /// replace it with the matching FP node:
 ///   (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
 ///   (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
 /// Returns an empty SDValue when the pattern does not match.
 static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
   // Only bitcasts that produce an FP type are candidates, and only on targets
   // that report bit-preserving FP logic for that type.
   const EVT VT = N->getValueType(0);
   if (!VT.isFloatingPoint())
     return SDValue();
   if (!TLI.hasBitPreservingFPLogic(VT))
     return SDValue();
 
   // TODO: Use splat values for the constant-checking below and remove this
   // restriction.
   SDValue Logic = N->getOperand(0);
   const EVT IntVT = Logic.getValueType();
   if (IntVT.isVector())
     return SDValue();
 
   // TODO: ISD::OR --> ISD::FNABS?
   const unsigned LogicOpc = Logic.getOpcode();
   const bool IsAnd = LogicOpc == ISD::AND;
   if (!IsAnd && LogicOpc != ISD::XOR)
     return SDValue();
 
   // AND with every bit but the sign bit is fabs; XOR with only the sign bit is
   // fneg.
   const unsigned FPOpcode = IsAnd ? ISD::FABS : ISD::FNEG;
   APInt SignMask = APInt::getSignMask(IntVT.getSizeInBits());
   if (IsAnd)
     SignMask = ~SignMask;
 
   // The right-hand operand must be exactly the expected mask constant.
   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Logic.getOperand(1));
   if (!MaskC || !(MaskC->getAPIntValue() == SignMask))
     return SDValue();
 
   // The left-hand operand must be a bitcast from the very FP type we are
   // casting back to.
   SDValue InnerCast = Logic.getOperand(0);
   if (InnerCast.getOpcode() != ISD::BITCAST)
     return SDValue();
   SDValue FPSrc = InnerCast.getOperand(0);
   if (!(FPSrc.getValueType() == VT))
     return SDValue();
 
   return DAG.getNode(FPOpcode, SDLoc(N), VT, FPSrc);
 }
 
 /// Combine a BITCAST node. The folds are tried in this order: undef and
 /// constant inputs, bitcast-of-bitcast, retyping a feeding load, turning
 /// integer sign-bit logic into FABS/FNEG (and the reverse direction for
 /// FNEG/FABS/FCOPYSIGN inputs), merging a build_pair of consecutive loads, and
 /// removing the bitcasts around a vector shuffle.
 /// Returns an empty SDValue when no fold applies.
 SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
   // A bitcast of undef is undef of the destination type.
   if (N0.isUndef())
     return DAG.getUNDEF(VT);
 
   // If the input is a BUILD_VECTOR with all constant elements, fold this now.
   // Only do this before legalize, since afterward the target may be depending
   // on the bitconvert.
   // First check to see if this is all constant.
   if (!LegalTypes &&
       N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
       VT.isVector()) {
     bool isSimple = cast<BuildVectorSDNode>(N0)->isConstant();
 
     EVT DestEltVT = N->getValueType(0).getVectorElementType();
     assert(!DestEltVT.isVector() &&
            "Element type of vector ValueType must not be vector!");
     if (isSimple)
       return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(), DestEltVT);
   }
 
   // If the input is a constant, let getNode fold it.
   // We always need to check that this is just a fp -> int or int -> fp
   // conversion, otherwise we will get back N which will confuse the caller
   // into thinking we used CombineTo. This can block target combines from
   // running. If we only allow legal operations, we need to ensure the
   // resulting operation will be legal.
   // TODO: Maybe we should check that the return value isn't N explicitly?
   if ((isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
        (!LegalOperations || TLI.isOperationLegal(ISD::ConstantFP, VT))) ||
       (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
        (!LegalOperations || TLI.isOperationLegal(ISD::Constant, VT))))
     return DAG.getBitcast(VT, N0);
 
   // (conv (conv x, t1), t2) -> (conv x, t2)
   if (N0.getOpcode() == ISD::BITCAST)
     return DAG.getBitcast(VT, N0.getOperand(0));
 
   // fold (conv (load x)) -> (load (conv*)x)
   // If the resultant load doesn't need a higher alignment than the original!
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
       // Do not change the width of a volatile load.
       !cast<LoadSDNode>(N0)->isVolatile() &&
       // Do not remove the cast if the types differ in endian layout.
       TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
           TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
       (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
       TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     unsigned OrigAlign = LN0->getAlignment();
 
     // Only retype the load if the target reports the access in the new type,
     // at the original alignment, as both allowed and fast.
     bool Fast = false;
     if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                                LN0->getAddressSpace(), OrigAlign, &Fast) &&
         Fast) {
       SDValue Load =
           DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                       LN0->getPointerInfo(), OrigAlign,
                       LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
       // Redirect users of the old load's chain result to the new load's chain.
       DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
       return Load;
     }
   }
 
   // Integer sign-bit AND/XOR between two bitcasts -> FABS/FNEG.
   if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
     return V;
 
   // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
   // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
   //
   // For ppc_fp128:
   // fold (bitcast (fneg x)) ->
   //     flipbit = signbit
   //     (xor (bitcast x) (build_pair flipbit, flipbit))
   //
   // fold (bitcast (fabs x)) ->
   //     flipbit = (and (extract_element (bitcast x), 0), signbit)
   //     (xor (bitcast x) (build_pair flipbit, flipbit))
   // This often reduces constant pool loads.
   if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) ||
        (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
       N0.getNode()->hasOneUse() && VT.isInteger() &&
       !VT.isVector() && !N0.getValueType().isVector()) {
     SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
     AddToWorklist(NewConv.getNode());
 
     SDLoc DL(N);
     if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
       assert(VT.getSizeInBits() == 128);
       // Sign mask for one 64-bit half of the 128-bit integer.
       SDValue SignBit = DAG.getConstant(
           APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
       SDValue FlipBit;
       if (N0.getOpcode() == ISD::FNEG) {
         FlipBit = SignBit;
         AddToWorklist(FlipBit.getNode());
       } else {
         assert(N0.getOpcode() == ISD::FABS);
         // For fabs, flip the sign only if the Hi half currently has it set.
         SDValue Hi =
             DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv,
                         DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
                                               SDLoc(NewConv)));
         AddToWorklist(Hi.getNode());
         FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit);
         AddToWorklist(FlipBit.getNode());
       }
       // The same flip bit is applied to both 64-bit halves.
       SDValue FlipBits =
           DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
       AddToWorklist(FlipBits.getNode());
       return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits);
     }
     APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
     if (N0.getOpcode() == ISD::FNEG)
       return DAG.getNode(ISD::XOR, DL, VT,
                          NewConv, DAG.getConstant(SignBit, DL, VT));
     assert(N0.getOpcode() == ISD::FABS);
     return DAG.getNode(ISD::AND, DL, VT,
                        NewConv, DAG.getConstant(~SignBit, DL, VT));
   }
 
   // fold (bitconvert (fcopysign cst, x)) ->
   //         (or (and (bitconvert x), sign), (and cst, (not sign)))
   // Note that we don't handle (copysign x, cst) because this can always be
   // folded to an fneg or fabs.
   //
   // For ppc_fp128:
   // fold (bitcast (fcopysign cst, x)) ->
   //     flipbit = (and (extract_element
   //                     (xor (bitcast cst), (bitcast x)), 0),
   //                    signbit)
   //     (xor (bitcast cst) (build_pair flipbit, flipbit))
   if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() &&
       isa<ConstantFPSDNode>(N0.getOperand(0)) &&
       VT.isInteger() && !VT.isVector()) {
     unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits();
     EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
     if (isTypeLegal(IntXVT)) {
       SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1));
       AddToWorklist(X.getNode());
 
       // If X has a different width than the result/lhs, sext it or truncate it.
       unsigned VTWidth = VT.getSizeInBits();
       if (OrigXWidth < VTWidth) {
         X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
         AddToWorklist(X.getNode());
       } else if (OrigXWidth > VTWidth) {
         // To get the sign bit in the right place, we have to shift it right
         // before truncating.
         SDLoc DL(X);
         X = DAG.getNode(ISD::SRL, DL,
                         X.getValueType(), X,
                         DAG.getConstant(OrigXWidth-VTWidth, DL,
                                         X.getValueType()));
         AddToWorklist(X.getNode());
         X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
         AddToWorklist(X.getNode());
       }
 
       if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
         APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2);
         SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
         AddToWorklist(Cst.getNode());
         // NOTE(review): this X shadows the width-adjusted X computed above;
         // the outer X is not used on this ppcf128 path.
         SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
         AddToWorklist(X.getNode());
         // The XOR has its sign bit set exactly where cst and x disagree.
         SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X);
         AddToWorklist(XorResult.getNode());
         SDValue XorResult64 = DAG.getNode(
             ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult,
             DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
                                   SDLoc(XorResult)));
         AddToWorklist(XorResult64.getNode());
         SDValue FlipBit =
             DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64,
                         DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64));
         AddToWorklist(FlipBit.getNode());
         SDValue FlipBits =
             DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
         AddToWorklist(FlipBits.getNode());
         return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits);
       }
       // Generic case: take the sign bit from X and every other bit from the
       // constant.
       APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
       X = DAG.getNode(ISD::AND, SDLoc(X), VT,
                       X, DAG.getConstant(SignBit, SDLoc(X), VT));
       AddToWorklist(X.getNode());
 
       SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
       Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
                         Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT));
       AddToWorklist(Cst.getNode());
 
       return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
     }
   }
 
   // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
   if (N0.getOpcode() == ISD::BUILD_PAIR)
     if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
       return CombineLD;
 
   // Remove double bitcasts from shuffles - this is often a legacy of
   // XformToShuffleWithZero being used to combine bitmaskings (of
   // float vectors bitcast to integer vectors) into shuffles.
   // bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1)
   // The result type must have a whole multiple of the shuffle's element count
   // so that each shuffle mask entry can be expanded evenly.
   if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() &&
       N0->getOpcode() == ISD::VECTOR_SHUFFLE &&
       VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() &&
       !(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) {
     ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);
 
     // If operands are a bitcast, peek through if it casts the original VT.
     // If operands are a constant, just bitcast back to original VT.
     // Any other operand yields an empty SDValue.
     auto PeekThroughBitcast = [&](SDValue Op) {
       if (Op.getOpcode() == ISD::BITCAST &&
           Op.getOperand(0).getValueType() == VT)
         return SDValue(Op.getOperand(0));
       if (Op.isUndef() || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
           ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
         return DAG.getBitcast(VT, Op);
       return SDValue();
     };
 
     // FIXME: If either input vector is bitcast, try to convert the shuffle to
     // the result type of this bitcast. This would eliminate at least one
     // bitcast. See the transform in InstCombine.
     SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
     SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
     // Both shuffle inputs must be expressible in VT; otherwise give up.
     if (!(SV0 && SV1))
       return SDValue();
 
     // Each original mask entry M expands to MaskScale consecutive entries
     // M*MaskScale .. M*MaskScale+MaskScale-1; undef (negative) entries stay -1.
     int MaskScale =
         VT.getVectorNumElements() / N0.getValueType().getVectorNumElements();
     SmallVector<int, 8> NewMask;
     for (int M : SVN->getMask())
       for (int i = 0; i != MaskScale; ++i)
         NewMask.push_back(M < 0 ? -1 : M * MaskScale + i);
 
     // If the mask is not legal as-is, retry with the operands commuted.
     bool LegalMask = TLI.isShuffleMaskLegal(NewMask, VT);
     if (!LegalMask) {
       std::swap(SV0, SV1);
       ShuffleVectorSDNode::commuteMask(NewMask);
       LegalMask = TLI.isShuffleMaskLegal(NewMask, VT);
     }
 
     if (LegalMask)
       return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask);
   }
 
   return SDValue();
 }
 
 /// Combine a BUILD_PAIR node. The only fold attempted is merging two
 /// consecutive loads into a single load of the pair's result type.
 SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
   return CombineConsecutiveLoads(N, N->getValueType(0));
 }
 
 /// We know that BV is a build_vector node with Constant, ConstantFP or Undef
 /// operands. DstEltVT indicates the destination element value type.
 ///
 /// Returns the vector obtained by reinterpreting the bits of \p BV as elements
 /// of type \p DstEltVT. Same-width conversions bitcast each element; FP
 /// elements of a different width are routed through same-sized integers; and
 /// integer elements are merged (growing) or split (shrinking) according to
 /// the target's endianness.
 SDValue DAGCombiner::
 ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
   EVT SrcEltVT = BV->getValueType(0).getVectorElementType();
 
   // If this is already the right type, we're done.
   if (SrcEltVT == DstEltVT) return SDValue(BV, 0);
 
   unsigned SrcBitSize = SrcEltVT.getSizeInBits();
   unsigned DstBitSize = DstEltVT.getSizeInBits();
 
   // If this is a conversion of N elements of one type to N elements of another
   // type, convert each element.  This handles FP<->INT cases.
   if (SrcBitSize == DstBitSize) {
     EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
                               BV->getValueType(0).getVectorNumElements());
 
     // Due to the FP element handling below calling this routine recursively,
     // we can end up with a scalar-to-vector node here.
     if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR)
       return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BV), VT,
                          DAG.getBitcast(DstEltVT, BV->getOperand(0)));
 
     SmallVector<SDValue, 8> Ops;
     for (SDValue Op : BV->op_values()) {
       // If the vector element type is not legal, the BUILD_VECTOR operands
       // are promoted and implicitly truncated.  Make that explicit here.
       if (Op.getValueType() != SrcEltVT)
         Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
       Ops.push_back(DAG.getBitcast(DstEltVT, Op));
       AddToWorklist(Ops.back().getNode());
     }
     return DAG.getBuildVector(VT, SDLoc(BV), Ops);
   }
 
   // Otherwise, we're growing or shrinking the elements.  To avoid having to
   // handle annoying details of growing/shrinking FP values, we convert them to
   // int first.
   if (SrcEltVT.isFloatingPoint()) {
     // Convert the input float vector to a int vector where the elements are the
     // same sizes.
     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
     BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
     SrcEltVT = IntVT;
   }
 
   // Now we know the input is an integer vector.  If the output is a FP type,
   // convert to integer first, then to FP of the right size.
   if (DstEltVT.isFloatingPoint()) {
     EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
     SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();
 
     // Next, convert to FP elements of the same size.
     return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
   }
 
   SDLoc DL(BV);
 
   // Okay, we know the src/dst types are both integers of differing types.
   // Handling growing first.
   assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
   if (SrcBitSize < DstBitSize) {
     unsigned NumInputsPerOutput = DstBitSize/SrcBitSize;
 
     SmallVector<SDValue, 8> Ops;
     for (unsigned i = 0, e = BV->getNumOperands(); i != e;
          i += NumInputsPerOutput) {
       bool isLE = DAG.getDataLayout().isLittleEndian();
       APInt NewBits = APInt(DstBitSize, 0);
       // The output element is undef only if every contributing input is undef.
       bool EltIsUndef = true;
       for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
         // Shift the previously computed bits over.
         NewBits <<= SrcBitSize;
         // Inputs are visited from most to least significant: on little-endian
         // targets that is the highest index of the group first, on big-endian
         // targets the lowest.
         SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
         // Undef inputs contribute zero bits.
         if (Op.isUndef()) continue;
         EltIsUndef = false;
 
         NewBits |= cast<ConstantSDNode>(Op)->getAPIntValue().
                    zextOrTrunc(SrcBitSize).zext(DstBitSize);
       }
 
       if (EltIsUndef)
         Ops.push_back(DAG.getUNDEF(DstEltVT));
       else
         Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT));
     }
 
     EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
     return DAG.getBuildVector(VT, DL, Ops);
   }
 
   // Finally, this must be the case where we are shrinking elements: each input
   // turns into multiple outputs.
   unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
   EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
                             NumOutputsPerInput*BV->getNumOperands());
   SmallVector<SDValue, 8> Ops;
 
   for (const SDValue &Op : BV->op_values()) {
     // An undef input produces NumOutputsPerInput undef outputs.
     if (Op.isUndef()) {
       Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT));
       continue;
     }
 
     APInt OpVal = cast<ConstantSDNode>(Op)->
                   getAPIntValue().zextOrTrunc(SrcBitSize);
 
     // Peel off DstBitSize bits at a time, lowest bits first.
     for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
       APInt ThisVal = OpVal.trunc(DstBitSize);
       Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT));
       OpVal.lshrInPlace(DstBitSize);
     }
 
     // For big endian targets, swap the order of the pieces of each element.
     if (DAG.getDataLayout().isBigEndian())
       std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
   }
 
   return DAG.getBuildVector(VT, DL, Ops);
 }
 
 /// Return true if the flags on \p N allow contraction or reassociation.
 static bool isContractable(SDNode *N) {
   const SDNodeFlags NodeFlags = N->getFlags();
   if (NodeFlags.hasAllowContract())
     return true;
   return NodeFlags.hasAllowReassociation();
 }
 
 /// Try to perform FMA combining on a given FADD node.
 SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   SDLoc SL(N);
 
   const TargetOptions &Options = DAG.getTarget().Options;
 
   // Floating-point multiply-add with intermediate rounding.
   bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
 
   // Floating-point multiply-add without intermediate rounding.
   bool HasFMA =
       TLI.isFMAFasterThanFMulAndFAdd(VT) &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
 
   // No valid opcode, do not combine.
   if (!HasFMAD && !HasFMA)
     return SDValue();
 
   SDNodeFlags Flags = N->getFlags();
   bool CanFuse = Options.UnsafeFPMath || isContractable(N);
   bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
                               CanFuse || HasFMAD);
   // If the addition is not contractable, do not combine.
   if (!AllowFusionGlobally && !isContractable(N))
     return SDValue();
 
   const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
   if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
     return SDValue();
 
   // Always prefer FMAD to FMA for precision.
   unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
 
   // Is the node an FMUL and contractable either due to global flags or
   // SDNodeFlags.
   auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
     if (N.getOpcode() != ISD::FMUL)
       return false;
     return AllowFusionGlobally || isContractable(N.getNode());
   };
   // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
   // prefer to fold the multiply with fewer uses.
   if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
     if (N0.getNode()->use_size() > N1.getNode()->use_size())
       std::swap(N0, N1);
   }
 
   // fold (fadd (fmul x, y), z) -> (fma x, y, z)
   if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
                        N0.getOperand(0), N0.getOperand(1), N1, Flags);
   }
 
   // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
   // Note: Commutes FADD operands.
   if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
                        N1.getOperand(0), N1.getOperand(1), N0, Flags);
   }
 
   // Look through FP_EXTEND nodes to do more combining.
 
   // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
   if (N0.getOpcode() == ISD::FP_EXTEND) {
     SDValue N00 = N0.getOperand(0);
     if (isContractableFMUL(N00) &&
         TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                      N00.getOperand(0)),
                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                      N00.getOperand(1)), N1, Flags);
     }
   }
 
   // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
   // Note: Commutes FADD operands.
   if (N1.getOpcode() == ISD::FP_EXTEND) {
     SDValue N10 = N1.getOperand(0);
     if (isContractableFMUL(N10) &&
         TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                      N10.getOperand(0)),
                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                      N10.getOperand(1)), N0, Flags);
     }
   }
 
   // More folding opportunities when target permits.
   if (Aggressive) {
     // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z))
     if (CanFuse &&
         N0.getOpcode() == PreferredFusedOpcode &&
         N0.getOperand(2).getOpcode() == ISD::FMUL &&
         N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          N0.getOperand(0), N0.getOperand(1),
                          DAG.getNode(PreferredFusedOpcode, SL, VT,
                                      N0.getOperand(2).getOperand(0),
                                      N0.getOperand(2).getOperand(1),
                                      N1, Flags), Flags);
     }
 
     // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x))
     if (CanFuse &&
         N1->getOpcode() == PreferredFusedOpcode &&
         N1.getOperand(2).getOpcode() == ISD::FMUL &&
         N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          N1.getOperand(0), N1.getOperand(1),
                          DAG.getNode(PreferredFusedOpcode, SL, VT,
                                      N1.getOperand(2).getOperand(0),
                                      N1.getOperand(2).getOperand(1),
                                      N0, Flags), Flags);
     }
 
 
     // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
     //   -> (fma x, y, (fma (fpext u), (fpext v), z))
     auto FoldFAddFMAFPExtFMul = [&] (
       SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z,
       SDNodeFlags Flags) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
                          DAG.getNode(PreferredFusedOpcode, SL, VT,
                                      DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
                                      DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
                                      Z, Flags), Flags);
     };
     if (N0.getOpcode() == PreferredFusedOpcode) {
       SDValue N02 = N0.getOperand(2);
       if (N02.getOpcode() == ISD::FP_EXTEND) {
         SDValue N020 = N02.getOperand(0);
         if (isContractableFMUL(N020) &&
             TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) {
           return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
                                       N020.getOperand(0), N020.getOperand(1),
                                       N1, Flags);
         }
       }
     }
 
     // fold (fadd (fpext (fma x, y, (fmul u, v))), z)
     //   -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
     // FIXME: This turns two single-precision and one double-precision
     // operation into two double-precision operations, which might not be
     // interesting for all targets, especially GPUs.
     auto FoldFAddFPExtFMAFMul = [&] (
       SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z,
       SDNodeFlags Flags) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
                          DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
                          DAG.getNode(PreferredFusedOpcode, SL, VT,
                                      DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
                                      DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
                                      Z, Flags), Flags);
     };
     if (N0.getOpcode() == ISD::FP_EXTEND) {
       SDValue N00 = N0.getOperand(0);
       if (N00.getOpcode() == PreferredFusedOpcode) {
         SDValue N002 = N00.getOperand(2);
         if (isContractableFMUL(N002) &&
             TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
           return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
                                       N002.getOperand(0), N002.getOperand(1),
                                       N1, Flags);
         }
       }
     }
 
     // fold (fadd x, (fma y, z, (fpext (fmul u, v)))
     //   -> (fma y, z, (fma (fpext u), (fpext v), x))
     if (N1.getOpcode() == PreferredFusedOpcode) {
       SDValue N12 = N1.getOperand(2);
       if (N12.getOpcode() == ISD::FP_EXTEND) {
         SDValue N120 = N12.getOperand(0);
         if (isContractableFMUL(N120) &&
             TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) {
           return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
                                       N120.getOperand(0), N120.getOperand(1),
                                       N0, Flags);
         }
       }
     }
 
     // fold (fadd x, (fpext (fma y, z, (fmul u, v))))
     //   -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
     // FIXME: This turns two single-precision and one double-precision
     // operation into two double-precision operations, which might not be
     // interesting for all targets, especially GPUs.
     if (N1.getOpcode() == ISD::FP_EXTEND) {
       SDValue N10 = N1.getOperand(0);
       if (N10.getOpcode() == PreferredFusedOpcode) {
         SDValue N102 = N10.getOperand(2);
         if (isContractableFMUL(N102) &&
             TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
           return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
                                       N102.getOperand(0), N102.getOperand(1),
                                       N0, Flags);
         }
       }
     }
   }
 
   return SDValue();
 }
 
 /// Try to perform FMA combining on a given FSUB node.
 ///
 /// Looks for an FMUL feeding either operand of the subtraction \p N -- directly,
 /// or behind FNEG and/or FP_EXTEND nodes -- and rewrites the expression as a
 /// fused multiply-add. ISD::FMAD is used when it is legal, otherwise ISD::FMA.
 /// The folds are tried in source order and the first one that matches wins.
 ///
 /// \param N the FSUB node to combine.
 /// \returns the fused replacement, or an empty SDValue if the target has no
 /// usable fused opcode, fusion is not permitted, or no pattern matches.
 SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
   SDLoc SL(N);
 
   const TargetOptions &Options = DAG.getTarget().Options;
   // Floating-point multiply-add with intermediate rounding.
   // Only considered once LegalOperations is set and the target reports FMAD
   // as legal for VT.
   bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
 
   // Floating-point multiply-add without intermediate rounding.
   // Requires the target to consider FMA faster than FMUL+FADD; the legality
   // test is only applied when LegalOperations is set.
   bool HasFMA =
       TLI.isFMAFasterThanFMulAndFAdd(VT) &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
 
   // No valid opcode, do not combine.
   if (!HasFMAD && !HasFMA)
     return SDValue();
 
   const SDNodeFlags Flags = N->getFlags();
   // CanFuse: fusion is permitted for N itself (unsafe FP math is enabled, or
   // isContractable(N) holds). AllowFusionGlobally additionally accepts fast
   // FP-op fusion and the case where FMAD is available.
   bool CanFuse = Options.UnsafeFPMath || isContractable(N);
   bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
                               CanFuse || HasFMAD);
 
   // If the subtraction is not contractable, do not combine.
   // Note: isContractable(N) implies CanFuse and hence AllowFusionGlobally, so
   // the second test below never changes the outcome of this check.
   if (!AllowFusionGlobally && !isContractable(N))
     return SDValue();
 
   // Leave FMA formation alone when the subtarget's SelectionDAG info reports
   // generateFMAsInMachineCombiner for this optimization level.
   const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
   if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
     return SDValue();
 
   // Always prefer FMAD to FMA for precision.
   unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
   // Aggressive fusion drops the one-use restrictions on the simple folds and
   // enables the nested-FMA folds at the end of this function.
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
 
   // Is the node an FMUL and contractable either due to global flags or
   // SDNodeFlags.
   auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
     if (N.getOpcode() != ISD::FMUL)
       return false;
     return AllowFusionGlobally || isContractable(N.getNode());
   };
 
   // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
   if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
                        N0.getOperand(0), N0.getOperand(1),
                        DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
   }
 
   // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
   // Note: Commutes FSUB operands.
   if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
                        DAG.getNode(ISD::FNEG, SL, VT,
                                    N1.getOperand(0)),
                        N1.getOperand(1), N0, Flags);
   }
 
   // fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
   // Unless aggressive, both the FNEG and the FMUL beneath it must have a
   // single use.
   if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
       (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
     SDValue N00 = N0.getOperand(0).getOperand(0);
     SDValue N01 = N0.getOperand(0).getOperand(1);
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
                        DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
                        DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
   }
 
   // Look through FP_EXTEND nodes to do more combining.
   // Each of these folds is gated on TLI.isFPExtFoldable for the fused opcode,
   // the result type VT and the narrower source type.
 
   // fold (fsub (fpext (fmul x, y)), z)
   //   -> (fma (fpext x), (fpext y), (fneg z))
   if (N0.getOpcode() == ISD::FP_EXTEND) {
     SDValue N00 = N0.getOperand(0);
     if (isContractableFMUL(N00) &&
         TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                      N00.getOperand(0)),
                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                      N00.getOperand(1)),
                          DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
     }
   }
 
   // fold (fsub x, (fpext (fmul y, z)))
   //   -> (fma (fneg (fpext y)), (fpext z), x)
   // Note: Commutes FSUB operands.
   if (N1.getOpcode() == ISD::FP_EXTEND) {
     SDValue N10 = N1.getOperand(0);
     if (isContractableFMUL(N10) &&
         TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N10.getValueType())) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          DAG.getNode(ISD::FNEG, SL, VT,
                                      DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                  N10.getOperand(0))),
                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                      N10.getOperand(1)),
                          N0, Flags);
     }
   }
 
   // fold (fsub (fpext (fneg (fmul x, y))), z)
   //   -> (fneg (fma (fpext x), (fpext y), z))
   // Note: This could be removed with appropriate canonicalization of the
   // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
   // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
   // from implementing the canonicalization in visitFSUB.
   if (N0.getOpcode() == ISD::FP_EXTEND) {
     SDValue N00 = N0.getOperand(0);
     if (N00.getOpcode() == ISD::FNEG) {
       SDValue N000 = N00.getOperand(0);
       // The foldability query uses the type of the FNEG (N00), i.e. the
       // operand type of the FP_EXTEND.
       if (isContractableFMUL(N000) &&
           TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
         return DAG.getNode(ISD::FNEG, SL, VT,
                            DAG.getNode(PreferredFusedOpcode, SL, VT,
                                        DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                    N000.getOperand(0)),
                                        DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                    N000.getOperand(1)),
                                        N1, Flags));
       }
     }
   }
 
   // fold (fsub (fneg (fpext (fmul x, y))), z)
   //   -> (fneg (fma (fpext x), (fpext y), z))
   // Note: This could be removed with appropriate canonicalization of the
   // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
   // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
   // from implementing the canonicalization in visitFSUB.
   if (N0.getOpcode() == ISD::FNEG) {
     SDValue N00 = N0.getOperand(0);
     if (N00.getOpcode() == ISD::FP_EXTEND) {
       SDValue N000 = N00.getOperand(0);
       if (isContractableFMUL(N000) &&
           TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N000.getValueType())) {
         return DAG.getNode(ISD::FNEG, SL, VT,
                            DAG.getNode(PreferredFusedOpcode, SL, VT,
                                        DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                    N000.getOperand(0)),
                                        DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                    N000.getOperand(1)),
                                        N1, Flags));
       }
     }
   }
 
   // More folding opportunities when target permits.
   // These patterns start from an operand that is already a fused node
   // (PreferredFusedOpcode) and build a chain of two fused nodes.
   if (Aggressive) {
     // fold (fsub (fma x, y, (fmul u, v)), z)
     //   -> (fma x, y, (fma u, v, (fneg z)))
     // Requires CanFuse and that both the outer FMA and its FMUL addend have a
     // single use.
     if (CanFuse && N0.getOpcode() == PreferredFusedOpcode &&
         isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() &&
         N0.getOperand(2)->hasOneUse()) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          N0.getOperand(0), N0.getOperand(1),
                          DAG.getNode(PreferredFusedOpcode, SL, VT,
                                      N0.getOperand(2).getOperand(0),
                                      N0.getOperand(2).getOperand(1),
                                      DAG.getNode(ISD::FNEG, SL, VT,
                                                  N1), Flags), Flags);
     }
 
     // fold (fsub x, (fma y, z, (fmul u, v)))
     //   -> (fma (fneg y), z, (fma (fneg u), v, x))
     // NOTE(review): unlike the fold above, no one-use check is made on the
     // FMA or on its FMUL addend here -- confirm this is intentional.
     if (CanFuse && N1.getOpcode() == PreferredFusedOpcode &&
         isContractableFMUL(N1.getOperand(2))) {
       SDValue N20 = N1.getOperand(2).getOperand(0);
       SDValue N21 = N1.getOperand(2).getOperand(1);
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          DAG.getNode(ISD::FNEG, SL, VT,
                                      N1.getOperand(0)),
                          N1.getOperand(1),
                          DAG.getNode(PreferredFusedOpcode, SL, VT,
                                      DAG.getNode(ISD::FNEG, SL, VT, N20),
                                      N21, N0, Flags), Flags);
     }
 
 
     // The remaining folds look through FP_EXTEND as well. They are gated on
     // TLI.isFPExtFoldable rather than on CanFuse.
 
     // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
     //   -> (fma x, y, (fma (fpext u), (fpext v), (fneg z)))
     if (N0.getOpcode() == PreferredFusedOpcode) {
       SDValue N02 = N0.getOperand(2);
       if (N02.getOpcode() == ISD::FP_EXTEND) {
         SDValue N020 = N02.getOperand(0);
         if (isContractableFMUL(N020) &&
             TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N020.getValueType())) {
           return DAG.getNode(PreferredFusedOpcode, SL, VT,
                              N0.getOperand(0), N0.getOperand(1),
                              DAG.getNode(PreferredFusedOpcode, SL, VT,
                                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                      N020.getOperand(0)),
                                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                      N020.getOperand(1)),
                                          DAG.getNode(ISD::FNEG, SL, VT,
                                                      N1), Flags), Flags);
         }
       }
     }
 
     // fold (fsub (fpext (fma x, y, (fmul u, v))), z)
     //   -> (fma (fpext x), (fpext y),
     //           (fma (fpext u), (fpext v), (fneg z)))
     // FIXME: This turns two single-precision and one double-precision
     // operation into two double-precision operations, which might not be
     // interesting for all targets, especially GPUs.
     if (N0.getOpcode() == ISD::FP_EXTEND) {
       SDValue N00 = N0.getOperand(0);
       if (N00.getOpcode() == PreferredFusedOpcode) {
         SDValue N002 = N00.getOperand(2);
         if (isContractableFMUL(N002) &&
             TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N00.getValueType())) {
           return DAG.getNode(PreferredFusedOpcode, SL, VT,
                              DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                          N00.getOperand(0)),
                              DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                          N00.getOperand(1)),
                              DAG.getNode(PreferredFusedOpcode, SL, VT,
                                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                      N002.getOperand(0)),
                                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                      N002.getOperand(1)),
                                          DAG.getNode(ISD::FNEG, SL, VT,
                                                      N1), Flags), Flags);
         }
       }
     }
 
     // fold (fsub x, (fma y, z, (fpext (fmul u, v))))
     //   -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
     if (N1.getOpcode() == PreferredFusedOpcode &&
         N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) {
       SDValue N120 = N1.getOperand(2).getOperand(0);
       if (isContractableFMUL(N120) &&
           TLI.isFPExtFoldable(PreferredFusedOpcode, VT, N120.getValueType())) {
         SDValue N1200 = N120.getOperand(0);
         SDValue N1201 = N120.getOperand(1);
         return DAG.getNode(PreferredFusedOpcode, SL, VT,
                            DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)),
                            N1.getOperand(1),
                            DAG.getNode(PreferredFusedOpcode, SL, VT,
                                        DAG.getNode(ISD::FNEG, SL, VT,
                                                    DAG.getNode(ISD::FP_EXTEND, SL,
                                                                VT, N1200)),
                                        DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                    N1201),
                                        N0, Flags), Flags);
       }
     }
 
     // fold (fsub x, (fpext (fma y, z, (fmul u, v))))
     //   -> (fma (fneg (fpext y)), (fpext z),
     //           (fma (fneg (fpext u)), (fpext v), x))
     // FIXME: This turns two single-precision and one double-precision
     // operation into two double-precision operations, which might not be
     // interesting for all targets, especially GPUs.
     if (N1.getOpcode() == ISD::FP_EXTEND &&
         N1.getOperand(0).getOpcode() == PreferredFusedOpcode) {
       // CvtSrc is the narrower fused node that feeds the FP_EXTEND.
       SDValue CvtSrc = N1.getOperand(0);
       SDValue N100 = CvtSrc.getOperand(0);
       SDValue N101 = CvtSrc.getOperand(1);
       SDValue N102 = CvtSrc.getOperand(2);
       if (isContractableFMUL(N102) &&
           TLI.isFPExtFoldable(PreferredFusedOpcode, VT, CvtSrc.getValueType())) {
         SDValue N1020 = N102.getOperand(0);
         SDValue N1021 = N102.getOperand(1);
         return DAG.getNode(PreferredFusedOpcode, SL, VT,
                            DAG.getNode(ISD::FNEG, SL, VT,
                                        DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                    N100)),
                            DAG.getNode(ISD::FP_EXTEND, SL, VT, N101),
                            DAG.getNode(PreferredFusedOpcode, SL, VT,
                                        DAG.getNode(ISD::FNEG, SL, VT,
                                                    DAG.getNode(ISD::FP_EXTEND, SL,
                                                                VT, N1020)),
                                        DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                                    N1021),
                                        N0, Flags), Flags);
       }
     }
   }
 
   // No pattern matched.
   return SDValue();
 }
 
 /// Use the distributive law x * (y + 1) = x * y + x to turn an FMUL whose
 /// multiplicand is an FADD/FSUB with a +1.0 or -1.0 constant into a single
 /// fused multiply-add. Both operand orders of the FMUL are tried.
 ///
 /// Returns an empty SDValue when infinities are not excluded, when no fused
 /// opcode is usable for the type, or when neither multiplicand matches.
 SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
   SDValue MulLHS = N->getOperand(0);
   SDValue MulRHS = N->getOperand(1);
   EVT VT = N->getValueType(0);
   SDLoc SL(N);
   const SDNodeFlags Flags = N->getFlags();
 
   assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
 
   const TargetOptions &Options = DAG.getTarget().Options;
 
   // The rewritten forms are wrong when x == 0 and y == inf: the multiplication
   // inside the fused node then produces a NaN. Require no-infs.
   if (!Options.NoInfsFPMath)
     return SDValue();
 
   // FMA (no intermediate rounding): needs fast fusion or unsafe math, a target
   // that prefers FMA over FMUL+FADD and, once operations are being legalized,
   // a legal or custom FMA.
   bool HasFMA = false;
   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
       TLI.isFMAFasterThanFMulAndFAdd(VT))
     HasFMA = !LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT);
 
   // FMAD (intermediate rounding) can be less precise because the rounding
   // order changes, so it is only used under unsafe math.
   bool HasFMAD = false;
   if (Options.UnsafeFPMath && LegalOperations)
     HasFMAD = TLI.isOperationLegal(ISD::FMAD, VT);
 
   // Nothing to fuse with.
   if (!HasFMA && !HasFMAD)
     return SDValue();
 
   // Always prefer FMAD to FMA for precision.
   const unsigned FusedOpc = HasFMAD ? ISD::FMAD : ISD::FMA;
   const bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
 
   // Multiplicand of the form (fadd x, +/-1.0):
   //   (fmul (fadd x, +1.0), y) -> (fma x, y, y)
   //   (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y))
   auto FuseFADD = [&](SDValue Sum, SDValue Other, const SDNodeFlags FMF) {
     if (Sum.getOpcode() != ISD::FADD)
       return SDValue();
     if (!Aggressive && !Sum->hasOneUse())
       return SDValue();
 
     ConstantFPSDNode *AddC = isConstOrConstSplatFP(Sum.getOperand(1));
     if (!AddC)
       return SDValue();
 
     if (AddC->isExactlyValue(+1.0))
       return DAG.getNode(FusedOpc, SL, VT, Sum.getOperand(0), Other,
                          Other, FMF);
     if (AddC->isExactlyValue(-1.0))
       return DAG.getNode(FusedOpc, SL, VT, Sum.getOperand(0), Other,
                          DAG.getNode(ISD::FNEG, SL, VT, Other), FMF);
     return SDValue();
   };
 
   if (SDValue Fused = FuseFADD(MulLHS, MulRHS, Flags))
     return Fused;
   if (SDValue Fused = FuseFADD(MulRHS, MulLHS, Flags))
     return Fused;
 
   // Multiplicand of the form (fsub +/-1.0, x) or (fsub x, +/-1.0):
   //   (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y)
   //   (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y))
   //   (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y))
   //   (fmul (fsub x, -1.0), y) -> (fma x, y, y)
   auto FuseFSUB = [&](SDValue Diff, SDValue Other, const SDNodeFlags FMF) {
     if (Diff.getOpcode() != ISD::FSUB)
       return SDValue();
     if (!Aggressive && !Diff->hasOneUse())
       return SDValue();
 
     // Constant on the left of the subtraction.
     if (ConstantFPSDNode *SubC0 = isConstOrConstSplatFP(Diff.getOperand(0))) {
       if (SubC0->isExactlyValue(+1.0))
         return DAG.getNode(FusedOpc, SL, VT,
                            DAG.getNode(ISD::FNEG, SL, VT, Diff.getOperand(1)),
                            Other, Other, FMF);
       if (SubC0->isExactlyValue(-1.0))
         return DAG.getNode(FusedOpc, SL, VT,
                            DAG.getNode(ISD::FNEG, SL, VT, Diff.getOperand(1)),
                            Other, DAG.getNode(ISD::FNEG, SL, VT, Other), FMF);
     }
 
     // Constant on the right of the subtraction.
     if (ConstantFPSDNode *SubC1 = isConstOrConstSplatFP(Diff.getOperand(1))) {
       if (SubC1->isExactlyValue(+1.0))
         return DAG.getNode(FusedOpc, SL, VT, Diff.getOperand(0), Other,
                            DAG.getNode(ISD::FNEG, SL, VT, Other), FMF);
       if (SubC1->isExactlyValue(-1.0))
         return DAG.getNode(FusedOpc, SL, VT, Diff.getOperand(0), Other,
                            Other, FMF);
     }
     return SDValue();
   };
 
   if (SDValue Fused = FuseFSUB(MulLHS, MulRHS, Flags))
     return Fused;
 
   // Last attempt: its result (possibly empty) is the overall result.
   return FuseFSUB(MulRHS, MulLHS, Flags);
 }
 
 /// Return true if \p N is an FMUL whose second operand is recognized by
 /// isConstOrConstSplatFP and is exactly -2.0.
 ///
 /// \p N is taken by value: it is only inspected, never modified, and the
 /// other helpers in this file pass SDValue by value as well. A mutable
 /// reference needlessly rejected temporaries.
 static bool isFMulNegTwo(SDValue N) {
   if (N.getOpcode() != ISD::FMUL)
     return false;
   ConstantFPSDNode *CFP = isConstOrConstSplatFP(N.getOperand(1));
   return CFP && CFP->isExactlyValue(-2.0);
 }
 
 /// Combine an FADD node.
 ///
 /// Tries, in order: vector binop simplification, constant operands
 /// (both constant / constant on the left), folding into a select, conversion
 /// to FSUB when an operand is cheaper negated or is a multiply by -2.0,
 /// removal of a zero addend, flag-gated cancellation and reassociation folds,
 /// and finally FADD -> FMA fusion.
 ///
 /// \param N the FADD node to combine.
 /// \returns the replacement value, or an empty SDValue if no fold applies.
 SDValue DAGCombiner::visitFADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   // Note: these are booleans (constant or constant build-vector), unlike N1C
   // below which is the constant node itself.
   bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
   bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   const TargetOptions &Options = DAG.getTarget().Options;
   const SDNodeFlags Flags = N->getFlags();
 
   // fold vector ops
   if (VT.isVector())
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
   // fold (fadd c1, c2) -> c1 + c2
   if (N0CFP && N1CFP)
     return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags);
 
   // canonicalize constant to RHS
   if (N0CFP && !N1CFP)
     return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags);
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // A result of 2 from isNegatibleForFree means the operand is cheaper in its
   // negated form, which is what makes the FSUB rewrites below worthwhile.
 
   // fold (fadd A, (fneg B)) -> (fsub A, B)
   if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
       isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2)
     return DAG.getNode(ISD::FSUB, DL, VT, N0,
                        GetNegatedExpression(N1, DAG, LegalOperations), Flags);
 
   // fold (fadd (fneg A), B) -> (fsub B, A)
   if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
       isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2)
     return DAG.getNode(ISD::FSUB, DL, VT, N1,
                        GetNegatedExpression(N0, DAG, LegalOperations), Flags);
 
   // fold (fadd A, (fmul B, -2.0)) -> (fsub A, (fadd B, B))
   // fold (fadd (fmul B, -2.0), A) -> (fsub A, (fadd B, B))
   // Only a single-use multiply is rewritten. The one-use test is part of
   // N1IsFMul so that a multi-use (fmul _, -2.0) in N1 can no longer be picked
   // merely because N0 qualified. N1 is still preferred when both qualify.
   const bool N1IsFMul = isFMulNegTwo(N1) && N1.hasOneUse();
   if (N1IsFMul || (isFMulNegTwo(N0) && N0.hasOneUse())) {
     SDValue AddOp = N1IsFMul ? N1.getOperand(0) : N0.getOperand(0);
     SDValue Add = DAG.getNode(ISD::FADD, DL, VT, AddOp, AddOp, Flags);
     return DAG.getNode(ISD::FSUB, DL, VT, N1IsFMul ? N0 : N1, Add, Flags);
   }
 
   // Adding -0.0 never changes the other operand, so that fold is always
   // allowed. Adding +0.0 turns -0.0 into +0.0, so it needs unsafe math or the
   // no-signed-zeros flag.
   ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1);
   if (N1C && N1C->isZero()) {
     if (N1C->isNegative() || Options.UnsafeFPMath ||
         Flags.hasNoSignedZeros()) {
       // fold (fadd A, 0) -> A
       return N0;
     }
   }
 
   // No FP constant should be created after legalization as Instruction
   // Selection pass has a hard time dealing with FP constants.
   bool AllowNewConst = (Level < AfterLegalizeDAG);
 
   // If 'unsafe math' or nnan is enabled, fold lots of things.
   if ((Options.UnsafeFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
     // If allowed, fold (fadd (fneg x), x) -> 0.0
     if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
       return DAG.getConstantFP(0.0, DL, VT);
 
     // If allowed, fold (fadd x, (fneg x)) -> 0.0
     if (N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
       return DAG.getConstantFP(0.0, DL, VT);
   }
 
   // If 'unsafe math' or reassoc and nsz, fold lots of things.
   // TODO: break out portions of the transformations below for which Unsafe is
   //       considered and which do not require both nsz and reassoc
   if ((Options.UnsafeFPMath ||
        (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
       AllowNewConst) {
     // fadd (fadd x, c1), c2 -> fadd x, c1 + c2
     if (N1CFP && N0.getOpcode() == ISD::FADD &&
         isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
       SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1, Flags);
       return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC, Flags);
     }
 
     // We can fold chains of FADD's of the same value into multiplications.
     // This transform is not safe in general because we are reducing the number
     // of rounding steps.
     if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
       if (N0.getOpcode() == ISD::FMUL) {
         bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
         bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));
 
         // (fadd (fmul x, c), x) -> (fmul x, c+1)
         if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
           SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
                                        DAG.getConstantFP(1.0, DL, VT), Flags);
           return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags);
         }
 
         // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
         if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
             N1.getOperand(0) == N1.getOperand(1) &&
             N0.getOperand(0) == N1.getOperand(0)) {
           SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
                                        DAG.getConstantFP(2.0, DL, VT), Flags);
           return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags);
         }
       }
 
       if (N1.getOpcode() == ISD::FMUL) {
         bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
         bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));
 
         // (fadd x, (fmul x, c)) -> (fmul x, c+1)
         if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
           SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
                                        DAG.getConstantFP(1.0, DL, VT), Flags);
           return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags);
         }
 
         // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
         if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
             N0.getOperand(0) == N0.getOperand(1) &&
             N1.getOperand(0) == N0.getOperand(0)) {
           SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
                                        DAG.getConstantFP(2.0, DL, VT), Flags);
           return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags);
         }
       }
 
       if (N0.getOpcode() == ISD::FADD) {
         bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
         // (fadd (fadd x, x), x) -> (fmul x, 3.0)
         if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
             (N0.getOperand(0) == N1)) {
           return DAG.getNode(ISD::FMUL, DL, VT,
                              N1, DAG.getConstantFP(3.0, DL, VT), Flags);
         }
       }
 
       if (N1.getOpcode() == ISD::FADD) {
         bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
         // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
         if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
             N1.getOperand(0) == N0) {
           return DAG.getNode(ISD::FMUL, DL, VT,
                              N0, DAG.getConstantFP(3.0, DL, VT), Flags);
         }
       }
 
       // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
       if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
           N0.getOperand(0) == N0.getOperand(1) &&
           N1.getOperand(0) == N1.getOperand(1) &&
           N0.getOperand(0) == N1.getOperand(0)) {
         return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0),
                            DAG.getConstantFP(4.0, DL, VT), Flags);
       }
     }
   } // enable-unsafe-fp-math
 
   // FADD -> FMA combines:
   // The fused node is queued on the worklist so it gets revisited.
   if (SDValue Fused = visitFADDForFMACombine(N)) {
     AddToWorklist(Fused.getNode());
     return Fused;
   }
   return SDValue();
 }
 
 /// Combine an FSUB node. The folds are attempted in the order written and the
 /// first one that applies provides the result; an empty SDValue means the node
 /// is left unchanged.
 SDValue DAGCombiner::visitFSUB(SDNode *N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   ConstantFPSDNode *LHSC = isConstOrConstSplatFP(LHS);
   ConstantFPSDNode *RHSC = isConstOrConstSplatFP(RHS);
   EVT ResVT = N->getValueType(0);
   SDLoc Loc(N);
   const TargetOptions &Opts = DAG.getTarget().Options;
   const SDNodeFlags FMF = N->getFlags();
 
   // Vector types first go through the generic vector binop simplifier.
   if (ResVT.isVector()) {
     SDValue FoldedVOp = SimplifyVBinOp(N);
     if (FoldedVOp)
       return FoldedVOp;
   }
 
   // (fsub c1, c2) -> c1-c2
   if (LHSC && RHSC)
     return DAG.getNode(ISD::FSUB, Loc, ResVT, LHS, RHS, FMF);
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   // (fsub A, 0) -> A
   // Always done for +0.0; for -0.0 only under unsafe math or nsz.
   if (RHSC && RHSC->isZero() &&
       (!RHSC->isNegative() || Opts.UnsafeFPMath || FMF.hasNoSignedZeros()))
     return LHS;
 
   // (fsub x, x) -> 0.0, under unsafe math or nnan.
   if (LHS == RHS && (Opts.UnsafeFPMath || FMF.hasNoNaNs()))
     return DAG.getConstantFP(0.0f, Loc, ResVT);
 
   // (fsub 0, B) -> -B, when signed zeros may be ignored. Prefer an
   // already-negated form of B over creating an explicit FNEG.
   const bool LHSIsZero = LHSC && LHSC->isZero();
   if (LHSIsZero && (Opts.NoSignedZerosFPMath || FMF.hasNoSignedZeros())) {
     if (isNegatibleForFree(RHS, LegalOperations, TLI, &Opts))
       return GetNegatedExpression(RHS, DAG, LegalOperations);
     if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, ResVT))
       return DAG.getNode(ISD::FNEG, Loc, ResVT, RHS, FMF);
   }
 
   // (fsub A, (fneg B)) -> (fadd A, B)
   if (isNegatibleForFree(RHS, LegalOperations, TLI, &Opts))
     return DAG.getNode(ISD::FADD, Loc, ResVT, LHS,
                        GetNegatedExpression(RHS, DAG, LegalOperations), FMF);
 
   // Under unsafe math, cancel x against an FADD on the right-hand side:
   //   (fsub x, (fadd x, y)) -> (fneg y)
   //   (fsub x, (fadd y, x)) -> (fneg y)
   if (Opts.UnsafeFPMath && RHS.getOpcode() == ISD::FADD) {
     SDValue AddLHS = RHS->getOperand(0);
     SDValue AddRHS = RHS->getOperand(1);
 
     if (AddLHS == LHS &&
         isNegatibleForFree(AddRHS, LegalOperations, TLI, &Opts))
       return GetNegatedExpression(AddRHS, DAG, LegalOperations);
 
     if (AddRHS == LHS &&
         isNegatibleForFree(AddLHS, LegalOperations, TLI, &Opts))
       return GetNegatedExpression(AddLHS, DAG, LegalOperations);
   }
 
   // FSUB -> FMA combines; a fused node is queued on the worklist.
   SDValue Fused = visitFSUBForFMACombine(N);
   if (!Fused)
     return SDValue();
 
   AddToWorklist(Fused.getNode());
   return Fused;
 }
 
 SDValue DAGCombiner::visitFMUL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
   ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   const TargetOptions &Options = DAG.getTarget().Options;
   const SDNodeFlags Flags = N->getFlags();
 
   // fold vector ops
   if (VT.isVector()) {
     // This just handles C1 * C2 for vectors. Other vector folds are below.
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
   }
 
   // fold (fmul c1, c2) -> c1*c2
   if (N0CFP && N1CFP)
     return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags);
 
   // canonicalize constant to RHS
   if (isConstantFPBuildVectorOrConstantFP(N0) &&
      !isConstantFPBuildVectorOrConstantFP(N1))
     return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags);
 
   // fold (fmul A, 1.0) -> A
   if (N1CFP && N1CFP->isExactlyValue(1.0))
     return N0;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   if (Options.UnsafeFPMath ||
       (Flags.hasNoNaNs() && Flags.hasNoSignedZeros())) {
     // fold (fmul A, 0) -> 0
     if (N1CFP && N1CFP->isZero())
       return N1;
   }
 
   if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) {
     // fmul (fmul X, C1), C2 -> fmul X, C1 * C2
     if (N0.getOpcode() == ISD::FMUL) {
       // Fold scalars or any vector constants (not just splats).
       // This fold is done in general by InstCombine, but extra fmul insts
       // may have been generated during lowering.
       SDValue N00 = N0.getOperand(0);
       SDValue N01 = N0.getOperand(1);
       auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
       auto *BV00 = dyn_cast<BuildVectorSDNode>(N00);
       auto *BV01 = dyn_cast<BuildVectorSDNode>(N01);
 
       // Check 1: Make sure that the first operand of the inner multiply is NOT
       // a constant. Otherwise, we may induce infinite looping.
       if (!(isConstOrConstSplatFP(N00) || (BV00 && BV00->isConstant()))) {
         // Check 2: Make sure that the second operand of the inner multiply and
         // the second operand of the outer multiply are constants.
         if ((N1CFP && isConstOrConstSplatFP(N01)) ||
             (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) {
           SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags);
           return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags);
         }
       }
     }
 
     // Match a special-case: we convert X * 2.0 into fadd.
     // fmul (fadd X, X), C -> fmul X, 2.0 * C
     if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() &&
         N0.getOperand(0) == N0.getOperand(1)) {
       const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
       SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags);
       return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags);
     }
   }
 
   // fold (fmul X, 2.0) -> (fadd X, X)
   if (N1CFP && N1CFP->isExactlyValue(+2.0))
     return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags);
 
   // fold (fmul X, -1.0) -> (fneg X)
   if (N1CFP && N1CFP->isExactlyValue(-1.0))
     if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
       return DAG.getNode(ISD::FNEG, DL, VT, N0);
 
   // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y)
   if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
     if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
       // Both can be negated for free, check to see if at least one is cheaper
       // negated.
       if (LHSNeg == 2 || RHSNeg == 2)
         return DAG.getNode(ISD::FMUL, DL, VT,
                            GetNegatedExpression(N0, DAG, LegalOperations),
                            GetNegatedExpression(N1, DAG, LegalOperations),
                            Flags);
     }
   }
 
   // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
   // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
   if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
       (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) &&
       TLI.isOperationLegal(ISD::FABS, VT)) {
     SDValue Select = N0, X = N1;
     if (Select.getOpcode() != ISD::SELECT)
       std::swap(Select, X);
 
     SDValue Cond = Select.getOperand(0);
     auto TrueOpnd  = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
     auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));
 
     if (TrueOpnd && FalseOpnd &&
         Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
         isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
         cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
       ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
       switch (CC) {
       default: break;
       case ISD::SETOLT:
       case ISD::SETULT:
       case ISD::SETOLE:
       case ISD::SETULE:
       case ISD::SETLT:
       case ISD::SETLE:
         std::swap(TrueOpnd, FalseOpnd);
         LLVM_FALLTHROUGH;
       case ISD::SETOGT:
       case ISD::SETUGT:
       case ISD::SETOGE:
       case ISD::SETUGE:
       case ISD::SETGT:
       case ISD::SETGE:
         if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
             TLI.isOperationLegal(ISD::FNEG, VT))
           return DAG.getNode(ISD::FNEG, DL, VT,
                    DAG.getNode(ISD::FABS, DL, VT, X));
         if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
           return DAG.getNode(ISD::FABS, DL, VT, X);
 
         break;
       }
     }
   }
 
   // FMUL -> FMA combines:
   if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
     AddToWorklist(Fused.getNode());
     return Fused;
   }
 
   return SDValue();
 }
 
 /// Combine an ISD::FMA node, i.e. (N0 * N1) + N2.
 ///
 /// The folds are attempted in the order they appear below; the first one that
 /// matches produces the replacement value. Folds that change the rounding
 /// behaviour are only attempted when the global UnsafeFPMath option is set or
 /// isContractable(N) holds.
 ///
 /// \param N the FMA node being visited.
 /// \returns the replacement value, or an empty SDValue if nothing was folded.
 SDValue DAGCombiner::visitFMA(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue N2 = N->getOperand(2);
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   const TargetOptions &Options = DAG.getTarget().Options;
 
   // FMA nodes have flags that propagate to the created nodes.
   const SDNodeFlags Flags = N->getFlags();
   bool UnsafeFPMath = Options.UnsafeFPMath || isContractable(N);
 
   // Constant fold FMA.
   if (isa<ConstantFPSDNode>(N0) &&
       isa<ConstantFPSDNode>(N1) &&
       isa<ConstantFPSDNode>(N2)) {
     return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2);
   }
 
   // (fma 0, x, y) -> y and (fma x, 0, y) -> y
   if (UnsafeFPMath) {
     if (N0CFP && N0CFP->isZero())
       return N2;
     if (N1CFP && N1CFP->isZero())
       return N2;
   }
 
   // (fma 1, x, y) -> (fadd x, y) and (fma x, 1, y) -> (fadd x, y)
   // TODO: The FMA node should have flags that propagate to these nodes.
   if (N0CFP && N0CFP->isExactlyValue(1.0))
     return DAG.getNode(ISD::FADD, DL, VT, N1, N2);
   if (N1CFP && N1CFP->isExactlyValue(1.0))
     return DAG.getNode(ISD::FADD, DL, VT, N0, N2);
 
   // Canonicalize (fma c, x, y) -> (fma x, c, y)
   if (isConstantFPBuildVectorOrConstantFP(N0) &&
      !isConstantFPBuildVectorOrConstantFP(N1))
     return DAG.getNode(ISD::FMA, DL, VT, N1, N0, N2);
 
   if (UnsafeFPMath) {
     // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
     if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
         isConstantFPBuildVectorOrConstantFP(N1) &&
         isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
       return DAG.getNode(ISD::FMUL, DL, VT, N0,
                          DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1),
                                      Flags), Flags);
     }
 
     // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
     if (N0.getOpcode() == ISD::FMUL &&
         isConstantFPBuildVectorOrConstantFP(N1) &&
         isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
       return DAG.getNode(ISD::FMA, DL, VT,
                          N0.getOperand(0),
                          DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1),
                                      Flags),
                          N2);
     }
   }
 
   // Folds that need a scalar constant multiplier. The (fma x, 1, y) case has
   // already been folded to an FADD above, so it is not repeated here.
   if (N1CFP) {
     // (fma x, -1, y) -> (fadd (fneg x), y)
     if (N1CFP->isExactlyValue(-1.0) &&
         (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
       SDValue NegN0 = DAG.getNode(ISD::FNEG, DL, VT, N0);
       AddToWorklist(NegN0.getNode());
       // TODO: The FMA node should have flags that propagate to this node.
       return DAG.getNode(ISD::FADD, DL, VT, N2, NegN0);
     }
 
     // fma (fneg x), K, y -> fma x, -K, y
     if (N0.getOpcode() == ISD::FNEG &&
         (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
          (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT)))) {
       return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
                          DAG.getNode(ISD::FNEG, DL, VT, N1, Flags), N2);
     }
   }
 
   if (UnsafeFPMath) {
     // (fma x, c, x) -> (fmul x, (c+1))
     if (N1CFP && N0 == N2) {
       return DAG.getNode(ISD::FMUL, DL, VT, N0,
                          DAG.getNode(ISD::FADD, DL, VT, N1,
                                      DAG.getConstantFP(1.0, DL, VT), Flags),
                          Flags);
     }
 
     // (fma x, c, (fneg x)) -> (fmul x, (c-1))
     if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
       return DAG.getNode(ISD::FMUL, DL, VT, N0,
                          DAG.getNode(ISD::FADD, DL, VT, N1,
                                      DAG.getConstantFP(-1.0, DL, VT), Flags),
                          Flags);
     }
   }
 
   return SDValue();
 }
 
 // Rewrite several FDIVs that share one divisor as a single reciprocal
 // followed by one FMUL per dividend.
 // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
 // This is not a guaranteed win: the relative cost of FDIV and FMUL differs
 // between targets, so two FDIVs can be cheaper than one FDIV plus two FMULs,
 // and the critical path grows from "one FDIV" to "one FDIV + one FMUL". The
 // target therefore supplies the minimum number of divisions required.
 //
 // Returns SDValue(N, 0) when the divisions were replaced, otherwise an empty
 // SDValue.
 SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
   const bool GlobalUnsafe = DAG.getTarget().Options.UnsafeFPMath;
   const SDNodeFlags Flags = N->getFlags();
   if (!(GlobalUnsafe || Flags.hasAllowReciprocal()))
     return SDValue();
 
   // A division whose dividend is exactly 1.0 already is a reciprocal.
   SDValue Numerator = N->getOperand(0);
   if (auto *NumeratorC = dyn_cast<ConstantFPSDNode>(Numerator))
     if (NumeratorC->isExactlyValue(1.0))
       return SDValue();
 
   // Bail out early when the target opts out (threshold of zero) or when the
   // divisor cannot possibly have enough users to reach the threshold.
   SDValue Divisor = N->getOperand(1);
   const unsigned MinUses = TLI.combineRepeatedFPDivisors();
   if (MinUses == 0 || Divisor->use_size() < MinUses)
     return SDValue();
 
   // Gather the FDIV nodes dividing by Divisor. A SetVector is used because
   // the same user may show up more than once in the use list.
   SetVector<SDNode *> DivUsers;
   for (SDNode *User : Divisor->uses()) {
     if (User->getOpcode() != ISD::FDIV || User->getOperand(1) != Divisor)
       continue;
     // Each division must itself permit the rewrite, either through global
     // unsafe math or through its own allow-reciprocal flag.
     if (GlobalUnsafe || User->getFlags().hasAllowReciprocal())
       DivUsers.insert(User);
   }
 
   // Re-check the threshold against the number of eligible divisions.
   if (DivUsers.size() < MinUses)
     return SDValue();
 
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
   SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, Divisor, Flags);
 
   // Dividend / Divisor -> Dividend * Reciprocal
   for (SDNode *User : DivUsers) {
     SDValue Dividend = User->getOperand(0);
     if (Dividend == FPOne) {
       // Without fast-math-flags this user is the Reciprocal node itself; with
       // FMF it can be a distinct node that must be redirected to it.
       if (User != Reciprocal.getNode())
         CombineTo(User, Reciprocal);
       continue;
     }
     SDValue Product = DAG.getNode(ISD::FMUL, SDLoc(User), VT, Dividend,
                                   Reciprocal, Flags);
     CombineTo(User, Product);
   }
   return SDValue(N, 0);  // N was replaced.
 }
 
 /// Combine an ISD::FDIV node (N0 / N1).
 ///
 /// The folds below are tried in order and the first match wins. Folds that
 /// replace the division by a multiplication are only attempted when the
 /// global UnsafeFPMath option is set or the node carries the allow-reciprocal
 /// flag.
 ///
 /// \param N the FDIV node being visited.
 /// \returns the replacement value, or an empty SDValue if nothing was folded.
 SDValue DAGCombiner::visitFDIV(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   const TargetOptions &Options = DAG.getTarget().Options;
   // The flags of the FDIV are passed on to the nodes created below.
   SDNodeFlags Flags = N->getFlags();
 
   // fold vector ops
   if (VT.isVector())
     if (SDValue FoldedVOp = SimplifyVBinOp(N))
       return FoldedVOp;
 
   // fold (fdiv c1, c2) -> c1/c2
   if (N0CFP && N1CFP)
     return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
     return NewSel;
 
   if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
     // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
     if (N1CFP) {
       // Compute the reciprocal 1.0 / c2.
       const APFloat &N1APF = N1CFP->getValueAPF();
       APFloat Recip(N1APF.getSemantics(), 1); // 1.0
       APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
       // Only do the transform if the reciprocal is a legal fp immediate that
       // isn't too nasty (eg NaN, denormal, ...).
       if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
           (!LegalOperations ||
            // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
            // backend)... we should handle this gracefully after Legalize.
            // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
            TLI.isOperationLegal(ISD::ConstantFP, VT) ||
            TLI.isFPImmLegal(Recip, VT)))
         return DAG.getNode(ISD::FMUL, DL, VT, N0,
                            DAG.getConstantFP(Recip, DL, VT), Flags);
     }
 
     // If this FDIV is part of a reciprocal square root, it may be folded
     // into a target-specific square root estimate instruction.
     if (N1.getOpcode() == ISD::FSQRT) {
       // x / sqrt(z) -> x * rsqrt(z)
       if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) {
         return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
       }
     } else if (N1.getOpcode() == ISD::FP_EXTEND &&
                N1.getOperand(0).getOpcode() == ISD::FSQRT) {
       // x / fpext(sqrt(z)) -> x * fpext(rsqrt(z))
       if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
                                           Flags)) {
         RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
         AddToWorklist(RV.getNode());
         return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
       }
     } else if (N1.getOpcode() == ISD::FP_ROUND &&
                N1.getOperand(0).getOpcode() == ISD::FSQRT) {
       // x / fpround(sqrt(z)) -> x * fpround(rsqrt(z)); the second operand of
       // the original FP_ROUND is reused for the new one.
       if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
                                           Flags)) {
         RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
         AddToWorklist(RV.getNode());
         return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
       }
     } else if (N1.getOpcode() == ISD::FMUL) {
       // Look through an FMUL. Even though this won't remove the FDIV directly,
       // it's still worthwhile to get rid of the FSQRT if possible.
       // SqrtOp stays empty when neither FMUL operand is an FSQRT.
       SDValue SqrtOp;
       SDValue OtherOp;
       if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
         SqrtOp = N1.getOperand(0);
         OtherOp = N1.getOperand(1);
       } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
         SqrtOp = N1.getOperand(1);
         OtherOp = N1.getOperand(0);
       }
       if (SqrtOp.getNode()) {
         // We found a FSQRT, so try to make this fold:
         // x / (y * sqrt(z)) -> x * (rsqrt(z) / y)
         if (SDValue RV = buildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) {
           RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags);
           AddToWorklist(RV.getNode());
           return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
         }
       }
     }
 
     // Fold into a reciprocal estimate and multiply instead of a real divide.
     if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) {
       AddToWorklist(RV.getNode());
       return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
     }
   }
 
   // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
   if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
     if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
       // Both can be negated for free, check to see if at least one is cheaper
       // negated.
       if (LHSNeg == 2 || RHSNeg == 2)
         return DAG.getNode(ISD::FDIV, SDLoc(N), VT,
                            GetNegatedExpression(N0, DAG, LegalOperations),
                            GetNegatedExpression(N1, DAG, LegalOperations),
                            Flags);
     }
   }
 
   // Last resort: share one reciprocal between all divisions by N1.
   if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
     return CombineRepeatedDivisors;
 
   return SDValue();
 }
 
 /// Combine an ISD::FREM node: constant-fold it when both operands are FP
 /// constants, otherwise hand it to foldBinOpIntoSelect.
 ///
 /// \returns the replacement value, or an empty SDValue if nothing was folded.
 SDValue DAGCombiner::visitFREM(SDNode *N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
   // fold (frem c1, c2) -> fmod(c1,c2)
   const bool BothConstant =
       isa<ConstantFPSDNode>(LHS) && isa<ConstantFPSDNode>(RHS);
   if (BothConstant)
     return DAG.getNode(ISD::FREM, SDLoc(N), VT, LHS, RHS, N->getFlags());
 
   // The select fold either yields a replacement or an empty value, which is
   // exactly what this visitor must return.
   return foldBinOpIntoSelect(N);
 }
 
 /// Combine an ISD::FSQRT node by replacing it with a square root estimate.
 ///
 /// The replacement is only attempted when the global UnsafeFPMath option is
 /// set or the node carries the approximate-functions flag, and only when the
 /// target does not report the square root as cheap.
 SDValue DAGCombiner::visitFSQRT(SDNode *N) {
   SDNodeFlags Flags = N->getFlags();
   const bool EstimateAllowed =
       DAG.getTarget().Options.UnsafeFPMath || Flags.hasApproximateFuncs();
   if (!EstimateAllowed)
     return SDValue();
 
   SDValue Operand = N->getOperand(0);
   if (TLI.isFsqrtCheap(Operand, DAG))
     return SDValue();
 
   // FSQRT nodes have flags that propagate to the created nodes.
   return buildSqrtEstimate(Operand, Flags);
 }
 
 /// Decide whether the FP_EXTEND / FP_ROUND feeding the sign operand of an
 /// FCOPYSIGN may be dropped:
 ///   copysign(x, fp_extend(y)) -> copysign(x, y)
 ///   copysign(x, fp_round(y)) -> copysign(x, y)
 static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
   SDValue Sign = N->getOperand(1);
   const unsigned SignOpc = Sign.getOpcode();
   if (SignOpc != ISD::FP_EXTEND && SignOpc != ISD::FP_ROUND)
     return false;
 
   // Do not optimize out type conversion of f128 type yet.
   // For some targets like x86_64, configuration is changed to keep one f128
   // value in one SSE register, but instruction selection cannot handle
   // FCOPYSIGN on SSE registers yet.
   EVT ConvertedVT = Sign->getValueType(0);
   EVT SourceVT = Sign->getOperand(0).getValueType();
   if (ConvertedVT == SourceVT)
     return true;
   return SourceVT != MVT::f128;
 }
 
 /// Combine an ISD::FCOPYSIGN node: the result has the magnitude of operand 0
 /// and the sign of operand 1.
 ///
 /// Every fold that introduces an FABS node is guarded so that, once
 /// operations have been legalized, FABS is only created when the target
 /// reports it as legal for the result type.
 ///
 /// \param N the FCOPYSIGN node being visited.
 /// \returns the replacement value, or an empty SDValue if nothing was folded.
 SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
   EVT VT = N->getValueType(0);
 
   // Whether a new FABS / FNEG node of type VT may be created at this stage.
   const bool CanCreateFABS =
       !LegalOperations || TLI.isOperationLegal(ISD::FABS, VT);
   const bool CanCreateFNEG =
       !LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT);
 
   if (N0CFP && N1CFP) // Constant fold
     return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1);
 
   if (N1CFP) {
     const APFloat &V = N1CFP->getValueAPF();
     // copysign(x, c1) -> fabs(x)       iff ispos(c1)
     // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
     if (!V.isNegative()) {
       if (CanCreateFABS)
         return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
     } else {
       // This form creates both an FNEG and an FABS, so both must be allowed.
       if (CanCreateFNEG && CanCreateFABS)
         return DAG.getNode(ISD::FNEG, SDLoc(N), VT,
                            DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
     }
   }
 
   // copysign(fabs(x), y) -> copysign(x, y)
   // copysign(fneg(x), y) -> copysign(x, y)
   // copysign(copysign(x,z), y) -> copysign(x, y)
   if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
       N0.getOpcode() == ISD::FCOPYSIGN)
     return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1);
 
   // copysign(x, abs(y)) -> abs(x)
   if (N1.getOpcode() == ISD::FABS && CanCreateFABS)
     return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
 
   // copysign(x, copysign(y,z)) -> copysign(x, z)
   if (N1.getOpcode() == ISD::FCOPYSIGN)
     return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1));
 
   // copysign(x, fp_extend(y)) -> copysign(x, y)
   // copysign(x, fp_round(y)) -> copysign(x, y)
   if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
     return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0));
 
   return SDValue();
 }
 
 /// Try to replace an int-to-FP conversion of an FP-to-int conversion by a
 /// single FTRUNC:  [us]itofp (fpto[us]i X) --> ftrunc X
 ///
 /// \param N the SINT_TO_FP / UINT_TO_FP node being visited.
 /// \returns the FTRUNC node, or an empty SDValue if the fold does not apply.
 static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG,
                                const TargetLowering &TLI) {
   // A function attribute gates this fold because it may produce unexpected
   // results: programs may be relying on the platform-specific undefined
   // behavior when the float-to-int conversion overflows.
   const Function &F = DAG.getMachineFunction().getFunction();
   Attribute StrictOverflow = F.getFnAttribute("strict-float-cast-overflow");
   if (StrictOverflow.getValueAsString().equals("false"))
     return SDValue();
 
   // FTRUNC has to be legal, otherwise the casts would likely be replaced by a
   // libcall. Ignoring -0.0 must be allowed as well, because FTRUNC returns
   // -0.0 for inputs in (-1.0, -0.0) whereas the integer round trip would
   // return +0.0.
   // FIXME: We should be able to use node-level FMF here.
   // TODO: If strict math, should we use FABS (+ range check for signed cast)?
   EVT VT = N->getValueType(0);
   const bool CanUseFTrunc = TLI.isOperationLegal(ISD::FTRUNC, VT) &&
                             DAG.getTarget().Options.NoSignedZerosFPMath;
   if (!CanUseFTrunc)
     return SDValue();
 
   // fptosi/fptoui round towards zero, so a round trip through an integer of
   // matching signedness is the same as an 'ftrunc'.
   SDValue IntVal = N->getOperand(0);
   const unsigned OuterOpc = N->getOpcode();
   const unsigned InnerOpc = IntVal.getOpcode();
   const bool SignedRoundTrip =
       OuterOpc == ISD::SINT_TO_FP && InnerOpc == ISD::FP_TO_SINT;
   const bool UnsignedRoundTrip =
       OuterOpc == ISD::UINT_TO_FP && InnerOpc == ISD::FP_TO_UINT;
 
   // The original FP value must already have the result type.
   if ((SignedRoundTrip || UnsignedRoundTrip) &&
       IntVal.getOperand(0).getValueType() == VT)
     return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, IntVal.getOperand(0));
 
   return SDValue();
 }
 
 /// Combine an ISD::SINT_TO_FP node.
 ///
 /// \returns the replacement value, or an empty SDValue if nothing was folded.
 SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
   SDValue Src = N->getOperand(0);
   EVT VT = N->getValueType(0);
   EVT SrcVT = Src.getValueType();
   SDLoc DL(N);
 
   // Once operations are legalized, FP constants may only be created when the
   // target supports immediate floating-point values of the result type.
   const bool FPConstOK =
       !LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT);
 
   // fold (sint_to_fp c1) -> c1fp
   if (FPConstOK && DAG.isConstantIntBuildVectorOrConstantInt(Src))
     return DAG.getNode(ISD::SINT_TO_FP, DL, VT, Src);
 
   // If SINT_TO_FP is not legal on this target but UINT_TO_FP is, switch to
   // UINT_TO_FP whenever the sign bit is known to be zero.
   const bool PreferUnsigned =
       !TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, SrcVT) &&
       TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, SrcVT);
   if (PreferUnsigned && DAG.SignBitIsZero(Src))
     return DAG.getNode(ISD::UINT_TO_FP, DL, VT, Src);
 
   // The next optimizations are desirable only if SELECT_CC can be lowered.
   // Both of them are scalar-only and need to create FP constants.
   const bool SelectCCOK =
       TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations;
   if (SelectCCOK && FPConstOK && !VT.isVector()) {
     // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc)
     if (Src.getOpcode() == ISD::SETCC && SrcVT == MVT::i1) {
       SDValue WhenTrue = DAG.getConstantFP(-1.0, DL, VT);
       SDValue WhenFalse = DAG.getConstantFP(0.0, DL, VT);
       SDValue Ops[] = {Src.getOperand(0), Src.getOperand(1), WhenTrue,
                        WhenFalse, Src.getOperand(2)};
       return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
     }
 
     // fold (sint_to_fp (zext (setcc x, y, cc))) ->
     //      (select_cc x, y, 1.0, 0.0,, cc)
     if (Src.getOpcode() == ISD::ZERO_EXTEND &&
         Src.getOperand(0).getOpcode() == ISD::SETCC) {
       SDValue SetCC = Src.getOperand(0);
       SDValue WhenTrue = DAG.getConstantFP(1.0, DL, VT);
       SDValue WhenFalse = DAG.getConstantFP(0.0, DL, VT);
       SDValue Ops[] = {SetCC.getOperand(0), SetCC.getOperand(1), WhenTrue,
                        WhenFalse, SetCC.getOperand(2)};
       return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
     }
   }
 
   // Finally try to turn an FP -> int -> FP round trip into an FTRUNC.
   return foldFPToIntToFP(N, DAG, TLI);
 }
 
 /// Combine an ISD::UINT_TO_FP node.
 ///
 /// \returns the replacement value, or an empty SDValue if nothing was folded.
 SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
   SDValue Src = N->getOperand(0);
   EVT VT = N->getValueType(0);
   EVT SrcVT = Src.getValueType();
   SDLoc DL(N);
 
   // Once operations are legalized, FP constants may only be created when the
   // target supports immediate floating-point values of the result type.
   const bool FPConstOK =
       !LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT);
 
   // fold (uint_to_fp c1) -> c1fp
   if (FPConstOK && DAG.isConstantIntBuildVectorOrConstantInt(Src))
     return DAG.getNode(ISD::UINT_TO_FP, DL, VT, Src);
 
   // If UINT_TO_FP is not legal on this target but SINT_TO_FP is, switch to
   // SINT_TO_FP whenever the sign bit is known to be zero.
   const bool PreferSigned =
       !TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, SrcVT) &&
       TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, SrcVT);
   if (PreferSigned && DAG.SignBitIsZero(Src))
     return DAG.getNode(ISD::SINT_TO_FP, DL, VT, Src);
 
   // The next optimization is desirable only if SELECT_CC can be lowered. It
   // is scalar-only and needs to create FP constants.
   const bool SelectCCOK =
       TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations;
   if (SelectCCOK && FPConstOK && !VT.isVector()) {
     // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, 1.0, 0.0,, cc)
     if (Src.getOpcode() == ISD::SETCC) {
       SDValue WhenTrue = DAG.getConstantFP(1.0, DL, VT);
       SDValue WhenFalse = DAG.getConstantFP(0.0, DL, VT);
       SDValue Ops[] = {Src.getOperand(0), Src.getOperand(1), WhenTrue,
                        WhenFalse, Src.getOperand(2)};
       return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
     }
   }
 
   // Finally try to turn an FP -> int -> FP round trip into an FTRUNC.
   return foldFPToIntToFP(N, DAG, TLI);
 }
 
 // Fold (fp_to_{s/u}int ({s/u}int_to_fp x)) -> zext x, sext x, trunc x, or x
 //
 // N is the FP_TO_SINT / FP_TO_UINT node. Returns the integer-only
 // replacement, or an empty SDValue when the operand is not an int-to-FP
 // conversion or the FP type cannot represent the relevant integers exactly.
 static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
   SDValue FPVal = N->getOperand(0);
   const unsigned InnerOpc = FPVal.getOpcode();
   const bool IsInputSigned = InnerOpc == ISD::SINT_TO_FP;
   if (!IsInputSigned && InnerOpc != ISD::UINT_TO_FP)
     return SDValue();
 
   EVT VT = N->getValueType(0);
   SDValue Src = FPVal.getOperand(0);
   EVT SrcVT = Src.getValueType();
   const bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;
   const unsigned SrcBits = SrcVT.getScalarSizeInBits();
   const unsigned DstBits = VT.getScalarSizeInBits();
 
   // The conversion can be assumed not to overflow the output range, because
   // (for example) (uint8_t)18293.f is undefined behavior. Hence the number of
   // value bits that matter is the smaller of the input and output ranges; a
   // sign bit does not count as a value bit.
   //
   // That also covers a signed input with an unsigned output, since a negative
   // input would lead to undefined behavior.
   unsigned InputSize = (int)SrcBits - IsInputSigned;
   unsigned OutputSize = (int)DstBits - IsOutputSigned;
   unsigned ActualSize = std::min(InputSize, OutputSize);
   const fltSemantics &Sem = DAG.EVTToAPFloatSemantics(FPVal.getValueType());
 
   // The float conversion can only be dropped if the FP type represents every
   // integer of that many bits exactly.
   if (APFloat::semanticsPrecision(Sem) < ActualSize)
     return SDValue();
 
   if (DstBits > SrcBits) {
     // Widening: sign-extend only when both conversions are signed.
     unsigned ExtOp = ISD::ZERO_EXTEND;
     if (IsInputSigned && IsOutputSigned)
       ExtOp = ISD::SIGN_EXTEND;
     return DAG.getNode(ExtOp, SDLoc(N), VT, Src);
   }
   if (DstBits < SrcBits)
     return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src);
 
   // Same scalar width: reuse the original integer.
   return DAG.getBitcast(VT, Src);
 }
 
 /// Combine an ISD::FP_TO_SINT node: constant-fold a constant operand,
 /// otherwise try to cancel a preceding int-to-FP conversion.
 SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
   SDValue Src = N->getOperand(0);
 
   // Non-constant operand: the only remaining option is the round-trip fold.
   if (!isConstantFPBuildVectorOrConstantFP(Src))
     return FoldIntToFPToInt(N, DAG);
 
   // fold (fp_to_sint c1fp) -> c1
   EVT VT = N->getValueType(0);
   return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, Src);
 }
 
 /// Combine an ISD::FP_TO_UINT node: constant-fold a constant operand,
 /// otherwise try to cancel a preceding int-to-FP conversion.
 SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
   SDValue Src = N->getOperand(0);
 
   // Non-constant operand: the only remaining option is the round-trip fold.
   if (!isConstantFPBuildVectorOrConstantFP(Src))
     return FoldIntToFPToInt(N, DAG);
 
   // fold (fp_to_uint c1fp) -> c1
   EVT VT = N->getValueType(0);
   return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, Src);
 }
 
 /// Combine an ISD::FP_ROUND node. Operand 0 is the value being rounded;
 /// operand 1 is a constant that is 1 when the rounding is known not to change
 /// the value.
 ///
 /// \returns the replacement value, or an empty SDValue if nothing was folded.
 SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
   SDValue Src = N->getOperand(0);
   SDValue TruncFlag = N->getOperand(1);
   EVT VT = N->getValueType(0);
   const unsigned SrcOpc = Src.getOpcode();
 
   // fold (fp_round c1fp) -> c1fp
   if (isa<ConstantFPSDNode>(Src))
     return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, Src, TruncFlag);
 
   // fold (fp_round (fp_extend x)) -> x
   if (SrcOpc == ISD::FP_EXTEND && VT == Src.getOperand(0).getValueType())
     return Src.getOperand(0);
 
   // fold (fp_round (fp_round x)) -> (fp_round x)
   if (SrcOpc == ISD::FP_ROUND) {
     const bool OuterIsExact = N->getConstantOperandVal(1) == 1;
     const bool InnerIsExact = Src.getConstantOperandVal(1) == 1;
 
     // Never create an fp_round from f80 to f16.
     //
     // f80 to f16 always generates an expensive (and as yet, unimplemented)
     // libcall to __truncxfhf2 instead of selecting native f16 conversion
     // instructions from f32 or f64.  Moreover, the first (value-preserving)
     // fp_round from f80 to either f32 or f64 may become a NOP in platforms like
     // x86.
     if (Src.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16)
       return SDValue();
 
     // Double rounding isn't the same as rounding: if the inner fp_round is
     // not value preserving, it might introduce a tie in the outer fp_round
     // that would not occur in the single-step fp_round. So the merge needs
     // either unsafe math or a value-preserving inner round. The merged round
     // is value preserving iff both original rounds are.
     if (DAG.getTarget().Options.UnsafeFPMath || InnerIsExact) {
       SDLoc DL(N);
       return DAG.getNode(ISD::FP_ROUND, DL, VT, Src.getOperand(0),
                          DAG.getIntPtrConstant(OuterIsExact && InnerIsExact,
                                                DL));
     }
   }
 
   // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
   if (SrcOpc == ISD::FCOPYSIGN && Src.getNode()->hasOneUse()) {
     SDValue RoundedMag = DAG.getNode(ISD::FP_ROUND, SDLoc(Src), VT,
                                      Src.getOperand(0), TruncFlag);
     AddToWorklist(RoundedMag.getNode());
     return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, RoundedMag,
                        Src.getOperand(1));
   }
 
   // Whatever the vselect/setcc size matching yields (possibly nothing) is the
   // final answer.
   return matchVSelectOpSizesWithSetCC(N);
 }
 
 /// Combine an ISD::FP_ROUND_INREG node. Operand 0 is the value; operand 1 is
 /// a VTSDNode carrying the type to round through.
 ///
 /// Only a constant operand combined with a legal rounding type is folded.
 SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) {
   SDValue Src = N->getOperand(0);
   EVT ResultVT = N->getValueType(0);
   EVT RoundVT = cast<VTSDNode>(N->getOperand(1))->getVT();
   auto *SrcC = dyn_cast<ConstantFPSDNode>(Src);
 
   if (!SrcC || !isTypeLegal(RoundVT))
     return SDValue();
 
   // fold (fp_round_inreg c1fp) -> c1fp
   // The constant is re-created in the rounding type and then extended back to
   // the result type.
   SDLoc DL(N);
   SDValue Round =
       DAG.getConstantFP(*SrcC->getConstantFPValue(), DL, RoundVT);
   return DAG.getNode(ISD::FP_EXTEND, DL, ResultVT, Round);
 }
 
 /// Combine an ISD::FP_EXTEND node.
 ///
 /// The folds below are tried in order and the first match wins.
 ///
 /// \param N the FP_EXTEND node being visited.
 /// \returns the replacement value, SDValue(N, 0) when N was already replaced
 /// through CombineTo, or an empty SDValue if nothing was folded.
 SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
   // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
   if (N->hasOneUse() &&
       N->use_begin()->getOpcode() == ISD::FP_ROUND)
     return SDValue();
 
   // fold (fp_extend c1fp) -> c1fp
   if (isConstantFPBuildVectorOrConstantFP(N0))
     return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);
 
   // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
   // Only done when FP16_TO_FP producing VT directly is legal on the target.
   if (N0.getOpcode() == ISD::FP16_TO_FP &&
       TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
     return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));
 
   // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
   // value of X.
   if (N0.getOpcode() == ISD::FP_ROUND
       && N0.getConstantOperandVal(1) == 1) {
     SDValue In = N0.getOperand(0);
     // Same type as the result: the round/extend pair cancels out completely.
     if (In.getValueType() == VT) return In;
     // Result narrower than X: a single fp_round (reusing the original second
     // operand) is enough.
     if (VT.bitsLT(In.getValueType()))
       return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
                          In, N0.getOperand(1));
     // Otherwise extend X directly.
     return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
   }
 
   // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
        TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
                                      LN0->getBasePtr(), N0.getValueType(),
                                      LN0->getMemOperand());
     // N itself becomes the extending load.
     CombineTo(N, ExtLoad);
     // The old load is replaced by an fp_round of the extending load (second
     // operand 1) for its value, and by the new load's second result.
     CombineTo(N0.getNode(),
               DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
                           N0.getValueType(), ExtLoad,
                           DAG.getIntPtrConstant(1, SDLoc(N0))),
               ExtLoad.getValue(1));
     return SDValue(N, 0);   // Return N so it doesn't get rechecked!
   }
 
   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
     return NewVSel;
 
   return SDValue();
 }
 
 /// Combine an ISD::FCEIL node. Only a constant operand (scalar or build
 /// vector) is folded.
 SDValue DAGCombiner::visitFCEIL(SDNode *N) {
   SDValue Src = N->getOperand(0);
   if (!isConstantFPBuildVectorOrConstantFP(Src))
     return SDValue();
 
   // fold (fceil c1) -> fceil(c1)
   EVT VT = N->getValueType(0);
   return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, Src);
 }
 
 /// Combine an ISD::FTRUNC node: constant-fold a constant operand and drop the
 /// truncation when the operand is itself the result of a rounding operation.
 SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
   SDValue Src = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
   // fold (ftrunc c1) -> ftrunc(c1)
   if (isConstantFPBuildVectorOrConstantFP(Src))
     return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, Src);
 
   // fold ftrunc (known rounded int x) -> x
   // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is
   // likely to be generated to extract integer from a rounded floating value.
   const unsigned SrcOpc = Src.getOpcode();
   const bool SrcIsRounded =
       SrcOpc == ISD::FRINT || SrcOpc == ISD::FTRUNC ||
       SrcOpc == ISD::FNEARBYINT || SrcOpc == ISD::FFLOOR ||
       SrcOpc == ISD::FCEIL;
   if (SrcIsRounded)
     return Src;
 
   return SDValue();
 }
 
 /// Combine an ISD::FFLOOR node. Only a constant operand (scalar or build
 /// vector) is folded.
 SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
   SDValue Src = N->getOperand(0);
   if (!isConstantFPBuildVectorOrConstantFP(Src))
     return SDValue();
 
   // fold (ffloor c1) -> ffloor(c1)
   EVT VT = N->getValueType(0);
   return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, Src);
 }
 
 // FIXME: FNEG and FABS have a lot in common; refactor.
 /// Combine an ISD::FNEG node.
 ///
 /// Folds tried, in order:
 ///  1. Re-emit the node for a constant operand.
 ///  2. Replace the node by the negated form of the operand when
 ///     isNegatibleForFree reports that this is possible.
 ///  3. fneg(bitcast(int x)) -> bitcast(x ^ signmask) when FNEG is not free.
 ///  4. fneg(fmul x, c) -> fmul x, fneg(c), after DAG legalization only.
 ///
 /// Returns an empty SDValue when nothing applies.
 SDValue DAGCombiner::visitFNEG(SDNode *N) {
   SDValue Src = N->getOperand(0);
   EVT VT = N->getValueType(0);
   const unsigned SrcOpc = Src.getOpcode();
 
   // Constant fold FNEG.
   if (isConstantFPBuildVectorOrConstantFP(Src))
     return DAG.getNode(ISD::FNEG, SDLoc(N), VT, Src);
 
   if (isNegatibleForFree(Src, LegalOperations, DAG.getTargetLoweringInfo(),
                          &DAG.getTarget().Options))
     return GetNegatedExpression(Src, DAG, LegalOperations);
 
   // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
   // constant pool values.
   if (!TLI.isFNegFree(VT) && SrcOpc == ISD::BITCAST &&
       Src.getNode()->hasOneUse()) {
     SDValue IntOp = Src.getOperand(0);
     EVT IntVT = IntOp.getValueType();
     if (IntVT.isInteger() && !IntVT.isVector()) {
       // A vector FP type needs a 0x80... mask per scalar element, splatted
       // across the whole integer; a scalar just needs a single 0x80...
       const bool FPTypeIsVector = Src.getValueType().isVector();
       APInt SignMask =
           FPTypeIsVector
               ? APInt::getSplat(
                     IntVT.getSizeInBits(),
                     APInt::getSignMask(Src.getScalarValueSizeInBits()))
               : APInt::getSignMask(IntVT.getSizeInBits());
 
       SDLoc DL0(Src);
       SDValue MaskC = DAG.getConstant(SignMask, DL0, IntVT);
       SDValue Flipped = DAG.getNode(ISD::XOR, DL0, IntVT, IntOp, MaskC);
       AddToWorklist(Flipped.getNode());
       return DAG.getBitcast(VT, Flipped);
     }
   }
 
   // (fneg (fmul c, x)) -> (fmul -c, x)
   if (SrcOpc == ISD::FMUL &&
       (Src.getNode()->hasOneUse() || !TLI.isFNegFree(VT))) {
     if (ConstantFPSDNode *MulC =
             dyn_cast<ConstantFPSDNode>(Src.getOperand(1))) {
       // Only commit if the negated constant is something the target can
       // materialize, and only once the DAG has been legalized.
       APFloat NegatedC = MulC->getValueAPF();
       NegatedC.changeSign();
       const bool AfterLegalize = Level >= AfterLegalizeDAG;
       if (AfterLegalize && (TLI.isFPImmLegal(NegatedC, VT) ||
                             TLI.isOperationLegal(ISD::ConstantFP, VT))) {
         SDValue NegRHS =
             DAG.getNode(ISD::FNEG, SDLoc(N), VT, Src.getOperand(1));
         return DAG.getNode(ISD::FMUL, SDLoc(N), VT, Src.getOperand(0), NegRHS,
                            Src->getFlags());
       }
     }
   }
 
   return SDValue();
 }
 
 /// Combine an ISD::FMINNUM node.
 ///
 /// Folds two constant (or constant-splat) operands to a constant, and
 /// otherwise moves a lone constant operand to the right-hand side. Returns an
 /// empty SDValue when nothing applies.
 SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
   // Both sides constant: compute the result right away.
   const ConstantFPSDNode *LHSConst = isConstOrConstSplatFP(LHS);
   const ConstantFPSDNode *RHSConst = isConstOrConstSplatFP(RHS);
   if (LHSConst && RHSConst)
     return DAG.getConstantFP(
         minnum(LHSConst->getValueAPF(), RHSConst->getValueAPF()), SDLoc(N),
         VT);
 
   // Canonicalize to constant on RHS.
   const bool OnlyLHSIsConstant = isConstantFPBuildVectorOrConstantFP(LHS) &&
                                  !isConstantFPBuildVectorOrConstantFP(RHS);
   if (OnlyLHSIsConstant)
     return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, RHS, LHS);
 
   return SDValue();
 }
 
 /// Combine an ISD::FMAXNUM node.
 ///
 /// Folds two constant (or constant-splat) operands to a constant, and
 /// otherwise moves a lone constant operand to the right-hand side. Returns an
 /// empty SDValue when nothing applies.
 SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
   // Both sides constant: compute the result right away.
   const ConstantFPSDNode *LHSConst = isConstOrConstSplatFP(LHS);
   const ConstantFPSDNode *RHSConst = isConstOrConstSplatFP(RHS);
   if (LHSConst && RHSConst)
     return DAG.getConstantFP(
         maxnum(LHSConst->getValueAPF(), RHSConst->getValueAPF()), SDLoc(N),
         VT);
 
   // Canonicalize to constant on RHS.
   const bool OnlyLHSIsConstant = isConstantFPBuildVectorOrConstantFP(LHS) &&
                                  !isConstantFPBuildVectorOrConstantFP(RHS);
   if (OnlyLHSIsConstant)
     return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, RHS, LHS);
 
   return SDValue();
 }
 
 /// Combine an ISD::FABS node.
 ///
 /// Folds tried, in order:
 ///  1. Re-emit the node for a constant operand.
 ///  2. fabs(fabs x) -> fabs x.
 ///  3. fabs(fneg x) and fabs(fcopysign x, y) -> fabs x.
 ///  4. fabs(bitcast(int x)) -> bitcast(x & ~signmask) when FABS is not free.
 ///
 /// Returns an empty SDValue when nothing applies.
 SDValue DAGCombiner::visitFABS(SDNode *N) {
   SDValue Src = N->getOperand(0);
   EVT VT = N->getValueType(0);
   const unsigned SrcOpc = Src.getOpcode();
 
   // fold (fabs c1) -> fabs(c1)
   if (isConstantFPBuildVectorOrConstantFP(Src))
     return DAG.getNode(ISD::FABS, SDLoc(N), VT, Src);
 
   // fold (fabs (fabs x)) -> (fabs x)
   if (SrcOpc == ISD::FABS)
     return Src;
 
   // fold (fabs (fneg x)) -> (fabs x)
   // fold (fabs (fcopysign x, y)) -> (fabs x)
   const bool SrcOnlyAltersSign =
       SrcOpc == ISD::FNEG || SrcOpc == ISD::FCOPYSIGN;
   if (SrcOnlyAltersSign)
     return DAG.getNode(ISD::FABS, SDLoc(N), VT, Src.getOperand(0));
 
   // Transform fabs(bitconvert(x)) -> bitconvert(x & ~sign) to avoid loading
   // constant pool values.
   if (!TLI.isFAbsFree(VT) && SrcOpc == ISD::BITCAST &&
       Src.getNode()->hasOneUse()) {
     SDValue IntOp = Src.getOperand(0);
     EVT IntVT = IntOp.getValueType();
     if (IntVT.isInteger() && !IntVT.isVector()) {
       // A vector FP type needs a 0x7f... mask per scalar element, splatted
       // across the whole integer; a scalar just needs a single 0x7f...
       const bool FPTypeIsVector = Src.getValueType().isVector();
       APInt ClearSignMask =
           FPTypeIsVector
               ? APInt::getSplat(
                     IntVT.getSizeInBits(),
                     ~APInt::getSignMask(Src.getScalarValueSizeInBits()))
               : ~APInt::getSignMask(IntVT.getSizeInBits());
 
       SDLoc DL(Src);
       SDValue MaskC = DAG.getConstant(ClearSignMask, DL, IntVT);
       SDValue Cleared = DAG.getNode(ISD::AND, DL, IntVT, IntOp, MaskC);
       AddToWorklist(Cleared.getNode());
       return DAG.getBitcast(VT, Cleared);
     }
   }
 
   return SDValue();
 }
 
 /// Combine an ISD::BRCOND node (operands: chain, condition, destination).
 ///
 /// Either merges a SETCC condition into a BR_CC node when the target accepts
 /// BR_CC for the compared type, or rebuilds a single-use condition as a SETCC
 /// via rebuildSetCC. Returns an empty SDValue when nothing applies.
 SDValue DAGCombiner::visitBRCOND(SDNode *N) {
   SDValue Chain = N->getOperand(0);
   SDValue Cond = N->getOperand(1);
   SDValue Dest = N->getOperand(2);
 
   // A constant condition could in principle be folded into a fallthrough or
   // an unconditional branch. That is rare in normal code (Instcombine /
   // SimplifyCFG should already have handled it), and doing it here would
   // require updating the MachineBasicBlock CFG, which is awkward.
 
   // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
   // on the target.
   if (Cond.getOpcode() == ISD::SETCC) {
     SDValue CmpLHS = Cond.getOperand(0);
     if (TLI.isOperationLegalOrCustom(ISD::BR_CC, CmpLHS.getValueType()))
       return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, Chain,
                          Cond.getOperand(2), CmpLHS, Cond.getOperand(1), Dest);
   }
 
   // Only a condition with no other users is rebuilt.
   if (!Cond.hasOneUse())
     return SDValue();
 
   if (SDValue NewCond = rebuildSetCC(Cond))
     return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, NewCond,
                        Dest);
 
   return SDValue();
 }
 
 /// Try to rewrite the branch condition \p N as an explicit SETCC node.
 ///
 /// Two shapes are recognized:
 ///  * (srl (and x, Pow2), log2(Pow2)), either directly or as the single-use
 ///    operand of a TRUNCATE, becomes (setcc ne (and x, Pow2), 0).
 ///  * (xor x, y) where neither operand is a SETCC becomes a SETNE/SETEQ
 ///    comparison of the two operands.
 ///
 /// Returns the replacement condition, or an empty SDValue when nothing
 /// matched. If running visitXOR on an XOR condition simplifies it into a
 /// non-XOR node, that simplified node is returned as-is.
 SDValue DAGCombiner::rebuildSetCC(SDValue N) {
   if (N.getOpcode() == ISD::SRL ||
       (N.getOpcode() == ISD::TRUNCATE &&
        (N.getOperand(0).hasOneUse() &&
         N.getOperand(0).getOpcode() == ISD::SRL))) {
     // Look past the truncate.
     if (N.getOpcode() == ISD::TRUNCATE)
       N = N.getOperand(0);
 
     // Match this pattern so that we can generate simpler code:
     //
     //   %a = ...
     //   %b = and i32 %a, 2
     //   %c = srl i32 %b, 1
     //   brcond i32 %c ...
     //
     // into
     //
     //   %a = ...
     //   %b = and i32 %a, 2
     //   %c = setcc ne %b, 0
     //   brcond %c ...
     //
     // This applies only when the AND constant value has one bit set and the
     // SRL constant is equal to the log2 of the AND constant. The back-end is
     // smart enough to convert the result into a TEST/JMP sequence.
     SDValue Op0 = N.getOperand(0);
     SDValue Op1 = N.getOperand(1);
 
     if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) {
       SDValue AndOp1 = Op0.getOperand(1);
 
       if (AndOp1.getOpcode() == ISD::Constant) {
         const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
 
         if (AndConst.isPowerOf2() &&
             cast<ConstantSDNode>(Op1)->getAPIntValue() == AndConst.logBase2()) {
           SDLoc DL(N);
           // The shifted-out single bit is set exactly when the AND result is
           // non-zero, hence the comparison against zero with SETNE.
           return DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()),
                               Op0, DAG.getConstant(0, DL, Op0.getValueType()),
                               ISD::SETNE);
         }
       }
     }
   }
 
   // Transform br(xor(x, y)) -> br(x != y)
   // Transform br(xor(xor(x,y), 1)) -> br (x == y)
   if (N.getOpcode() == ISD::XOR) {
     // Because we may call this on a speculatively constructed
     // SimplifiedSetCC Node, we need to simplify this node first.
     // Ideally this should be folded into SimplifySetCC and not
     // here. For now, grab a handle to N so we don't lose it from
     // replacements internal to the visit.
     HandleSDNode XORHandle(N);
     // Keep simplifying for as long as the condition is still an XOR and
     // visitXOR keeps making progress.
     while (N.getOpcode() == ISD::XOR) {
       SDValue Tmp = visitXOR(N.getNode());
       // No simplification done.
       if (!Tmp.getNode())
         break;
       // Returning N means an in-visit replacement happened that may have
       // invalidated N. Grab the current value from the handle.
       if (Tmp.getNode() == N.getNode())
         N = XORHandle.getValue();
       else // Node simplified. Try simplifying again.
         N = Tmp;
     }
 
     // The XOR was simplified into something else; hand that back unchanged.
     if (N.getOpcode() != ISD::XOR)
       return N;
 
     SDNode *TheXor = N.getNode();
 
     SDValue Op0 = TheXor->getOperand(0);
     SDValue Op1 = TheXor->getOperand(1);
 
     if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
       bool Equal = false;
       // NOTE(review): this requires Op0 to be a one-constant and an XOR node
       // at the same time; if isOneConstant only matches constant nodes the
       // branch can never be taken and Equal stays false. Op0/Op1 are also
       // not refreshed after TheXor is reassigned. Confirm intent.
       if (isOneConstant(Op0) && Op0.hasOneUse() &&
           Op0.getOpcode() == ISD::XOR) {
         TheXor = Op0.getNode();
         Equal = true;
       }
 
       // The XOR's own type is used for the SETCC unless types have already
       // been legalized, in which case the setcc result type for that type
       // is used.
       EVT SetCCVT = N.getValueType();
       if (LegalTypes)
         SetCCVT = getSetCCResultType(SetCCVT);
       // Replace the uses of XOR with SETCC
       return DAG.getSetCC(SDLoc(TheXor), SetCCVT, Op0, Op1,
                           Equal ? ISD::SETEQ : ISD::SETNE);
     }
   }
 
   return SDValue();
 }
 
 /// Combine an ISD::BR_CC node by simplifying its embedded comparison.
 ///
 /// Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
 ///
 /// Returns a new BR_CC built from the simplified comparison when
 /// SimplifySetCC produces a SETCC node, and an empty SDValue otherwise.
 SDValue DAGCombiner::visitBR_CC(SDNode *N) {
   CondCodeSDNode *CondCode = cast<CondCodeSDNode>(N->getOperand(1));
   SDValue LHS = N->getOperand(2);
   SDValue RHS = N->getOperand(3);
 
   // A constant condition could in principle be folded into a fallthrough or
   // an unconditional branch. That is rare in normal code (Instcombine /
   // SimplifyCFG should already have handled it), and doing it here would
   // require updating the MachineBasicBlock CFG, which is awkward.
 
   // Use SimplifySetCC to simplify SETCC's.
   EVT CmpResultVT = getSetCCResultType(LHS.getValueType());
   SDValue Simplified =
       SimplifySetCC(CmpResultVT, LHS, RHS, CondCode->get(), SDLoc(N), false);
   if (!Simplified.getNode())
     return SDValue();
 
   // Whatever came back is queued for another look, even if it cannot be
   // used below.
   AddToWorklist(Simplified.getNode());
 
   // Only a result that is still a setcc can be folded back into a BR_CC.
   if (Simplified.getOpcode() != ISD::SETCC)
     return SDValue();
 
   // fold to a simpler setcc
   return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other, N->getOperand(0),
                      Simplified.getOperand(2), Simplified.getOperand(0),
                      Simplified.getOperand(1), N->getOperand(4));
 }
 
 /// Return true if 'Use' is a non-indexed load or store whose base pointer is
 /// N, and the ADD/SUB computed by N forms an addressing mode the target
 /// reports as legal for that memory access.
 ///
 /// A constant second operand of N is modelled as a base offset (negated for
 /// SUB); any other second operand is modelled as a register with scale 1.
 static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
                                     SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
   EVT MemVT;
   unsigned AddrSpace;
 
   // Pick up the memory type and address space from the memory operation, and
   // reject anything that is indexed or does not address through N.
   if (LoadSDNode *Load = dyn_cast<LoadSDNode>(Use)) {
     if (Load->isIndexed() || Load->getBasePtr().getNode() != N)
       return false;
     MemVT = Load->getMemoryVT();
     AddrSpace = Load->getAddressSpace();
   } else if (StoreSDNode *Store = dyn_cast<StoreSDNode>(Use)) {
     if (Store->isIndexed() || Store->getBasePtr().getNode() != N)
       return false;
     MemVT = Store->getMemoryVT();
     AddrSpace = Store->getAddressSpace();
   } else {
     return false;
   }
 
   // Only add/sub address computations are candidates.
   const unsigned Opc = N->getOpcode();
   if (Opc != ISD::ADD && Opc != ISD::SUB)
     return false;
 
   TargetLowering::AddrMode AM;
   if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
     // [reg +/- imm]
     const int64_t Disp = Imm->getSExtValue();
     AM.BaseOffs = (Opc == ISD::SUB) ? -Disp : Disp;
   } else {
     // [reg +/- reg]
     AM.Scale = 1;
   }
 
   return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
                                    MemVT.getTypeForEVT(*DAG.getContext()),
                                    AddrSpace);
 }
 
 /// Try turning a load/store into a pre-indexed load/store when the base
 /// pointer is an add or subtract and it has other uses besides the load/store.
 /// After the transformation, the new indexed load/store has effectively folded
 /// the add/subtract in and all of its other uses are redirected to the
 /// new load/store.
 ///
 /// \param N the load or store to transform; any other node kind is rejected.
 /// \returns true if \p N was replaced by an indexed load/store (in which case
 /// \p N and its old address node have been handed to deleteAndRecombine),
 /// false if nothing was changed. Nothing is attempted before Level has
 /// reached AfterLegalizeDAG.
 bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
   if (Level < AfterLegalizeDAG)
     return false;
 
   // Classify N and make sure the target supports at least one pre-indexed
   // form (increment or decrement) for its memory type.
   bool isLoad = true;
   SDValue Ptr;
   EVT VT;
   if (LoadSDNode *LD  = dyn_cast<LoadSDNode>(N)) {
     if (LD->isIndexed())
       return false;
     VT = LD->getMemoryVT();
     if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) &&
         !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT))
       return false;
     Ptr = LD->getBasePtr();
   } else if (StoreSDNode *ST  = dyn_cast<StoreSDNode>(N)) {
     if (ST->isIndexed())
       return false;
     VT = ST->getMemoryVT();
     if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) &&
         !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT))
       return false;
     Ptr = ST->getBasePtr();
     isLoad = false;
   } else {
     return false;
   }
 
   // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
   // out.  There is no reason to make this a preinc/predec.
   if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
       Ptr.getNode()->hasOneUse())
     return false;
 
   // Ask the target to do addressing mode selection.
   SDValue BasePtr;
   SDValue Offset;
   ISD::MemIndexedMode AM = ISD::UNINDEXED;
   if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
     return false;
 
   // Backends without true r+i pre-indexed forms may need to pass a
   // constant base with a variable offset so that constant coercion
   // will work with the patterns in canonical form.
   // For the checks below, swap so that BasePtr names the non-constant part;
   // Swapped records that the target's own order has to be restored later.
   bool Swapped = false;
   if (isa<ConstantSDNode>(BasePtr)) {
     std::swap(BasePtr, Offset);
     Swapped = true;
   }
 
   // Don't create an indexed load / store with zero offset.
   if (isNullConstant(Offset))
     return false;
 
   // Try turning it into a pre-indexed load / store except when:
   // 1) The new base ptr is a frame index.
   // 2) If N is a store and the new base ptr is either the same as or is a
   //    predecessor of the value being stored.
   // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
   //    that would create a cycle.
   // 4) All uses are load / store ops that use it as old base ptr.
 
   // Check #1.  Preinc'ing a frame index would require copying the stack pointer
   // (plus the implicit offset) to a register to preinc anyway.
   // A RegisterSDNode base is rejected here as well.
   if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
     return false;
 
   // Check #2.
   if (!isLoad) {
     SDValue Val = cast<StoreSDNode>(N)->getValue();
     if (Val == BasePtr || BasePtr.getNode()->isPredecessorOf(Val.getNode()))
       return false;
   }
 
   // Caches for hasPredecessorHelper.
   // Both are seeded with N and shared by every predecessor query below.
   SmallPtrSet<const SDNode *, 32> Visited;
   SmallVector<const SDNode *, 16> Worklist;
   Worklist.push_back(N);
 
   // If the offset is a constant, there may be other adds of constants that
   // can be folded with this one. We should do this to avoid having to keep
   // a copy of the original base pointer.
   // OtherUses collects those add/sub-with-constant users of BasePtr. The
   // collection is all-or-nothing: the first user that does not fit clears
   // the list and stops the scan.
   SmallVector<SDNode *, 16> OtherUses;
   if (isa<ConstantSDNode>(Offset))
     for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(),
                               UE = BasePtr.getNode()->use_end();
          UI != UE; ++UI) {
       SDUse &Use = UI.getUse();
       // Skip the use that is Ptr and uses of other results from BasePtr's
       // node (important for nodes that return multiple results).
       if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
         continue;
 
       // Users for which hasPredecessorHelper reports N are left untouched.
       if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
         continue;
 
       if (Use.getUser()->getOpcode() != ISD::ADD &&
           Use.getUser()->getOpcode() != ISD::SUB) {
         OtherUses.clear();
         break;
       }
 
       // The user is a two-operand add/sub: (OperandNo + 1) & 1 selects the
       // operand that is not this use of BasePtr.
       SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1);
       if (!isa<ConstantSDNode>(Op1)) {
         OtherUses.clear();
         break;
       }
 
       // FIXME: In some cases, we can be smarter about this.
       if (Op1.getValueType() != Offset.getValueType()) {
         OtherUses.clear();
         break;
       }
 
       OtherUses.push_back(Use.getUser());
     }
 
   // Restore the operand order the target returned before building the
   // indexed node.
   if (Swapped)
     std::swap(BasePtr, Offset);
 
   // Now check for #3 and #4.
   bool RealUse = false;
 
   for (SDNode *Use : Ptr.getNode()->uses()) {
     if (Use == N)
       continue;
     if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
       return false;
 
     // If Ptr may be folded in addressing mode of other use, then it's
     // not profitable to do this transformation.
     if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
       RealUse = true;
   }
 
   // Every other user could fold Ptr into its own addressing mode, so there
   // is nothing to gain.
   if (!RealUse)
     return false;
 
   SDValue Result;
   if (isLoad)
     Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
                                 BasePtr, Offset, AM);
   else
     Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
                                  BasePtr, Offset, AM);
   ++PreIndexedNodes;
   ++NodesCombined;
   LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: ";
              Result.getNode()->dump(&DAG); dbgs() << '\n');
   WorklistRemover DeadNodes(*this);
   // Redirect N's results to the indexed node. For a load, the loaded value
   // maps to result 0 and the chain to result 2; for a store, the chain maps
   // to result 1. The remaining result (1 for loads, 0 for stores) is the
   // updated base value used further down.
   if (isLoad) {
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
   } else {
     DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
   }
 
   // Finally, since the node is now dead, remove it from the graph.
   deleteAndRecombine(N);
 
   // Go back to the swapped view (BasePtr = non-constant part, Offset =
   // constant) for rewriting the collected uses.
   if (Swapped)
     std::swap(BasePtr, Offset);
 
   // Replace other uses of BasePtr that can be updated to use Ptr
   for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
     // Work out which operand of the user is the constant and which is
     // BasePtr.
     unsigned OffsetIdx = 1;
     if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
       OffsetIdx = 0;
     assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
            BasePtr.getNode() && "Expected BasePtr operand");
 
     // We need to replace ptr0 in the following expression:
     //   x0 * offset0 + y0 * ptr0 = t0
     // knowing that
     //   x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
     //
     // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
     // indexed load/store and the expression that needs to be re-written.
     //
     // Therefore, we have:
     //   t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1
 
     ConstantSDNode *CN =
       cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
     int X0, X1, Y0, Y1;
     const APInt &Offset0 = CN->getAPIntValue();
     APInt Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();
 
     // Signs for the user expression (X0, Y0) come from whether it is a SUB
     // and which side the constant is on; signs for the indexed access
     // (X1, Y1) come from PRE_DEC and whether the operands were swapped.
     X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
     Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
     X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
     Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;
 
     // The new node is "constant op t1"; it is a SUB when (y0 * y1) is -1.
     unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD;
 
     // CNV = x0 * offset0 - x1 * y0 * y1 * offset1, evaluated in APInt
     // arithmetic.
     APInt CNV = Offset0;
     if (X0 < 0) CNV = -CNV;
     if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
     else CNV = CNV - Offset1;
 
     SDLoc DL(OtherUses[i]);
 
     // We can now generate the new expression.
     SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
     SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0);
 
     SDValue NewUse = DAG.getNode(Opcode,
                                  DL,
                                  OtherUses[i]->getValueType(0), NewOp1, NewOp2);
     DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
     deleteAndRecombine(OtherUses[i]);
   }
 
   // Replace the uses of Ptr with uses of the updated base value.
   DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
   deleteAndRecombine(Ptr.getNode());
   AddToWorklist(Result.getNode());
 
   return true;
 }
 
 /// Try to combine a load/store with a add/sub of the base pointer node into a
 /// post-indexed load/store. The transformation folded the add/subtract into the
 /// new indexed load/store effectively and all of its uses are redirected to the
 /// new load/store.
 ///
 /// \param N the load or store to transform; any other node kind is rejected.
 /// \returns true if \p N was replaced by an indexed load/store (in which case
 /// \p N and the folded add/sub have been handed to deleteAndRecombine), false
 /// if nothing was changed. Nothing is attempted before Level has reached
 /// AfterLegalizeDAG.
 bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
   if (Level < AfterLegalizeDAG)
     return false;
 
   // Classify N and make sure the target supports at least one post-indexed
   // form (increment or decrement) for its memory type.
   bool isLoad = true;
   SDValue Ptr;
   EVT VT;
   if (LoadSDNode *LD  = dyn_cast<LoadSDNode>(N)) {
     if (LD->isIndexed())
       return false;
     VT = LD->getMemoryVT();
     if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) &&
         !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT))
       return false;
     Ptr = LD->getBasePtr();
   } else if (StoreSDNode *ST  = dyn_cast<StoreSDNode>(N)) {
     if (ST->isIndexed())
       return false;
     VT = ST->getMemoryVT();
     if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) &&
         !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT))
       return false;
     Ptr = ST->getBasePtr();
     isLoad = false;
   } else {
     return false;
   }
 
   // N is the only user of the pointer, so there is no add/sub of it to fold.
   if (Ptr.getNode()->hasOneUse())
     return false;
 
   // Consider every other add/sub user of the pointer as the candidate
   // increment/decrement to fold; the first one that passes all checks wins.
   for (SDNode *Op : Ptr.getNode()->uses()) {
     if (Op == N ||
         (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB))
       continue;
 
     SDValue BasePtr;
     SDValue Offset;
     ISD::MemIndexedMode AM = ISD::UNINDEXED;
     if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) {
       // Don't create an indexed load / store with zero offset.
       if (isNullConstant(Offset))
         continue;
 
       // Try turning it into a post-indexed load / store except when
       // 1) All uses are load / store ops that use it as base ptr (and
       //    it may be folded as addressing mode).
       // 2) Op must be independent of N, i.e. Op is neither a predecessor
       //    nor a successor of N. Otherwise, if Op is folded that would
       //    create a cycle.
 
       // Frame-index and register bases are never turned into indexed
       // accesses.
       if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
         continue;
 
       // Check for #1.
       bool TryNext = false;
       for (SDNode *Use : BasePtr.getNode()->uses()) {
         if (Use == Ptr.getNode())
           continue;
 
         // If all the uses are load / store addresses, then don't do the
         // transformation.
         if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){
           // RealUse becomes true as soon as one user of this add/sub cannot
           // fold it into its own addressing mode.
           bool RealUse = false;
           for (SDNode *UseUse : Use->uses()) {
             if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI))
               RealUse = true;
           }
 
           if (!RealUse) {
             TryNext = true;
             break;
           }
         }
       }
 
       if (TryNext)
         continue;
 
       // Check for #2
       if (!Op->isPredecessorOf(N) && !N->isPredecessorOf(Op)) {
         SDValue Result = isLoad
           ? DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
                                BasePtr, Offset, AM)
           : DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
                                 BasePtr, Offset, AM);
         ++PostIndexedNodes;
         ++NodesCombined;
         LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG);
                    dbgs() << "\nWith: "; Result.getNode()->dump(&DAG);
                    dbgs() << '\n');
         WorklistRemover DeadNodes(*this);
         // Redirect N's results to the indexed node. For a load, the loaded
         // value maps to result 0 and the chain to result 2; for a store, the
         // chain maps to result 1. The remaining result (1 for loads, 0 for
         // stores) is the updated base value used below.
         if (isLoad) {
           DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
           DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
         } else {
           DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
         }
 
         // Finally, since the node is now dead, remove it from the graph.
         deleteAndRecombine(N);
 
         // Replace the uses of Op with uses of the updated base value.
         DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
                                       Result.getValue(isLoad ? 1 : 0));
         deleteAndRecombine(Op);
         return true;
       }
     }
   }
 
   return false;
 }
 
 /// Return the base-pointer arithmetic from an indexed \p LD.
 ///
 /// Builds a standalone ADD (for PRE_INC / POST_INC) or SUB (for the
 /// decrementing modes) of the load's base pointer and its offset operand.
 /// \p LD must be indexed, and its offset must not be an opaque
 /// TargetConstant; both are asserted.
 SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
   ISD::MemIndexedMode AM = LD->getAddressingMode();
   assert(AM != ISD::UNINDEXED);
   SDValue Base = LD->getOperand(1);
   SDValue Inc = LD->getOperand(2);
   const bool IncIsTargetConstant = Inc.getOpcode() == ISD::TargetConstant;
 
   // Some backends use TargetConstants for load offsets, but don't expect
   // TargetConstants in general ADD nodes. Such an offset is re-created as a
   // regular Constant, which is only possible when it is not opaque.
   assert((Inc.getOpcode() != ISD::TargetConstant ||
           !cast<ConstantSDNode>(Inc)->isOpaque()) &&
          "Cannot split out indexing using opaque target constants");
   if (IncIsTargetConstant) {
     ConstantSDNode *TargetInc = cast<ConstantSDNode>(Inc);
     Inc = DAG.getConstant(*TargetInc->getConstantIntValue(), SDLoc(Inc),
                           TargetInc->getValueType(0));
   }
 
   // Incrementing modes add the offset; the remaining modes subtract it.
   const bool IsIncrement = AM == ISD::PRE_INC || AM == ISD::POST_INC;
   unsigned Opc = IsIncrement ? ISD::ADD : ISD::SUB;
   return DAG.getNode(Opc, SDLoc(LD), Base.getSimpleValueType(), Base, Inc);
 }
 
 /// Combine a LOAD node.
 ///
 /// The following are attempted in order:
 ///  1. Delete a non-volatile load whose loaded value is unused (for indexed
 ///     loads, optionally splitting the pointer update out into an add/sub).
 ///  2. Forward the value of a store that is this load's chain and writes the
 ///     same pointer with the same value type.
 ///  3. Raise the load's alignment to what can be inferred from the pointer.
 ///  4. Re-issue an unindexed load on a better chain from FindBetterChain.
 ///  5. Turn the load into a pre- or post-indexed load.
 ///  6. Slice the load into smaller loads.
 ///
 /// \returns SDValue(N, 0) when the DAG was updated by this function directly,
 /// the result of CombineTo when a replacement was made that way, or an empty
 /// SDValue when nothing changed.
 SDValue DAGCombiner::visitLOAD(SDNode *N) {
   LoadSDNode *LD  = cast<LoadSDNode>(N);
   SDValue Chain = LD->getChain();
   SDValue Ptr   = LD->getBasePtr();
 
   // If load is not volatile and there are no uses of the loaded value (and
   // the updated indexed value in case of indexed loads), change uses of the
   // chain value into uses of the chain input (i.e. delete the dead load).
   if (!LD->isVolatile()) {
     // Result 1 being the chain (MVT::Other) identifies an unindexed load;
     // indexed loads carry the chain as result 2.
     if (N->getValueType(1) == MVT::Other) {
       // Unindexed loads.
       if (!N->hasAnyUseOfValue(0)) {
         // It's not safe to use the two value CombineTo variant here. e.g.
         // v1, chain2 = load chain1, loc
         // v2, chain3 = load chain2, loc
         // v3         = add v2, c
         // Now we replace use of chain2 with chain1.  This makes the second load
         // isomorphic to the one we are deleting, and thus makes this load live.
         LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG);
                    dbgs() << "\nWith chain: "; Chain.getNode()->dump(&DAG);
                    dbgs() << "\n");
         WorklistRemover DeadNodes(*this);
         DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
         AddUsersToWorklist(Chain.getNode());
         // Only delete N if the replacement really left it without users
         // (see the example above for how it can become live again).
         if (N->use_empty())
           deleteAndRecombine(N);
 
         return SDValue(N, 0);   // Return N so it doesn't get rechecked!
       }
     } else {
       // Indexed loads.
       assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
 
       // If this load has an opaque TargetConstant offset, then we cannot split
       // the indexing into an add/sub directly (that TargetConstant may not be
       // valid for a different type of node, and we cannot convert an opaque
       // target constant into a regular constant).
       bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant &&
                        cast<ConstantSDNode>(LD->getOperand(2))->isOpaque();
 
       // The load can go away when its value is unused and the updated
       // pointer is either unused too or can be recomputed separately.
       if (!N->hasAnyUseOfValue(0) &&
           ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) {
         SDValue Undef = DAG.getUNDEF(N->getValueType(0));
         SDValue Index;
         if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) {
           Index = SplitIndexingFromLoad(LD);
           // Try to fold the base pointer arithmetic into subsequent loads and
           // stores.
           AddUsersToWorklist(N);
         } else
           Index = DAG.getUNDEF(N->getValueType(1));
         LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG);
                    dbgs() << "\nWith: "; Undef.getNode()->dump(&DAG);
                    dbgs() << " and 2 other values\n");
         WorklistRemover DeadNodes(*this);
         // Results of an indexed load: 0 = value, 1 = updated pointer,
         // 2 = chain.
         DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
         DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
         DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
         deleteAndRecombine(N);
         return SDValue(N, 0);   // Return N so it doesn't get rechecked!
       }
     }
   }
 
   // If this load is directly stored, replace the load value with the stored
   // value.
   // TODO: Handle store large -> read small portion.
   // TODO: Handle TRUNCSTORE/LOADEXT
   if (OptLevel != CodeGenOpt::None &&
       ISD::isNormalLoad(N) && !LD->isVolatile()) {
     if (ISD::isNON_TRUNCStore(Chain.getNode())) {
       StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
       // The store must write exactly this pointer and a value of the type
       // being loaded; operand 1 of the store is then forwarded as the
       // loaded value.
       if (PrevST->getBasePtr() == Ptr &&
           PrevST->getValue().getValueType() == N->getValueType(0))
         return CombineTo(N, PrevST->getOperand(1), Chain);
     }
   }
 
   // Try to infer better alignment information than the load already has.
   if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
     if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
       if (Align > LD->getAlignment() && LD->getSrcValueOffset() % Align == 0) {
         // The call is made for its effect on the existing node; the returned
         // value is only checked by the assert below.
         SDValue NewLoad = DAG.getExtLoad(
             LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
             LD->getPointerInfo(), LD->getMemoryVT(), Align,
             LD->getMemOperand()->getFlags(), LD->getAAInfo());
         // NewLoad will always be N as we are only refining the alignment
         assert(NewLoad.getNode() == N);
         (void)NewLoad;
       }
     }
   }
 
   if (LD->isUnindexed()) {
     // Walk up chain skipping non-aliasing memory nodes.
     SDValue BetterChain = FindBetterChain(N, Chain);
 
     // If there is a better chain.
     if (Chain != BetterChain) {
       SDValue ReplLoad;
 
       // Replace the chain to avoid dependency.
       if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
         ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
                                BetterChain, Ptr, LD->getMemOperand());
       } else {
         ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
                                   LD->getValueType(0),
                                   BetterChain, Ptr, LD->getMemoryVT(),
                                   LD->getMemOperand());
       }
 
       // Create token factor to keep old chain connected.
       SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
                                   MVT::Other, Chain, ReplLoad.getValue(1));
 
       // Replace uses with load result and token factor
       return CombineTo(N, ReplLoad.getValue(0), Token);
     }
   }
 
   // Try transforming N to an indexed load.
   if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
     return SDValue(N, 0);
 
   // Try to slice up N to more direct loads if the slices are mapped to
   // different register banks or pairing can take place.
   if (SliceUpLoad(N))
     return SDValue(N, 0);
 
   return SDValue();
 }
 
 namespace {
 
 /// Helper structure used to slice a load in smaller loads.
 /// Basically a slice is obtained from the following sequence:
 /// Origin = load Ty1, Base
 /// Shift = srl Ty1 Origin, CstTy Amount
 /// Inst = trunc Shift to Ty2
 ///
 /// Then, it will be rewritten into:
 /// Slice = load SliceTy, Base + SliceOffset
 /// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
 ///
 /// SliceTy is deduced from the number of bits that are actually used to
 /// build Inst.
 struct LoadedSlice {
   /// Helper structure used to compute the cost of a slice.
   struct Cost {
     /// Are we optimizing for code size.
     bool ForCodeSize;
 
     /// Various cost.
     unsigned Loads = 0;
     unsigned Truncates = 0;
     unsigned CrossRegisterBanksCopies = 0;
     unsigned ZExts = 0;
     unsigned Shift = 0;
 
     Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize) {}
 
     /// Get the cost of one isolated slice.
     Cost(const LoadedSlice &LS, bool ForCodeSize = false)
         : ForCodeSize(ForCodeSize), Loads(1) {
       EVT TruncType = LS.Inst->getValueType(0);
       EVT LoadedType = LS.getLoadedType();
       if (TruncType != LoadedType &&
           !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
         ZExts = 1;
     }
 
     /// Account for slicing gain in the current cost.
     /// Slicing provide a few gains like removing a shift or a
     /// truncate. This method allows to grow the cost of the original
     /// load with the gain from this slice.
     void addSliceGain(const LoadedSlice &LS) {
       // Each slice saves a truncate.
       const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
       if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
                               LS.Inst->getValueType(0)))
         ++Truncates;
       // If there is a shift amount, this slice gets rid of it.
       if (LS.Shift)
         ++Shift;
       // If this slice can merge a cross register bank copy, account for it.
       if (LS.canMergeExpensiveCrossRegisterBankCopy())
         ++CrossRegisterBanksCopies;
     }
 
     Cost &operator+=(const Cost &RHS) {
       Loads += RHS.Loads;
       Truncates += RHS.Truncates;
       CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
       ZExts += RHS.ZExts;
       Shift += RHS.Shift;
       return *this;
     }
 
     bool operator==(const Cost &RHS) const {
       return Loads == RHS.Loads && Truncates == RHS.Truncates &&
              CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
              ZExts == RHS.ZExts && Shift == RHS.Shift;
     }
 
     bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
 
     bool operator<(const Cost &RHS) const {
       // Assume cross register banks copies are as expensive as loads.
       // FIXME: Do we want some more target hooks?
       unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
       unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
       // Unless we are optimizing for code size, consider the
       // expensive operation first.
       if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
         return ExpensiveOpsLHS < ExpensiveOpsRHS;
       return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
              (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
     }
 
     bool operator>(const Cost &RHS) const { return RHS < *this; }
 
     bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
 
     bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
   };
 
   // The last instruction that represent the slice. This should be a
   // truncate instruction.
   SDNode *Inst;
 
   // The original load instruction.
   LoadSDNode *Origin;
 
   // The right shift amount in bits from the original load.
   unsigned Shift;
 
   // The DAG from which Origin came from.
   // This is used to get some contextual information about legal types, etc.
   SelectionDAG *DAG;
 
   LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr,
               unsigned Shift = 0, SelectionDAG *DAG = nullptr)
       : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
 
   /// Get the bits used in a chunk of bits \p BitWidth large.
   /// \return Result is \p BitWidth and has used bits set to 1 and
   ///         not used bits set to 0.
   APInt getUsedBits() const {
     // Reproduce the trunc(lshr) sequence:
     // - Start from the truncated value.
     // - Zero extend to the desired bit width.
     // - Shift left.
     assert(Origin && "No original load to compare against.");
     unsigned BitWidth = Origin->getValueSizeInBits(0);
     assert(Inst && "This slice is not bound to an instruction");
     assert(Inst->getValueSizeInBits(0) <= BitWidth &&
            "Extracted slice is bigger than the whole type!");
     APInt UsedBits(Inst->getValueSizeInBits(0), 0);
     UsedBits.setAllBits();
     UsedBits = UsedBits.zext(BitWidth);
     UsedBits <<= Shift;
     return UsedBits;
   }
 
   /// Get the size of the slice to be loaded in bytes.
   unsigned getLoadedSize() const {
     unsigned SliceSize = getUsedBits().countPopulation();
     assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
     return SliceSize / 8;
   }
 
   /// Get the type that will be loaded for this slice.
   /// Note: This may not be the final type for the slice.
   EVT getLoadedType() const {
     assert(DAG && "Missing context");
     LLVMContext &Ctxt = *DAG->getContext();
     return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
   }
 
   /// Get the alignment of the load used for this slice.
   unsigned getAlignment() const {
     unsigned Alignment = Origin->getAlignment();
     unsigned Offset = getOffsetFromBase();
     if (Offset != 0)
       Alignment = MinAlign(Alignment, Alignment + Offset);
     return Alignment;
   }
 
   /// Check if this slice can be rewritten with legal operations.
   bool isLegal() const {
     // An invalid slice is not legal.
     if (!Origin || !Inst || !DAG)
       return false;
 
     // Offsets are for indexed load only, we do not handle that.
     if (!Origin->getOffset().isUndef())
       return false;
 
     const TargetLowering &TLI = DAG->getTargetLoweringInfo();
 
     // Check that the type is legal.
     EVT SliceType = getLoadedType();
     if (!TLI.isTypeLegal(SliceType))
       return false;
 
     // Check that the load is legal for this type.
     if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
       return false;
 
     // Check that the offset can be computed.
     // 1. Check its type.
     EVT PtrType = Origin->getBasePtr().getValueType();
     if (PtrType == MVT::Untyped || PtrType.isExtended())
       return false;
 
     // 2. Check that it fits in the immediate.
     if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
       return false;
 
     // 3. Check that the computation is legal.
     if (!TLI.isOperationLegal(ISD::ADD, PtrType))
       return false;
 
     // Check that the zext is legal if it needs one.
     EVT TruncateType = Inst->getValueType(0);
     if (TruncateType != SliceType &&
         !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
       return false;
 
     return true;
   }
 
   /// Get the offset in bytes of this slice in the original chunk of
   /// bits.
   /// \pre DAG != nullptr.
   uint64_t getOffsetFromBase() const {
     assert(DAG && "Missing context.");
     bool IsBigEndian = DAG->getDataLayout().isBigEndian();
     assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
     uint64_t Offset = Shift / 8;
     unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
     assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
            "The size of the original loaded type is not a multiple of a"
            " byte.");
     // If Offset is bigger than TySizeInBytes, it means we are loading all
     // zeros. This should have been optimized before in the process.
     assert(TySizeInBytes > Offset &&
            "Invalid shift amount for given loaded size");
     if (IsBigEndian)
       Offset = TySizeInBytes - Offset - getLoadedSize();
     return Offset;
   }
 
   /// Generate the sequence of instructions to load the slice
   /// represented by this object and redirect the uses of this slice to
   /// this new sequence of instructions.
   /// \pre this->Inst && this->Origin are valid Instructions and this
   /// object passed the legal check: LoadedSlice::isLegal returned true.
   /// \return The last instruction of the sequence used to load the slice.
   SDValue loadSlice() const {
     assert(Inst && Origin && "Unable to replace a non-existing slice.");
     const SDValue &OldBaseAddr = Origin->getBasePtr();
     SDValue BaseAddr = OldBaseAddr;
     // Get the offset in that chunk of bytes w.r.t. the endianness.
     int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
     assert(Offset >= 0 && "Offset too big to fit in int64_t!");
     if (Offset) {
       // BaseAddr = BaseAddr + Offset.
       EVT ArithType = BaseAddr.getValueType();
       SDLoc DL(Origin);
       BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr,
                               DAG->getConstant(Offset, DL, ArithType));
     }
 
     // Create the type of the loaded slice according to its size.
     EVT SliceType = getLoadedType();
 
     // Create the load for the slice.
     SDValue LastInst =
         DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
                      Origin->getPointerInfo().getWithOffset(Offset),
                      getAlignment(), Origin->getMemOperand()->getFlags());
     // If the final type is not the same as the loaded type, this means that
     // we have to pad with zero. Create a zero extend for that.
     EVT FinalType = Inst->getValueType(0);
     if (SliceType != FinalType)
       LastInst =
           DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
     return LastInst;
   }
 
   /// Check if this slice can be merged with an expensive cross register
   /// bank copy. E.g.,
   /// i = load i32
   /// f = bitcast i32 i to float
   bool canMergeExpensiveCrossRegisterBankCopy() const {
     if (!Inst || !Inst->hasOneUse())
       return false;
     SDNode *Use = *Inst->use_begin();
     if (Use->getOpcode() != ISD::BITCAST)
       return false;
     assert(DAG && "Missing context");
     const TargetLowering &TLI = DAG->getTargetLoweringInfo();
     EVT ResVT = Use->getValueType(0);
     const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
     const TargetRegisterClass *ArgRC =
         TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
     if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
       return false;
 
     // At this point, we know that we perform a cross-register-bank copy.
     // Check if it is expensive.
     const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo();
     // Assume bitcasts are cheap, unless both register classes do not
     // explicitly share a common sub class.
     if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
       return false;
 
     // Check if it will be merged with the load.
     // 1. Check the alignment constraint.
     unsigned RequiredAlignment = DAG->getDataLayout().getABITypeAlignment(
         ResVT.getTypeForEVT(*DAG->getContext()));
 
     if (RequiredAlignment > getAlignment())
       return false;
 
     // 2. Check that the load is a legal operation for that type.
     if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
       return false;
 
     // 3. Check that we do not have a zext in the way.
     if (Inst->getValueType(0) != getLoadedType())
       return false;
 
     return true;
   }
 };
 
 } // end anonymous namespace
 
 /// Check that all bits set in \p UsedBits form a dense region, i.e.,
 /// \p UsedBits looks like 0..0 1..1 0..0.
 static bool areUsedBitsDense(const APInt &UsedBits) {
   // If all the bits are one, this is dense!
   if (UsedBits.isAllOnesValue())
     return true;
 
   // Get rid of the unused bits on the right.
   APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
   // Get rid of the unused bits on the left.
   if (NarrowedUsedBits.countLeadingZeros())
     NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
   // Check that the chunk of bits is completely used.
   return NarrowedUsedBits.isAllOnesValue();
 }
 
 /// Check whether or not \p First and \p Second are next to each other
 /// in memory. This means that there is no hole between the bits loaded
 /// by \p First and the bits loaded by \p Second.
 static bool areSlicesNextToEachOther(const LoadedSlice &First,
                                      const LoadedSlice &Second) {
   assert(First.Origin == Second.Origin && First.Origin &&
          "Unable to match different memory origins.");
   APInt UsedBits = First.getUsedBits();
   assert((UsedBits & Second.getUsedBits()) == 0 &&
          "Slices are not supposed to overlap.");
   UsedBits |= Second.getUsedBits();
   return areUsedBitsDense(UsedBits);
 }
 
 /// Adjust \p GlobalLSCost according to the pairing capabilities of the
 /// target and the layout of the slices: every two consecutive slices that
 /// can be served by one paired load remove one load from the cost.
 /// The slices in \p LoadedSlices are sorted by offset as a side effect.
 /// \pre \p GlobalLSCost should account for at least as many loads as
 /// there are slices in \p LoadedSlices.
 static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
                                  LoadedSlice::Cost &GlobalLSCost) {
   const unsigned NumberOfSlices = LoadedSlices.size();
   // Pairing needs at least two slices.
   if (NumberOfSlices < 2)
     return;
 
   // Order the slices by offset so that slices likely to be neighbors in
   // memory are neighbors in the list.
   llvm::sort(LoadedSlices.begin(), LoadedSlices.end(),
              [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
     assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
     return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
   });
   const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
 
   // PairHead is the slice that may become the first half of a paired load;
   // it is null whenever a new pair has to be started.
   const LoadedSlice *PairHead = nullptr;
   for (unsigned Idx = 0; Idx != NumberOfSlices; ++Idx) {
     const LoadedSlice *Candidate = &LoadedSlices[Idx];
     // Unless decided otherwise below, the current slice becomes the head of
     // the next potential pair.
     const LoadedSlice *NextHead = Candidate;
 
     if (PairHead) {
       const EVT LoadedType = PairHead->getLoadedType();
       // Slices of different types cannot be paired.
       if (LoadedType == Candidate->getLoadedType()) {
         unsigned RequiredAlignment = 0;
         if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
           // No paired load for this type: start over with the next slice.
           NextHead = nullptr;
         } else if (RequiredAlignment <= PairHead->getAlignment() &&
                    areSlicesNextToEachOther(*PairHead, *Candidate)) {
           // Aligned enough and adjacent in memory: one load is saved.
           assert(GlobalLSCost.Loads > 0 &&
                  "We save more loads than we created!");
           --GlobalLSCost.Loads;
           // Both slices are consumed, start a new pair.
           NextHead = nullptr;
         }
       }
     }
 
     PairHead = NextHead;
   }
 }
 
 /// Decide whether slicing is worthwhile for the given \p LoadedSlices.
 /// It is currently considered profitable when (1) exactly two slices are
 /// involved, (2) they are next to each other in memory, and (3) their cost
 /// (\see LoadedSlice::Cost) is smaller than the cost of the original load.
 /// When StressLoadSlicing is set, any set of more than one slice qualifies.
 ///
 /// Note: The order of the elements in \p LoadedSlices may be modified, but not
 /// the elements themselves.
 ///
 /// FIXME: When the cost model will be mature enough, we can relax
 /// constraints (1) and (2).
 static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
                                 const APInt &UsedBits, bool ForCodeSize) {
   const unsigned NumberOfSlices = LoadedSlices.size();
   if (StressLoadSlicing)
     return NumberOfSlices > 1;
 
   // Constraints (1) and (2).
   if (NumberOfSlices != 2 || !areUsedBitsDense(UsedBits))
     return false;
 
   // Constraint (3): compare the two configurations.
   LoadedSlice::Cost OrigCost(ForCodeSize);
   LoadedSlice::Cost GlobalSlicingCost(ForCodeSize);
   // The original code has one big load.
   OrigCost.Loads = 1;
   for (const LoadedSlice &LS : LoadedSlices) {
     // The sliced form pays for every slice ...
     GlobalSlicingCost += LoadedSlice::Cost(LS, ForCodeSize);
     // ... while the original form pays for what each slice would save.
     OrigCost.addSliceGain(LS);
   }
 
   // Paired loads, when the target has them, make the sliced form cheaper.
   adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
   return OrigCost > GlobalSlicingCost;
 }
 
 /// If the value of the load \p N is used only by trunc or trunc(lshr)
 /// operations, split the load in the various pieces being extracted.
 ///
 /// This sort of thing is introduced by SROA.
 /// Overlapping slices abort the transformation, so no overlapping loads are
 /// inserted. Volatile loads, loads rejected by ISD::isNormalLoad and
 /// non-integer loads are left alone, and nothing is done before
 /// AfterLegalizeDAG.
 /// \return true if the load has been sliced and its users rewritten.
 bool DAGCombiner::SliceUpLoad(SDNode *N) {
   if (Level < AfterLegalizeDAG)
     return false;
 
   LoadSDNode *LD = cast<LoadSDNode>(N);
   const bool IsCandidate = !LD->isVolatile() && ISD::isNormalLoad(LD) &&
                            LD->getValueType(0).isInteger();
   if (!IsCandidate)
     return false;
 
   // Bits of the loaded value claimed so far. They are tracked to detect
   // overlapping slices, which abort the transformation.
   APInt UsedBits(LD->getValueSizeInBits(0), 0);
 
   SmallVector<LoadedSlice, 4> LoadedSlices;
 
   // Walk the users of the loaded value and record one slice per trunc or
   // trunc(lshr) chain. Any other kind of user aborts the transformation.
   for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
        UI != UE; ++UI) {
     // Uses of the chain are not interesting here.
     if (UI.getUse().getResNo() != 0)
       continue;
 
     SDNode *Consumer = *UI;
     unsigned ShiftAmt = 0;
 
     // Look through a single-use logical shift right by a constant.
     const bool IsConstShift = Consumer->getOpcode() == ISD::SRL &&
                               Consumer->hasOneUse() &&
                               isa<ConstantSDNode>(Consumer->getOperand(1));
     if (IsConstShift) {
       ShiftAmt = Consumer->getConstantOperandVal(1);
       Consumer = *Consumer->use_begin();
     }
 
     // Whether or not a shift was skipped, a truncate is expected now.
     if (Consumer->getOpcode() != ISD::TRUNCATE)
       return false;
 
     // The width of the type must be a power of 2 of at least 8 bits,
     // otherwise the load cannot be represented in LLVM IR.
     // Moreover, a shift that is not a multiple of 8 bits would make the
     // slice straddle bytes, which is not supported.
     const unsigned Width = Consumer->getValueSizeInBits(0);
     const bool WidthIsUsable = Width >= 8 && isPowerOf2_32(Width);
     if (!WidthIsUsable || (ShiftAmt % 8) != 0)
       return false;
 
     // Describe the slice for this chain of computations.
     LoadedSlice Slice(Consumer, LD, ShiftAmt, &DAG);
     const APInt SliceBits = Slice.getUsedBits();
 
     // Slices are not allowed to overlap.
     if ((SliceBits & UsedBits) != 0)
       return false;
     // Claim the bits of this slice.
     UsedBits |= SliceBits;
 
     // The slice must be expressible with legal operations.
     if (!Slice.isLegal())
       return false;
 
     LoadedSlices.push_back(Slice);
   }
 
   // Give up if slicing does not look profitable.
   if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
     return false;
 
   ++SlicedLoads;
 
   // Rewrite each chain to use an independent load.
   // By construction, each chain can be represented by a unique load.
 
   // Collect the chains of the new loads; they feed the token factor that
   // replaces the chain of the original load.
   SmallVector<SDValue, 8> ArgChains;
   for (const LoadedSlice &Slice : LoadedSlices) {
     SDValue NewValue = Slice.loadSlice();
     CombineTo(Slice.Inst, NewValue, true);
     // Step over the zero extension, if any, to reach the load itself.
     SDValue NewLoad = NewValue;
     if (NewLoad.getOpcode() != ISD::LOAD)
       NewLoad = NewLoad.getOperand(0);
     assert(NewLoad->getOpcode() == ISD::LOAD &&
            "It takes more than a zext to get to the loaded slice!!");
     ArgChains.push_back(NewLoad.getValue(1));
   }
 
   SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
                               ArgChains);
   DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
   AddToWorklist(Chain.getNode());
   return true;
 }
 
 /// Check whether \p V is (and (load \p Ptr), imm) where the immediate clears
 /// a contiguous, byte-aligned run of 1, 2 or 4 bytes of the loaded value.
 /// \return The pair (number of bytes masked out, byte offset of the first
 /// masked byte), or (0, 0) when \p V does not match.
 static std::pair<unsigned, unsigned>
 CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
   const std::pair<unsigned, unsigned> NoMatch(0, 0);
 
   // Structural match: an 'and' of a normal load with a constant.
   if (V->getOpcode() != ISD::AND)
     return NoMatch;
   if (!isa<ConstantSDNode>(V->getOperand(1)))
     return NoMatch;
   if (!ISD::isNormalLoad(V->getOperand(0).getNode()))
     return NoMatch;
 
   // The load must read from the pointer we were given.
   LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
   if (LD->getBasePtr() != Ptr)
     return NoMatch;
 
   // Only i16, i32 and i64 values are handled.
   const EVT VT = V.getValueType();
   const bool IsHandledType =
       VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64;
   if (!IsHandledType)
     return NoMatch;
 
   // Invert the constant mask so that the bits being masked out are 1 and the
   // bits being kept are 0. getSExtValue is used so that the leading bits
   // follow the sign bit for uniformity.
   uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
   unsigned NotMaskLZ = countLeadingZeros(NotMask);
   if ((NotMaskLZ % 8) != 0)
     return NoMatch; // Must be a multiple of a byte.
   unsigned NotMaskTZ = countTrailingZeros(NotMask);
   if ((NotMaskTZ % 8) != 0)
     return NoMatch; // Must be a multiple of a byte.
   if (NotMaskLZ == 64)
     return NoMatch; // Nothing is masked out.
 
   // The masked-out bits must form one continuous run, i.e. 0*1+0*.
   if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
     return NoMatch;
 
   // Express NotMaskLZ relative to the actual width of the value rather than
   // relative to i64.
   const unsigned ValueBits = V.getValueSizeInBits();
   if (VT != MVT::i64 && NotMaskLZ != 0)
     NotMaskLZ -= 64 - ValueBits;
 
   // Only 1, 2 and 4 masked bytes are accepted; this rejects, e.g., a mask
   // covering the whole value or a 5-byte mask.
   const unsigned MaskedBytes = (ValueBits - NotMaskLZ - NotMaskTZ) / 8;
   if (MaskedBytes != 1 && MaskedBytes != 2 && MaskedBytes != 4)
     return NoMatch;
 
   // The first masked bit must start at a multiple of the masked width, so
   // that the access is aligned the same as the access width.
   if (NotMaskTZ != 0 && (NotMaskTZ / 8) % MaskedBytes != 0)
     return NoMatch;
 
   // For narrowing to be valid, the load must be the memory operation that
   // immediately precedes the store.
   if (LD != Chain.getNode()) {
     // The only alternative accepted is a token factor that directly lists
     // the load, the load having a single chain use so that there are no
     // indirect dependencies.
     if (Chain->getOpcode() != ISD::TokenFactor ||
         !SDValue(LD, 1).hasOneUse())
       return NoMatch;
 
     bool FoundLoad = false;
     for (const SDValue &ChainOp : Chain->op_values()) {
       if (ChainOp.getNode() != LD)
         continue;
       FoundLoad = true;
       break;
     }
     if (!FoundLoad)
       return NoMatch;
   }
 
   return std::pair<unsigned, unsigned>(MaskedBytes, NotMaskTZ / 8);
 }
 
 /// Check whether \p IVal provides a value as specified by \p MaskInfo
 /// (number of bytes, byte offset). If so, create a narrower store of the
 /// truncated \p IVal meant to replace the store \p St.
 /// \return The new store node, or nullptr when no replacement is possible.
 static SDNode *
 ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
                                 SDValue IVal, StoreSDNode *St,
                                 DAGCombiner *DC) {
   const unsigned NumBytes = MaskInfo.first;
   const unsigned ByteShift = MaskInfo.second;
   const unsigned LoBit = ByteShift * 8;
   const unsigned HiBit = (ByteShift + NumBytes) * 8;
   SelectionDAG &DAG = DC->getDAG();
 
   // IVal must be known to be zero everywhere outside of the bytes being
   // inserted by the 'or' that uses it. If not, this is not a replacement.
   APInt InsertedBits =
       APInt::getBitsSet(IVal.getValueSizeInBits(), LoBit, HiBit);
   if (!DAG.MaskedValueIsZero(IVal, ~InsertedBits))
     return nullptr;
 
   // The type we are shrinking to (i8/i16/i32) must be accepted by
   // DC->isTypeLegal.
   const MVT NarrowVT = MVT::getIntegerVT(NumBytes * 8);
   if (!DC->isTypeLegal(NarrowVT))
     return nullptr;
 
   // Okay, we can do this! The new store writes IVal shifted right by
   // ByteShift bytes and truncated down to NumBytes bytes.
   if (ByteShift != 0) {
     SDLoc DL(IVal);
     const EVT WideVT = IVal.getValueType();
     SDValue ShiftAmt = DAG.getConstant(LoBit, DL, DC->getShiftAmountTy(WideVT));
     IVal = DAG.getNode(ISD::SRL, DL, WideVT, IVal, ShiftAmt);
   }
 
   // Byte offset of the narrow store; it depends on the endianness.
   unsigned StOffset = ByteShift;
   if (!DAG.getDataLayout().isLittleEndian())
     StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;
 
   // Address and alignment of the narrow access.
   unsigned NewAlign = St->getAlignment();
   SDValue Ptr = St->getBasePtr();
   if (StOffset != 0) {
     SDLoc DL(IVal);
     const EVT PtrVT = Ptr.getValueType();
     SDValue OffsetCst = DAG.getConstant(StOffset, DL, PtrVT);
     Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, OffsetCst);
     NewAlign = MinAlign(NewAlign, StOffset);
   }
 
   // Truncate down to the new size.
   IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), NarrowVT, IVal);
 
   ++OpsNarrowed;
   SDValue NewStore =
       DAG.getStore(St->getChain(), SDLoc(St), IVal, Ptr,
                    St->getPointerInfo().getWithOffset(StOffset), NewAlign);
   return NewStore.getNode();
 }
 
 /// Look for sequence of load / op / store where op is one of 'or', 'xor', and
 /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
 /// narrowing the load and store if it would end up being a win for performance
 /// or code size.
 ///
 /// \param N the store node to examine; it is cast to StoreSDNode.
 /// \returns the replacement store when a narrowing has been performed, or an
 /// empty SDValue when the pattern does not match or narrowing is rejected.
 SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
   StoreSDNode *ST  = cast<StoreSDNode>(N);
   // Volatile stores are never narrowed.
   if (ST->isVolatile())
     return SDValue();
 
   SDValue Chain = ST->getChain();
   SDValue Value = ST->getValue();
   SDValue Ptr   = ST->getBasePtr();
   EVT VT = Value.getValueType();
 
   // Only plain scalar stores whose value has no other user are considered.
   if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
     return SDValue();
 
   unsigned Opc = Value.getOpcode();
 
   // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
   // is a byte mask indicating a consecutive number of bytes, check to see if
   // Y is known to provide just those bytes.  If so, we try to replace the
   // load + replace + store sequence with a single (narrower) store, which makes
   // the load dead.
   if (Opc == ISD::OR) {
     std::pair<unsigned, unsigned> MaskedLoad;
     MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
     if (MaskedLoad.first)
       if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
                                                   Value.getOperand(1), ST,this))
         return SDValue(NewST, 0);
 
     // Or is commutative, so try swapping X and Y.
     MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
     if (MaskedLoad.first)
       if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
                                                   Value.getOperand(0), ST,this))
         return SDValue(NewST, 0);
   }
 
   // From here on, only "op X, constant" with op in {or, xor, and} is handled.
   if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
       Value.getOperand(1).getOpcode() != ISD::Constant)
     return SDValue();
 
   SDValue N0 = Value.getOperand(0);
   // X must be a normal load with a single use whose chain result is exactly
   // the chain of the store.
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
       Chain == SDValue(N0.getNode(), 1)) {
     LoadSDNode *LD = cast<LoadSDNode>(N0);
     // The load and the store must use the same pointer and address space.
     if (LD->getBasePtr() != Ptr ||
         LD->getPointerInfo().getAddrSpace() !=
         ST->getPointerInfo().getAddrSpace())
       return SDValue();
 
     // Find the type to narrow the load / op / store to.
     SDValue N1 = Value.getOperand(1);
     unsigned BitWidth = N1.getValueSizeInBits();
     APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
     // For 'and', the bits that may change are the zero bits of the constant,
     // so invert it: from here on, set bits of Imm mark the affected bits.
     if (Opc == ISD::AND)
       Imm ^= APInt::getAllOnesValue(BitWidth);
     // Nothing to narrow if no bit or every bit is affected.
     if (Imm == 0 || Imm.isAllOnesValue())
       return SDValue();
     // ShAmt is the index of the lowest affected bit, MSB the index of the
     // highest one.
     unsigned ShAmt = Imm.countTrailingZeros();
     unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
     unsigned NewBW = NextPowerOf2(MSB - ShAmt);
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     // The narrowing should be profitable, the load/store operation should be
     // legal (or custom) and the store size should be equal to the NewVT width.
     // Keep widening the candidate until it qualifies or reaches BitWidth.
     while (NewBW < BitWidth &&
            (NewVT.getStoreSizeInBits() != NewBW ||
             !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
             !TLI.isNarrowingProfitable(VT, NewVT))) {
       NewBW = NextPowerOf2(NewBW);
       NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     }
     // No narrower type qualified.
     if (NewBW >= BitWidth)
       return SDValue();
 
     // If the lsb changed does not start at the type bitwidth boundary,
     // start at the previous one.
     // (The expression rounds ShAmt down to a multiple of NewBW.)
     if (ShAmt % NewBW)
       ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
     // Mask is the NewBW-wide window starting at ShAmt, clamped to BitWidth.
     APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
                                    std::min(BitWidth, ShAmt + NewBW));
     // Only proceed if every affected bit lies inside that window.
     if ((Imm & Mask) == Imm) {
       APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
       // Undo the inversion applied above for 'and'.
       if (Opc == ISD::AND)
         NewImm ^= APInt::getAllOnesValue(NewBW);
       uint64_t PtrOff = ShAmt / 8;
       // For big endian targets, we need to adjust the offset to the pointer to
       // load the correct bytes.
       if (DAG.getDataLayout().isBigEndian())
         PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
 
       // Give up if the narrow access would be less aligned than the ABI
       // alignment of the new type.
       unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
       Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
       if (NewAlign < DAG.getDataLayout().getABITypeAlignment(NewVTTy))
         return SDValue();
 
       // Build the narrow load / op / store sequence at Ptr + PtrOff.
       SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LD),
                                    Ptr.getValueType(), Ptr,
                                    DAG.getConstant(PtrOff, SDLoc(LD),
                                                    Ptr.getValueType()));
       SDValue NewLD =
           DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
                       LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
                       LD->getMemOperand()->getFlags(), LD->getAAInfo());
       SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
                                    DAG.getConstant(NewImm, SDLoc(Value),
                                                    NewVT));
       // The new store is created on the old load's chain; the replacement
       // below redirects uses of that chain to the new load's chain.
       SDValue NewST =
           DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
                        ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
 
       AddToWorklist(NewPtr.getNode());
       AddToWorklist(NewLD.getNode());
       AddToWorklist(NewVal.getNode());
       WorklistRemover DeadNodes(*this);
       // Redirect every user of the old load's chain to the new load's chain.
       DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
       ++OpsNarrowed;
       return NewST;
     }
   }
 
   return SDValue();
 }
 
 /// For a given floating point load / store pair, if the load value isn't used
 /// by any other operations, then consider transforming the pair to integer
 /// load / store operations if the target deems the transformation profitable.
 ///
 /// The pair is only rewritten when the store's chain is the load's chain
 /// result, both accesses use the same floating point memory type, neither is
 /// volatile or non-temporal, both are in address space 0, and both are at
 /// least as aligned as the ABI alignment of the equivalent integer type.
 ///
 /// \param N the store node to examine; it is cast to StoreSDNode.
 /// \returns the replacement integer store, or an empty SDValue when the
 /// transformation does not apply.
 SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
   StoreSDNode *ST  = cast<StoreSDNode>(N);
   SDValue Chain = ST->getChain();
   SDValue Value = ST->getValue();
   if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) &&
       Value.hasOneUse() &&
       Chain == SDValue(Value.getNode(), 1)) {
     LoadSDNode *LD = cast<LoadSDNode>(Value);
     EVT VT = LD->getMemoryVT();
     // The replacement load and store below are created with default memory
     // operand flags. Rewriting a volatile access would therefore silently
     // drop its volatility, so volatile accesses are left untouched.
     if (!VT.isFloatingPoint() ||
         VT != ST->getMemoryVT() ||
         LD->isVolatile() ||
         ST->isVolatile() ||
         LD->isNonTemporal() ||
         ST->isNonTemporal() ||
         LD->getPointerInfo().getAddrSpace() != 0 ||
         ST->getPointerInfo().getAddrSpace() != 0)
       return SDValue();
 
     // The integer type of the same width must be loadable and storable, and
     // the target must want the transformation for both operations.
     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
     if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
         !TLI.isOperationLegal(ISD::STORE, IntVT) ||
         !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
         !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT))
       return SDValue();
 
     // Both accesses must satisfy the ABI alignment of the integer type.
     unsigned LDAlign = LD->getAlignment();
     unsigned STAlign = ST->getAlignment();
     Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
     unsigned ABIAlign = DAG.getDataLayout().getABITypeAlignment(IntVTTy);
     if (LDAlign < ABIAlign || STAlign < ABIAlign)
       return SDValue();
 
     SDValue NewLD =
         DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
                     LD->getPointerInfo(), LDAlign);
 
     SDValue NewST =
         DAG.getStore(NewLD.getValue(1), SDLoc(N), NewLD, ST->getBasePtr(),
                      ST->getPointerInfo(), STAlign);
 
     AddToWorklist(NewLD.getNode());
     AddToWorklist(NewST.getNode());
     WorklistRemover DeadNodes(*this);
     // Redirect every user of the old load's chain to the new load's chain.
     DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
     ++LdStFP2Int;
     return NewST;
   }
 
   return SDValue();
 }
 
 // This is a helper function for visitMUL to check the profitability
 // of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
 // MulNode is the original multiply, AddNode is (add x, c1),
 // and ConstNode is c2.
 //
 // If the (add x, c1) has multiple uses, we could increase
 // the number of adds if we make this transformation.
 // It would only be worth doing this if we can remove a
 // multiply in the process. Check for that here.
 // To illustrate:
 //     (A + c1) * c3
 //     (A + c2) * c3
 // We're checking for cases where we have common "c3 * A" expressions.
 bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
                                               SDValue &AddNode,
                                               SDValue &ConstNode) {
   APInt Val;
 
   // If the add only has one use, this would be OK to do.
   if (AddNode.getNode()->hasOneUse())
     return true;
 
   // Walk all the users of the constant with which we're multiplying.
   for (SDNode *Use : ConstNode->uses()) {
     if (Use == MulNode) // This use is the one we're on right now. Skip it.
       continue;
 
     if (Use->getOpcode() == ISD::MUL) { // We have another multiply use.
       SDNode *OtherOp;
       SDNode *MulVar = AddNode.getOperand(0).getNode();
 
       // OtherOp is what we're multiplying against the constant.
       if (Use->getOperand(0) == ConstNode)
         OtherOp = Use->getOperand(1).getNode();
       else
         OtherOp = Use->getOperand(0).getNode();
 
       // Check to see if multiply is with the same operand of our "add".
       //
       //     ConstNode  = CONST
       //     Use = ConstNode * A  <-- visiting Use. OtherOp is A.
       //     ...
       //     AddNode  = (A + c1)  <-- MulVar is A.
       //         = AddNode * ConstNode   <-- current visiting instruction.
       //
       // If we make this transformation, we will have a common
       // multiply (ConstNode * A) that we can save.
       if (OtherOp == MulVar)
         return true;
 
       // Now check to see if a future expansion will give us a common
       // multiply.
       //
       //     ConstNode  = CONST
       //     AddNode    = (A + c1)
       //     ...   = AddNode * ConstNode <-- current visiting instruction.
       //     ...
       //     OtherOp = (A + c2)
       //     Use     = OtherOp * ConstNode <-- visiting Use.
       //
       // If we make this transformation, we will have a common
       // multiply (CONST * A) after we also do the same transformation
       // to the "t2" instruction.
       if (OtherOp->getOpcode() == ISD::ADD &&
           DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
           OtherOp->getOperand(0).getNode() == MulVar)
         return true;
     }
   }
 
   // Didn't find a case where this would be profitable.
   return false;
 }
 
 SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
                                          unsigned NumStores) {
   SmallVector<SDValue, 8> Chains;
   SmallPtrSet<const SDNode *, 8> Visited;
   SDLoc StoreDL(StoreNodes[0].MemNode);
 
   for (unsigned i = 0; i < NumStores; ++i) {
     Visited.insert(StoreNodes[i].MemNode);
   }
 
   // don't include nodes that are children
   for (unsigned i = 0; i < NumStores; ++i) {
     if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0)
       Chains.push_back(StoreNodes[i].MemNode->getChain());
   }
 
   assert(Chains.size() > 0 && "Chain should have generated a chain");
   return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains);
 }
 
  /// Replace the first \p NumStores entries of \p StoreNodes with one wider
  /// store whose address, pointer info and alignment come from StoreNodes[0].
  ///
  /// \param StoreNodes    candidate stores; only the first \p NumStores are
  ///                      merged.
  /// \param MemVT         memory type of each individual store.
  /// \param NumStores     number of stores to merge; fewer than two is a no-op.
  /// \param IsConstantSrc the stored values are constants (otherwise they are
  ///                      treated as vector extracts).
  /// \param UseVector     build a vector value (BUILD_VECTOR / CONCAT_VECTORS)
  ///                      instead of a single wide integer constant.
  /// \param UseTrunc      emit the merged store as a truncating store of the
  ///                      constant extended to the legalized type.
  /// \returns true if the stores were replaced, false if nothing was changed.
  bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
      SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
      bool IsConstantSrc, bool UseVector, bool UseTrunc) {
    // Make sure we have something to merge.
    if (NumStores < 2)
      return false;

    // The latest Node in the DAG.
    SDLoc DL(StoreNodes[0].MemNode);

    int64_t ElementSizeBits = MemVT.getStoreSizeInBits();
    unsigned SizeInBits = NumStores * ElementSizeBits;
    unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;

    // Type of the value written by the merged store: a vector with
    // NumStores * NumMemElts elements, or an integer NumStores times as wide
    // as one element.
    EVT StoreTy;
    if (UseVector) {
      unsigned Elts = NumStores * NumMemElts;
      // Get the type for the merged vector store.
      StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
    } else
      StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);

    SDValue StoredVal;
    if (UseVector) {
      if (IsConstantSrc) {
        // Vector of constants: bring every constant to MemVT and collect them
        // in store order.
        SmallVector<SDValue, 8> BuildVector;
        for (unsigned I = 0; I != NumStores; ++I) {
          StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
          SDValue Val = St->getValue();
          // If constant is of the wrong type, convert it now.
          if (MemVT != Val.getValueType()) {
            Val = peekThroughBitcast(Val);
            // Deal with constants of wrong size.
            if (ElementSizeBits != Val.getValueSizeInBits()) {
              EVT IntMemVT =
                  EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
              if (isa<ConstantFPSDNode>(Val)) {
                // Not clear how to truncate FP values.
                return false;
              } else if (auto *C = dyn_cast<ConstantSDNode>(Val))
                Val = DAG.getConstant(C->getAPIntValue()
                                          .zextOrTrunc(Val.getValueSizeInBits())
                                          .zextOrTrunc(ElementSizeBits),
                                      SDLoc(C), IntMemVT);
            }
            // Make sure the correctly sized value also has the correct type.
            Val = DAG.getBitcast(MemVT, Val);
          }
          BuildVector.push_back(Val);
        }
        // Vector element stores are concatenated; scalar element stores
        // become the elements of a BUILD_VECTOR.
        StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
                                                 : ISD::BUILD_VECTOR,
                                DL, StoreTy, BuildVector);
      } else {
        SmallVector<SDValue, 8> Ops;
        for (unsigned i = 0; i < NumStores; ++i) {
          StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
          SDValue Val = peekThroughBitcast(St->getValue());
          // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of
          // type MemVT. If the underlying value is not the correct
          // type, but it is an extraction of an appropriate vector we
          // can recast Val to be of the correct type. This may require
          // converting between EXTRACT_VECTOR_ELT and
          // EXTRACT_SUBVECTOR.
          if ((MemVT != Val.getValueType()) &&
              (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
               Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) {
            SDValue Vec = Val.getOperand(0);
            EVT MemVTScalarTy = MemVT.getScalarType();
            SDValue Idx = Val.getOperand(1);
            // We may need to add a bitcast here to get types to line up.
            if (MemVTScalarTy != Vec.getValueType()) {
              // Number of MemVTScalarTy-sized elements that fit in the source
              // vector.
              unsigned Elts = Vec.getValueType().getSizeInBits() /
                              MemVTScalarTy.getSizeInBits();
              // For a subvector extract, rescale the constant index to the
              // element count of the recast vector.
              if (Val.getValueType().isVector() && MemVT.isVector()) {
                unsigned IdxC = cast<ConstantSDNode>(Idx)->getZExtValue();
                unsigned NewIdx =
                    ((uint64_t)IdxC * MemVT.getVectorNumElements()) / Elts;
                Idx = DAG.getConstant(NewIdx, SDLoc(Val), Idx.getValueType());
              }
              EVT NewVecTy =
                  EVT::getVectorVT(*DAG.getContext(), MemVTScalarTy, Elts);
              Vec = DAG.getBitcast(NewVecTy, Vec);
            }
            // Re-extract with the opcode that matches MemVT.
            auto OpC = (MemVT.isVector()) ? ISD::EXTRACT_SUBVECTOR
                                          : ISD::EXTRACT_VECTOR_ELT;
            Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Idx);
          }
          Ops.push_back(Val);
        }

        // Build the extracted vector elements back into a vector.
        StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
                                                 : ISD::BUILD_VECTOR,
                                DL, StoreTy, Ops);
      }
    } else {
      // We should always use a vector store when merging extracted vector
      // elements, so this path implies a store of constants.
      assert(IsConstantSrc && "Merged vector elements should use vector store");

      APInt StoreInt(SizeInBits, 0);

      // Construct a single integer constant which is made of the smaller
      // constant inputs. Each iteration shifts the accumulated value left by
      // one element and ORs the next constant into the low bits, so on
      // little-endian targets the stores are visited last-to-first and
      // StoreNodes[0] ends up in the least significant bits.
      bool IsLE = DAG.getDataLayout().isLittleEndian();
      for (unsigned i = 0; i < NumStores; ++i) {
        unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
        StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode);

        SDValue Val = St->getValue();
        Val = peekThroughBitcast(Val);
        StoreInt <<= ElementSizeBits;
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
          StoreInt |= C->getAPIntValue()
                          .zextOrTrunc(ElementSizeBits)
                          .zextOrTrunc(SizeInBits);
        } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
          StoreInt |= C->getValueAPF()
                          .bitcastToAPInt()
                          .zextOrTrunc(ElementSizeBits)
                          .zextOrTrunc(SizeInBits);
          // If fp truncation is necessary give up for now.
          if (MemVT.getSizeInBits() != ElementSizeBits)
            return false;
        } else {
          llvm_unreachable("Invalid constant element type");
        }
      }

      // Materialize the combined constant that the merged store will write.
      StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
    }

    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
    SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);

    // make sure we use trunc store if it's necessary to be legal.
    // NOTE(review): the plain store below is created without the memory
    // operand flags of FirstInChain, while the truncating store forwards
    // them - confirm whether this difference is intentional.
    SDValue NewStore;
    if (!UseTrunc) {
      NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
                              FirstInChain->getPointerInfo(),
                              FirstInChain->getAlignment());
    } else { // Must be realized as a trunc store
      EVT LegalizedStoredValTy =
          TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
      unsigned LegalizedStoreSize = LegalizedStoredValTy.getSizeInBits();
      // NOTE(review): this cast requires StoredVal to be a ConstantSDNode, so
      // the path appears to assume UseVector is false - confirm that callers
      // never request both.
      ConstantSDNode *C = cast<ConstantSDNode>(StoredVal);
      SDValue ExtendedStoreVal =
          DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL,
                          LegalizedStoredValTy);
      NewStore = DAG.getTruncStore(
          NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
          FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/,
          FirstInChain->getAlignment(),
          FirstInChain->getMemOperand()->getFlags());
    }

    // Replace all merged stores with the new store.
    for (unsigned i = 0; i < NumStores; ++i)
      CombineTo(StoreNodes[i].MemNode, NewStore);

    AddToWorklist(NewChain.getNode());
    return true;
  }
 
 void DAGCombiner::getStoreMergeCandidates(
     StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes,
     SDNode *&RootNode) {
   // This holds the base pointer, index, and the offset in bytes from the base
   // pointer.
   BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
   EVT MemVT = St->getMemoryVT();
 
   SDValue Val = peekThroughBitcast(St->getValue());
   // We must have a base and an offset.
   if (!BasePtr.getBase().getNode())
     return;
 
   // Do not handle stores to undef base pointers.
   if (BasePtr.getBase().isUndef())
     return;
 
   bool IsConstantSrc = isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val);
   bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
                           Val.getOpcode() == ISD::EXTRACT_SUBVECTOR);
   bool IsLoadSrc = isa<LoadSDNode>(Val);
   BaseIndexOffset LBasePtr;
   // Match on loadbaseptr if relevant.
   EVT LoadVT;
   if (IsLoadSrc) {
     auto *Ld = cast<LoadSDNode>(Val);
     LBasePtr = BaseIndexOffset::match(Ld, DAG);
     LoadVT = Ld->getMemoryVT();
     // Load and store should be the same type.
     if (MemVT != LoadVT)
       return;
     // Loads must only have one use.
     if (!Ld->hasNUsesOfValue(1, 0))
       return;
     // The memory operands must not be volatile.
     if (Ld->isVolatile() || Ld->isIndexed())
       return;
   }
   auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
                             int64_t &Offset) -> bool {
     if (Other->isVolatile() || Other->isIndexed())
       return false;
     SDValue Val = peekThroughBitcast(Other->getValue());
     // Allow merging constants of different types as integers.
     bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
                                            : Other->getMemoryVT() != MemVT;
     if (IsLoadSrc) {
       if (NoTypeMatch)
         return false;
       // The Load's Base Ptr must also match
       if (LoadSDNode *OtherLd = dyn_cast<LoadSDNode>(Val)) {
         auto LPtr = BaseIndexOffset::match(OtherLd, DAG);
         if (LoadVT != OtherLd->getMemoryVT())
           return false;
         // Loads must only have one use.
         if (!OtherLd->hasNUsesOfValue(1, 0))
           return false;
         // The memory operands must not be volatile.
         if (OtherLd->isVolatile() || OtherLd->isIndexed())
           return false;
         if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
           return false;
       } else
         return false;
     }
     if (IsConstantSrc) {
       if (NoTypeMatch)
         return false;
       if (!(isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val)))
         return false;
     }
     if (IsExtractVecSrc) {
       // Do not merge truncated stores here.
       if (Other->isTruncatingStore())
         return false;
       if (!MemVT.bitsEq(Val.getValueType()))
         return false;
       if (Val.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
           Val.getOpcode() != ISD::EXTRACT_SUBVECTOR)
         return false;
     }
     Ptr = BaseIndexOffset::match(Other, DAG);
     return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
   };
 
   // We looking for a root node which is an ancestor to all mergable
   // stores. We search up through a load, to our root and then down
   // through all children. For instance we will find Store{1,2,3} if
   // St is Store1, Store2. or Store3 where the root is not a load
   // which always true for nonvolatile ops. TODO: Expand
   // the search to find all valid candidates through multiple layers of loads.
   //
   // Root
   // |-------|-------|
   // Load    Load    Store3
   // |       |
   // Store1   Store2
   //
   // FIXME: We should be able to climb and
   // descend TokenFactors to find candidates as well.
 
   RootNode = St->getChain().getNode();
 
   if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
     RootNode = Ldn->getChain().getNode();
     for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
       if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
         for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
           if (I2.getOperandNo() == 0)
             if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) {
               BaseIndexOffset Ptr;
               int64_t PtrDiff;
               if (CandidateMatch(OtherST, Ptr, PtrDiff))
                 StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
             }
   } else
     for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
       if (I.getOperandNo() == 0)
         if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
           BaseIndexOffset Ptr;
           int64_t PtrDiff;
           if (CandidateMatch(OtherST, Ptr, PtrDiff))
             StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
         }
 }
 
 // We need to check that merging these stores does not cause a loop in
 // the DAG. Any store candidate may depend on another candidate
 // indirectly through its operand (we already consider dependencies
 // through the chain). Check in parallel by searching up from
 // non-chain operands of candidates.
 bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
     SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
     SDNode *RootNode) {
   // FIXME: We should be able to truncate a full search of
   // predecessors by doing a BFS and keeping tabs the originating
   // stores from which worklist nodes come from in a similar way to
   // TokenFactor simplfication.
 
   SmallPtrSet<const SDNode *, 32> Visited;
   SmallVector<const SDNode *, 8> Worklist;
 
   // RootNode is a predecessor to all candidates so we need not search
   // past it. Add RootNode (peeking through TokenFactors). Do not count
   // these towards size check.
 
   Worklist.push_back(RootNode);
   while (!Worklist.empty()) {
     auto N = Worklist.pop_back_val();
     if (N->getOpcode() == ISD::TokenFactor) {
       for (SDValue Op : N->ops())
         Worklist.push_back(Op.getNode());
     }
     Visited.insert(N);
   }
 
   // Don't count pruning nodes towards max.
   unsigned int Max = 1024 + Visited.size();
   // Search Ops of store candidates.
   for (unsigned i = 0; i < NumStores; ++i) {
     SDNode *N = StoreNodes[i].MemNode;
     // Of the 4 Store Operands:
     //   * Chain (Op 0) -> We have already considered these
     //                    in candidate selection and can be
     //                    safely ignored
     //   * Value (Op 1) -> Cycles may happen (e.g. through load chains)
     //   * Address (Op 2) -> Merged addresses may only vary by a fixed constant
     //                      and so no cycles are possible.
     //   * (Op 3) -> appears to always be undef. Cannot be source of cycle.
     //
     // Thus we need only check predecessors of the value operands.
     auto *Op = N->getOperand(1).getNode();
     if (Visited.insert(Op).second)
       Worklist.push_back(Op);
   }
   // Search through DAG. We can stop early if we find a store node.
   for (unsigned i = 0; i < NumStores; ++i)
     if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
                                      Max))
       return false;
   return true;
 }
 
 bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
   if (OptLevel == CodeGenOpt::None)
     return false;
 
   EVT MemVT = St->getMemoryVT();
   int64_t ElementSizeBytes = MemVT.getStoreSize();
   unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
 
   if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
     return false;
 
   bool NoVectors = DAG.getMachineFunction().getFunction().hasFnAttribute(
       Attribute::NoImplicitFloat);
 
   // This function cannot currently deal with non-byte-sized memory sizes.
   if (ElementSizeBytes * 8 != MemVT.getSizeInBits())
     return false;
 
   if (!MemVT.isSimple())
     return false;
 
   // Perform an early exit check. Do not bother looking at stored values that
   // are not constants, loads, or extracted vector elements.
   SDValue StoredVal = peekThroughBitcast(St->getValue());
   bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
   bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
                        isa<ConstantFPSDNode>(StoredVal);
   bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
                           StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);
 
   if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
     return false;
 
   SmallVector<MemOpLink, 8> StoreNodes;
   SDNode *RootNode;
   // Find potential store merge candidates by searching through chain sub-DAG
   getStoreMergeCandidates(St, StoreNodes, RootNode);
 
   // Check if there is anything to merge.
   if (StoreNodes.size() < 2)
     return false;
 
   // Sort the memory operands according to their distance from the
   // base pointer.
   llvm::sort(StoreNodes.begin(), StoreNodes.end(),
              [](MemOpLink LHS, MemOpLink RHS) {
                return LHS.OffsetFromBase < RHS.OffsetFromBase;
              });
 
   // Store Merge attempts to merge the lowest stores. This generally
   // works out as if successful, as the remaining stores are checked
   // after the first collection of stores is merged. However, in the
   // case that a non-mergeable store is found first, e.g., {p[-2],
   // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent
   // mergeable cases. To prevent this, we prune such stores from the
   // front of StoreNodes here.
 
   bool RV = false;
   while (StoreNodes.size() > 1) {
     unsigned StartIdx = 0;
     while ((StartIdx + 1 < StoreNodes.size()) &&
            StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
                StoreNodes[StartIdx + 1].OffsetFromBase)
       ++StartIdx;
 
     // Bail if we don't have enough candidates to merge.
     if (StartIdx + 1 >= StoreNodes.size())
       return RV;
 
     if (StartIdx)
       StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);
 
     // Scan the memory operations on the chain and find the first
     // non-consecutive store memory address.
     unsigned NumConsecutiveStores = 1;
     int64_t StartAddress = StoreNodes[0].OffsetFromBase;
     // Check that the addresses are consecutive starting from the second
     // element in the list of stores.
     for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
       int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
       if (CurrAddress - StartAddress != (ElementSizeBytes * i))
         break;
       NumConsecutiveStores = i + 1;
     }
 
     if (NumConsecutiveStores < 2) {
       StoreNodes.erase(StoreNodes.begin(),
                        StoreNodes.begin() + NumConsecutiveStores);
       continue;
     }
 
     // The node with the lowest store address.
     LLVMContext &Context = *DAG.getContext();
     const DataLayout &DL = DAG.getDataLayout();
 
     // Store the constants into memory as one consecutive store.
     if (IsConstantSrc) {
       while (NumConsecutiveStores >= 2) {
         LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
         unsigned FirstStoreAS = FirstInChain->getAddressSpace();
         unsigned FirstStoreAlign = FirstInChain->getAlignment();
         unsigned LastLegalType = 1;
         unsigned LastLegalVectorType = 1;
         bool LastIntegerTrunc = false;
         bool NonZero = false;
         unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
         for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
           StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
           SDValue StoredVal = ST->getValue();
           bool IsElementZero = false;
           if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
             IsElementZero = C->isNullValue();
           else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
             IsElementZero = C->getConstantFPValue()->isNullValue();
           if (IsElementZero) {
             if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
               FirstZeroAfterNonZero = i;
           }
           NonZero |= !IsElementZero;
 
           // Find a legal type for the constant store.
           unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
           EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
           bool IsFast = false;
 
           // Break early when size is too large to be legal.
           if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
             break;
 
           if (TLI.isTypeLegal(StoreTy) &&
               TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
               TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
                                      FirstStoreAlign, &IsFast) &&
               IsFast) {
             LastIntegerTrunc = false;
             LastLegalType = i + 1;
             // Or check whether a truncstore is legal.
           } else if (TLI.getTypeAction(Context, StoreTy) ==
                      TargetLowering::TypePromoteInteger) {
             EVT LegalizedStoredValTy =
                 TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
             if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
                 TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
                 TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
                                        FirstStoreAlign, &IsFast) &&
                 IsFast) {
               LastIntegerTrunc = true;
               LastLegalType = i + 1;
             }
           }
 
           // We only use vectors if the constant is known to be zero or the
           // target allows it and the function is not marked with the
           // noimplicitfloat attribute.
           if ((!NonZero ||
                TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
               !NoVectors) {
             // Find a legal type for the vector store.
             unsigned Elts = (i + 1) * NumMemElts;
             EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
             if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
                 TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
                 TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
                                        FirstStoreAlign, &IsFast) &&
                 IsFast)
               LastLegalVectorType = i + 1;
           }
         }
 
         bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
         unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
 
         // Check if we found a legal integer type that creates a meaningful
         // merge.
         if (NumElem < 2) {
           // We know that candidate stores are in order and of correct
           // shape. While there is no mergeable sequence from the
           // beginning one may start later in the sequence. The only
           // reason a merge of size N could have failed where another of
           // the same size would not have, is if the alignment has
           // improved or we've dropped a non-zero value. Drop as many
           // candidates as we can here.
           unsigned NumSkip = 1;
           while (
               (NumSkip < NumConsecutiveStores) &&
               (NumSkip < FirstZeroAfterNonZero) &&
               (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
             NumSkip++;
 
           StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
           NumConsecutiveStores -= NumSkip;
           continue;
         }
 
         // Check that we can merge these candidates without causing a cycle.
         if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
                                                       RootNode)) {
           StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
           NumConsecutiveStores -= NumElem;
           continue;
         }
 
         RV |= MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, true,
                                               UseVector, LastIntegerTrunc);
 
         // Remove merged stores for next iteration.
         StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
         NumConsecutiveStores -= NumElem;
       }
       continue;
     }
 
     // When extracting multiple vector elements, try to store them
     // in one vector store rather than a sequence of scalar stores.
     if (IsExtractVecSrc) {
       // Loop on Consecutive Stores on success.
       while (NumConsecutiveStores >= 2) {
         LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
         unsigned FirstStoreAS = FirstInChain->getAddressSpace();
         unsigned FirstStoreAlign = FirstInChain->getAlignment();
         unsigned NumStoresToMerge = 1;
         for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
           // Find a legal type for the vector store.
           unsigned Elts = (i + 1) * NumMemElts;
           EVT Ty =
               EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
           bool IsFast;
 
           // Break early when size is too large to be legal.
           if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
             break;
 
           if (TLI.isTypeLegal(Ty) &&
               TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
               TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
                                      FirstStoreAlign, &IsFast) &&
               IsFast)
             NumStoresToMerge = i + 1;
         }
 
         // Check if we found a legal integer type creating a meaningful
         // merge.
         if (NumStoresToMerge < 2) {
           // We know that candidate stores are in order and of correct
           // shape. While there is no mergeable sequence from the
           // beginning one may start later in the sequence. The only
           // reason a merge of size N could have failed where another of
           // the same size would not have, is if the alignment has
           // improved. Drop as many candidates as we can here.
           unsigned NumSkip = 1;
           while (
               (NumSkip < NumConsecutiveStores) &&
               (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
             NumSkip++;
 
           StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
           NumConsecutiveStores -= NumSkip;
           continue;
         }
 
         // Check that we can merge these candidates without causing a cycle.
         if (!checkMergeStoreCandidatesForDependencies(
                 StoreNodes, NumStoresToMerge, RootNode)) {
           StoreNodes.erase(StoreNodes.begin(),
                            StoreNodes.begin() + NumStoresToMerge);
           NumConsecutiveStores -= NumStoresToMerge;
           continue;
         }
 
         RV |= MergeStoresOfConstantsOrVecElts(
             StoreNodes, MemVT, NumStoresToMerge, false, true, false);
 
         StoreNodes.erase(StoreNodes.begin(),
                          StoreNodes.begin() + NumStoresToMerge);
         NumConsecutiveStores -= NumStoresToMerge;
       }
       continue;
     }
 
     // Below we handle the case of multiple consecutive stores that
     // come from multiple consecutive loads. We merge them into a single
     // wide load and a single wide store.
 
     // Look for load nodes which are used by the stored values.
     SmallVector<MemOpLink, 8> LoadNodes;
 
     // Find acceptable loads. Loads need to have the same chain (token factor),
     // must not be zext, volatile, indexed, and they must be consecutive.
     BaseIndexOffset LdBasePtr;
 
     for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
       StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
       SDValue Val = peekThroughBitcast(St->getValue());
       LoadSDNode *Ld = cast<LoadSDNode>(Val);
 
       BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
       // If this is not the first ptr that we check.
       int64_t LdOffset = 0;
       if (LdBasePtr.getBase().getNode()) {
         // The base ptr must be the same.
         if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
           break;
       } else {
         // Check that all other base pointers are the same as this one.
         LdBasePtr = LdPtr;
       }
 
       // We found a potential memory operand to merge.
       LoadNodes.push_back(MemOpLink(Ld, LdOffset));
     }
 
     while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) {
       // If we have load/store pair instructions and we only have two values,
       // don't bother merging.
       unsigned RequiredAlignment;
       if (LoadNodes.size() == 2 &&
           TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
           StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) {
         StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
         LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2);
         break;
       }
       LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
       unsigned FirstStoreAS = FirstInChain->getAddressSpace();
       unsigned FirstStoreAlign = FirstInChain->getAlignment();
       LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
       unsigned FirstLoadAS = FirstLoad->getAddressSpace();
       unsigned FirstLoadAlign = FirstLoad->getAlignment();
 
       // Scan the memory operations on the chain and find the first
       // non-consecutive load memory address. These variables hold the index in
       // the store node array.
 
       unsigned LastConsecutiveLoad = 1;
 
       // This variable refers to the size and not index in the array.
       unsigned LastLegalVectorType = 1;
       unsigned LastLegalIntegerType = 1;
       bool isDereferenceable = true;
       bool DoIntegerTruncate = false;
       StartAddress = LoadNodes[0].OffsetFromBase;
       SDValue FirstChain = FirstLoad->getChain();
       for (unsigned i = 1; i < LoadNodes.size(); ++i) {
         // All loads must share the same chain.
         if (LoadNodes[i].MemNode->getChain() != FirstChain)
           break;
 
         int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
         if (CurrAddress - StartAddress != (ElementSizeBytes * i))
           break;
         LastConsecutiveLoad = i;
 
         if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
           isDereferenceable = false;
 
         // Find a legal type for the vector store.
         unsigned Elts = (i + 1) * NumMemElts;
         EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
 
         // Break early when size is too large to be legal.
         if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
           break;
 
         bool IsFastSt, IsFastLd;
         if (TLI.isTypeLegal(StoreTy) &&
             TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
             TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
                                    FirstStoreAlign, &IsFastSt) &&
             IsFastSt &&
             TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
                                    FirstLoadAlign, &IsFastLd) &&
             IsFastLd) {
           LastLegalVectorType = i + 1;
         }
 
         // Find a legal type for the integer store.
         unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
         StoreTy = EVT::getIntegerVT(Context, SizeInBits);
         if (TLI.isTypeLegal(StoreTy) &&
             TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
             TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
                                    FirstStoreAlign, &IsFastSt) &&
             IsFastSt &&
             TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
                                    FirstLoadAlign, &IsFastLd) &&
             IsFastLd) {
           LastLegalIntegerType = i + 1;
           DoIntegerTruncate = false;
           // Or check whether a truncstore and extload is legal.
         } else if (TLI.getTypeAction(Context, StoreTy) ==
                    TargetLowering::TypePromoteInteger) {
           EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy);
           if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
               TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
               TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy,
                                  StoreTy) &&
               TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy,
                                  StoreTy) &&
               TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
               TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
                                      FirstStoreAlign, &IsFastSt) &&
               IsFastSt &&
               TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
                                      FirstLoadAlign, &IsFastLd) &&
               IsFastLd) {
             LastLegalIntegerType = i + 1;
             DoIntegerTruncate = true;
           }
         }
       }
 
       // Only use vector types if the vector type is larger than the integer
       // type. If they are the same, use integers.
       bool UseVectorTy =
           LastLegalVectorType > LastLegalIntegerType && !NoVectors;
       unsigned LastLegalType =
           std::max(LastLegalVectorType, LastLegalIntegerType);
 
       // We add +1 here because the LastXXX variables refer to location while
       // the NumElem refers to array/index size.
       unsigned NumElem =
           std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
       NumElem = std::min(LastLegalType, NumElem);
 
       if (NumElem < 2) {
         // We know that candidate stores are in order and of correct
         // shape. While there is no mergeable sequence from the
         // beginning one may start later in the sequence. The only
         // reason a merge of size N could have failed where another of
         // the same size would not have is if the alignment or either
         // the load or store has improved. Drop as many candidates as we
         // can here.
         unsigned NumSkip = 1;
         while ((NumSkip < LoadNodes.size()) &&
                (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
                (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
           NumSkip++;
         StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
         LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip);
         NumConsecutiveStores -= NumSkip;
         continue;
       }
 
       // Check that we can merge these candidates without causing a cycle.
       if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
                                                     RootNode)) {
         StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
         LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
         NumConsecutiveStores -= NumElem;
         continue;
       }
 
       // Find if it is better to use vectors or integers to load and store
       // to memory.
       EVT JointMemOpVT;
       if (UseVectorTy) {
         // Find a legal type for the vector store.
         unsigned Elts = NumElem * NumMemElts;
         JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
       } else {
         unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
         JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
       }
 
       SDLoc LoadDL(LoadNodes[0].MemNode);
       SDLoc StoreDL(StoreNodes[0].MemNode);
 
       // The merged loads are required to have the same incoming chain, so
       // using the first's chain is acceptable.
 
       SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
       AddToWorklist(NewStoreChain.getNode());
 
       MachineMemOperand::Flags MMOFlags =
           isDereferenceable ? MachineMemOperand::MODereferenceable
                             : MachineMemOperand::MONone;
 
       SDValue NewLoad, NewStore;
       if (UseVectorTy || !DoIntegerTruncate) {
         NewLoad =
             DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
                         FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),
                         FirstLoadAlign, MMOFlags);
         NewStore = DAG.getStore(
             NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
             FirstInChain->getPointerInfo(), FirstStoreAlign);
       } else { // This must be the truncstore/extload case
         EVT ExtendedTy =
             TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
         NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
                                  FirstLoad->getChain(), FirstLoad->getBasePtr(),
                                  FirstLoad->getPointerInfo(), JointMemOpVT,
                                  FirstLoadAlign, MMOFlags);
         NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
                                      FirstInChain->getBasePtr(),
                                      FirstInChain->getPointerInfo(),
                                      JointMemOpVT, FirstInChain->getAlignment(),
                                      FirstInChain->getMemOperand()->getFlags());
       }
 
       // Transfer chain users from old loads to the new load.
       for (unsigned i = 0; i < NumElem; ++i) {
         LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
         DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
                                       SDValue(NewLoad.getNode(), 1));
       }
 
       // Replace the all stores with the new store. Recursively remove
       // corresponding value if its no longer used.
       for (unsigned i = 0; i < NumElem; ++i) {
         SDValue Val = StoreNodes[i].MemNode->getOperand(1);
         CombineTo(StoreNodes[i].MemNode, NewStore);
         if (Val.getNode()->use_empty())
           recursivelyDeleteUnusedNodes(Val.getNode());
       }
 
       RV = true;
       StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
       LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
       NumConsecutiveStores -= NumElem;
     }
   }
   return RV;
 }
 
 /// Re-create store \p ST on top of \p BetterChain and replace \p ST with a
 /// TokenFactor that joins the original chain with the re-created store.
 ///
 /// \param ST          The store whose chain operand is being replaced.
 /// \param BetterChain The chain the re-created store should use.
 /// \returns the result of CombineTo(ST, <token>, false).
 SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
   SDLoc DL(ST);
 
   // Rebuild the store on the new chain, keeping its value, address and
   // memory operand (plus its memory type when it is a truncating store).
   SDValue NewStore =
       ST->isTruncatingStore()
           ? DAG.getTruncStore(BetterChain, DL, ST->getValue(),
                               ST->getBasePtr(), ST->getMemoryVT(),
                               ST->getMemOperand())
           : DAG.getStore(BetterChain, DL, ST->getValue(), ST->getBasePtr(),
                          ST->getMemOperand());
 
   // Join the original chain and the new store so both nodes stay around.
   SDValue Joined = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                                ST->getChain(), NewStore);
 
   // Queue the token so the new and old chains get cleaned up.
   AddToWorklist(Joined.getNode());
 
   // Replace ST with the token; users are deliberately not added to the
   // worklist (last argument is false).
   return CombineTo(ST, Joined, false);
 }
 
 /// Try to replace a store of a floating-point constant with a store of the
 /// integer constant that has the same bit pattern.
 ///
 /// The caller is expected to pass a store whose value is a ConstantFPSDNode
 /// (the value is cast<> to that type below). f32 becomes an i32 store, f64
 /// becomes an i64 store or, failing that, a pair of i32 stores. Other FP
 /// types are left alone.
 ///
 /// \returns the replacement chain value, or an empty SDValue if nothing was
 /// changed.
 SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
   SDValue StoredVal = ST->getValue();
   if (StoredVal.getOpcode() == ISD::TargetConstantFP)
     return SDValue();
 
   SDLoc DL(ST);
   SDValue Chain = ST->getChain();
   SDValue Ptr = ST->getBasePtr();
   const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(StoredVal);
 
   // NOTE: If the original store is volatile, this transform must not increase
   // the number of stores.  For example, on x86-32 an f64 can be stored in one
   // processor operation but an i64 (which is not legal) requires two.  So the
   // transform should not be done in this case.
 
   switch (CFP->getSimpleValueType(0).SimpleTy) {
   default:
     llvm_unreachable("Unknown FP type");
   case MVT::f16:    // We don't do this for these yet.
   case MVT::f80:
   case MVT::f128:
   case MVT::ppcf128:
     return SDValue();
   case MVT::f32: {
     const bool StoreAsI32 =
         (isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) ||
         TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32);
     if (!StoreAsI32)
       return SDValue();
 
     // Store the 32-bit pattern of the float as an integer constant.
     SDValue Bits = DAG.getConstant(
         (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(),
         SDLoc(CFP), MVT::i32);
     return DAG.getStore(Chain, DL, Bits, Ptr, ST->getMemOperand());
   }
   case MVT::f64: {
     const bool StoreAsI64 =
         (TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
          !ST->isVolatile()) ||
         TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64);
     if (StoreAsI64) {
       SDValue Bits =
           DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().getZExtValue(),
                           SDLoc(CFP), MVT::i64);
       return DAG.getStore(Chain, DL, Bits, Ptr, ST->getMemOperand());
     }
 
     // The two-store fallback needs a usable i32 store and must not be applied
     // to a volatile store (see the NOTE above).
     if (ST->isVolatile() ||
         !TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32))
       return SDValue();
 
     // Many FP stores are not made apparent until after legalize, e.g. for
     // argument passing.  Since this is so common, custom legalize the
     // 64-bit integer store into two 32-bit stores.
     const uint64_t Bits64 =
         CFP->getValueAPF().bitcastToAPInt().getZExtValue();
     SDValue FirstHalf =
         DAG.getConstant(Bits64 & 0xFFFFFFFF, SDLoc(CFP), MVT::i32);
     SDValue SecondHalf = DAG.getConstant(Bits64 >> 32, SDLoc(CFP), MVT::i32);
     // On big-endian targets the high word lives at the lower address.
     if (DAG.getDataLayout().isBigEndian())
       std::swap(FirstHalf, SecondHalf);
 
     const unsigned FirstAlign = ST->getAlignment();
     MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
     AAMDNodes AAInfo = ST->getAAInfo();
 
     SDValue St0 = DAG.getStore(Chain, DL, FirstHalf, Ptr,
                                ST->getPointerInfo(), FirstAlign, MMOFlags,
                                AAInfo);
 
     // Second word goes 4 bytes further on; its alignment is the minimum of
     // the original alignment and that offset.
     EVT PtrVT = Ptr.getValueType();
     Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr,
                       DAG.getConstant(4, DL, PtrVT));
     const unsigned SecondAlign = MinAlign(FirstAlign, 4U);
     SDValue St1 = DAG.getStore(Chain, DL, SecondHalf, Ptr,
                                ST->getPointerInfo().getWithOffset(4),
                                SecondAlign, MMOFlags, AAInfo);
     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1);
   }
   }
 }
 
 /// Combine a STORE node.
 ///
 /// The folds below are attempted in order and the first one that applies
 /// returns:
 ///   - store of a bitcast          -> store of the bitcast's input
 ///   - store of undef              -> the incoming chain
 ///   - alignment refinement        (updates N in place, does not return)
 ///   - FP load/store pair          -> integer load/store pair
 ///   - chain improvement via findBetterNeighborChains
 ///   - truncating-store input simplification using demanded bits
 ///   - store of a value just loaded from the same location -> chain
 ///   - store directly after a store to the same location
 ///   - store of FP_ROUND/TRUNCATE  -> truncating store
 ///   - merging of consecutive stores
 ///   - pre/post-indexed store formation
 ///   - store of an FP constant     -> store of an integer constant
 ///   - splitting of a merged-value store
 ///   - ReduceLoadOpStoreWidth
 ///
 /// \returns the replacement value, SDValue(N, 0) when N (or the DAG around
 /// it) was updated in place, or an empty SDValue when nothing changed.
 SDValue DAGCombiner::visitSTORE(SDNode *N) {
   StoreSDNode *ST  = cast<StoreSDNode>(N);
   SDValue Chain = ST->getChain();
   SDValue Value = ST->getValue();
   SDValue Ptr   = ST->getBasePtr();
 
   // If this is a store of a bit convert, store the input value if the
   // resultant store does not need a higher alignment than the original.
   if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
       ST->isUnindexed()) {
     EVT SVT = Value.getOperand(0).getValueType();
     // Allowed either before operation legalization for non-volatile stores,
     // or whenever a store of the source type is legal/custom; the target must
     // also consider the change beneficial.
     if (((!LegalOperations && !ST->isVolatile()) ||
          TLI.isOperationLegalOrCustom(ISD::STORE, SVT)) &&
         TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
       unsigned OrigAlign = ST->getAlignment();
       bool Fast = false;
       // The source type must be storable, and fast, at the original alignment.
       if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
                                  ST->getAddressSpace(), OrigAlign, &Fast) &&
           Fast) {
         return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
                             ST->getPointerInfo(), OrigAlign,
                             ST->getMemOperand()->getFlags(), ST->getAAInfo());
       }
     }
   }
 
   // Turn 'store undef, Ptr' -> nothing.
   if (Value.isUndef() && ST->isUnindexed())
     return Chain;
 
   // Try to infer better alignment information than the store already has.
   if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
     if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
       // Only use the inferred alignment if it is an improvement and the
       // store's source-value offset is a multiple of it.
       if (Align > ST->getAlignment() && ST->getSrcValueOffset() % Align == 0) {
         SDValue NewStore =
             DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
                               ST->getMemoryVT(), Align,
                               ST->getMemOperand()->getFlags(), ST->getAAInfo());
         // NewStore will always be N as we are only refining the alignment
         assert(NewStore.getNode() == N);
         (void)NewStore;
       }
     }
   }
 
   // Try transforming a pair floating point load / store ops to integer
   // load / store ops.
   if (SDValue NewST = TransformFPLoadStorePair(N))
     return NewST;
 
   if (ST->isUnindexed()) {
     // Walk up chain skipping non-aliasing memory nodes, on this store and any
     // adjacent stores.
     if (findBetterNeighborChains(ST)) {
       // replaceStoreChain uses CombineTo, which handled all of the worklist
       // manipulation. Return the original node to not do anything else.
       return SDValue(ST, 0);
     }
     // Re-read the chain operand from the store.
     // NOTE(review): presumably findBetterNeighborChains can update ST's chain
     // even when it returns false -- confirm against its definition.
     Chain = ST->getChain();
   }
 
   // FIXME: is there such a thing as a truncating indexed store?
   if (ST->isTruncatingStore() && ST->isUnindexed() &&
       Value.getValueType().isInteger()) {
     // See if we can simplify the input to this truncstore with knowledge that
     // only the low bits are being used.  For example:
     // "truncstore (or (shl x, 8), y), i8"  -> "truncstore y, i8"
     SDValue Shorter = DAG.GetDemandedBits(
         Value, APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
                                     ST->getMemoryVT().getScalarSizeInBits()));
     AddToWorklist(Value.getNode());
     if (Shorter.getNode())
       return DAG.getTruncStore(Chain, SDLoc(N), Shorter,
                                Ptr, ST->getMemoryVT(), ST->getMemOperand());
 
     // Otherwise, see if we can simplify the operation with
     // SimplifyDemandedBits, which only works if the value has a single use.
     if (SimplifyDemandedBits(
             Value,
             APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
                                  ST->getMemoryVT().getScalarSizeInBits()))) {
       // Re-visit the store if anything changed and the store hasn't been merged
       // with another node (N is deleted) SimplifyDemandedBits will add Value's
       // node back to the worklist if necessary, but we also need to re-visit
       // the Store node itself.
       if (N->getOpcode() != ISD::DELETED_NODE)
         AddToWorklist(N);
       return SDValue(N, 0);
     }
   }
 
   // If this is a load followed by a store to the same location, then the store
   // is dead/noop.
   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
     if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
         ST->isUnindexed() && !ST->isVolatile() &&
         // There can't be any side effects between the load and store, such as
         // a call or store.
         Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
       // The store is dead, remove it.
       return Chain;
     }
   }
 
   // Handle a store whose chain operand is itself a store (ST1) of the same
   // memory type to the same base pointer, both unindexed and non-volatile.
   if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
     if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
         !ST1->isVolatile() && ST1->getBasePtr() == Ptr &&
         ST->getMemoryVT() == ST1->getMemoryVT()) {
       // If this is a store followed by a store with the same value to the same
       // location, then the store is dead/noop.
       if (ST1->getValue() == Value) {
         // The store is dead, remove it.
         return Chain;
       }
 
       // If this store is preceded by a store to the same location and no
       // other node is chained to that preceding store, we can effectively
       // drop the preceding store. Do not remove stores to undef as they may
       // be used as data sinks.
       if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
           !ST1->getBasePtr().isUndef()) {
         // ST1 is fully overwritten and can be elided. Combine with its chain
         // value.
         CombineTo(ST1, ST1->getChain());
         return SDValue();
       }
     }
   }
 
   // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
   // truncating store.  We can do this even if this is already a truncstore.
   if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
       && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
       TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
                             ST->getMemoryVT())) {
     return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
                              Ptr, ST->getMemoryVT(), ST->getMemOperand());
   }
 
   // Always perform this optimization before types are legal. If the target
   // prefers, also try this after legalization to catch stores that were created
   // by intrinsics or other nodes.
   if (!LegalTypes || (TLI.mergeStoresAfterLegalization())) {
     while (true) {
       // There can be multiple store sequences on the same chain.
       // Keep trying to merge store sequences until we are unable to do so
       // or until we merge the last store on the chain.
       bool Changed = MergeConsecutiveStores(ST);
       if (!Changed) break;
       // Return N as merge only uses CombineTo and no worklist clean
       // up is necessary.
       // Stop as soon as N itself has been deleted or is no longer a store.
       if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
         return SDValue(N, 0);
     }
   }
 
   // Try transforming N to an indexed store.
   if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
     return SDValue(N, 0);
 
   // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
   //
   // Make sure to do this only after attempting to merge stores in order to
   //  avoid changing the types of some subset of stores due to visit order,
   //  preventing their merging.
   if (isa<ConstantFPSDNode>(ST->getValue())) {
     if (SDValue NewSt = replaceStoreOfFPConstant(ST))
       return NewSt;
   }
 
   // Try splitting a store of two values merged into one wider integer.
   if (SDValue NewSt = splitMergedValStore(ST))
     return NewSt;
 
   // Last resort: whatever ReduceLoadOpStoreWidth returns is the result.
   return ReduceLoadOpStoreWidth(N);
 }
 
 /// For the instruction sequence of store below, F and I values
 /// are bundled together as an i64 value before being stored into memory.
 /// Sometimes it is more efficient to generate separate stores for F and I,
 /// which can remove the bitwise instructions or sink them to colder places.
 ///
 ///   (store (or (zext (bitcast F to i32) to i64),
 ///              (shl (zext I to i64), 32)), addr)  -->
 ///   (store F, addr) and (store I, addr+4)
 ///
 /// Similarly, splitting for other merged store can also be beneficial, like:
 /// For pair of {i32, i32}, i64 store --> two i32 stores.
 /// For pair of {i32, i16}, i64 store --> two i32 stores.
 /// For pair of {i16, i16}, i32 store --> two i16 stores.
 /// For pair of {i16, i8},  i32 store --> two i16 stores.
 /// For pair of {i8, i8},   i16 store --> two i8 stores.
 ///
 /// We allow each target to determine specifically which kind of splitting is
 /// supported.
 ///
 /// The store patterns are commonly seen from the simple code snippet below
 /// if only std::make_pair(...) is sroa transformed before inlined into hoo.
 ///   void goo(const std::pair<int, float> &);
 ///   hoo() {
 ///     ...
 ///     goo(std::make_pair(tmp, ftmp));
 ///     ...
 ///   }
 ///
 /// \param ST The store to examine.
 /// \returns the second (higher-address) of the two new stores, which is
 /// chained on the first, or an empty SDValue if the store was not split.
 SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
   if (OptLevel == CodeGenOpt::None)
     return SDValue();
 
   // The rewrite below emits two plain, unindexed stores that together cover
   // the full width of the stored value. That is only a valid replacement for
   // a plain (non-truncating, unindexed) store, and it must not be applied to
   // a volatile store because it changes the number of memory accesses.
   if (ST->isVolatile() || ST->isTruncatingStore() || !ST->isUnindexed())
     return SDValue();
 
   SDValue Val = ST->getValue();
   SDLoc DL(ST);
 
   // Match OR operand.
   if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
     return SDValue();
 
   // Match SHL operand and get Lower and Higher parts of Val.
   SDValue Op1 = Val.getOperand(0);
   SDValue Op2 = Val.getOperand(1);
   SDValue Lo, Hi;
   // OR is commutative: make Op1 the SHL operand, whichever side it is on.
   if (Op1.getOpcode() != ISD::SHL) {
     std::swap(Op1, Op2);
     if (Op1.getOpcode() != ISD::SHL)
       return SDValue();
   }
   Lo = Op2;
   Hi = Op1.getOperand(0);
   if (!Op1.hasOneUse())
     return SDValue();
 
   // Match shift amount to HalfValBitSize.
   unsigned HalfValBitSize = Val.getValueSizeInBits() / 2;
   ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
   if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
     return SDValue();
 
   // Lo and Hi must each be a single-use zero-extension of a scalar integer
   // that is no wider than half of Val.
   if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
       !Lo.getOperand(0).getValueType().isScalarInteger() ||
       Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize ||
       Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
       !Hi.getOperand(0).getValueType().isScalarInteger() ||
       Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize)
     return SDValue();
 
   // Use the EVT of low and high parts before bitcast as the input
   // of target query.
   EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST)
                   ? Lo.getOperand(0).getValueType()
                   : Lo.getValueType();
   EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST)
                    ? Hi.getOperand(0).getValueType()
                    : Hi.getValueType();
   if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
     return SDValue();
 
   // Start to split store.
   const unsigned HalfValByteSize = HalfValBitSize / 8;
   unsigned Alignment = ST->getAlignment();
   // The high half lives HalfValByteSize bytes past the original address, so
   // the alignment that can be assumed for it is the minimum of the original
   // alignment and that offset. (Halving the alignment is wrong: it yields 0,
   // i.e. "natural alignment", for an align-1 store, and over-claims for a
   // store that is aligned to more than its own size.)
   unsigned HiAlignment = MinAlign(Alignment, HalfValByteSize);
   MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
   AAMDNodes AAInfo = ST->getAAInfo();
 
   // Change the sizes of Lo and Hi's value types to HalfValBitSize.
   EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
   Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
   Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));
 
   SDValue Chain = ST->getChain();
   SDValue Ptr = ST->getBasePtr();
   // Lower value store.
   SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
                              Alignment, MMOFlags, AAInfo);
   Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
                     DAG.getConstant(HalfValByteSize, DL, Ptr.getValueType()));
   // Higher value store, chained on the lower one.
   SDValue St1 =
       DAG.getStore(St0, DL, Hi, Ptr,
                    ST->getPointerInfo().getWithOffset(HalfValByteSize),
                    HiAlignment, MMOFlags, AAInfo);
   return St1;
 }
 
 /// Convert a disguised subvector insertion into a shuffle:
 /// insert_vector_elt V, (bitcast X from vector type), IdxC -->
 /// bitcast(shuffle (bitcast V), (extended X), Mask)
 /// Note: We do not use an insert_subvector node because that requires a legal
 /// subvector type.
 ///
 /// \param N        The insert_vector_elt node; operand 0 is the destination
 ///                 vector and operand 1 the inserted value.
 /// \param InsIndex The (constant) element index being inserted at.
 /// \returns the replacement value, or an empty SDValue if the pattern does
 /// not match or the target cannot handle the resulting shuffle mask.
 SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
   SDValue InsertVal = N->getOperand(1);
   if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
       !InsertVal.getOperand(0).getValueType().isVector())
     return SDValue();
 
   SDValue SubVec = InsertVal.getOperand(0);
   SDValue DestVec = N->getOperand(0);
   EVT SubVecVT = SubVec.getValueType();
   EVT VT = DestVec.getValueType();
 
   // INSERT_VECTOR_ELT allows the inserted value to be wider than the
   // destination element type, in which case it is implicitly truncated. The
   // mask built below drops *all* of X into slot InsIndex, which is only
   // equivalent when X is exactly one destination element wide.
   if (InsertVal.getValueSizeInBits() != VT.getScalarSizeInBits())
     return SDValue();
 
   unsigned NumSrcElts = SubVecVT.getVectorNumElements();
   unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
   unsigned NumMaskVals = ExtendRatio * NumSrcElts;
 
   // Step 1: Create a shuffle mask that implements this insert operation. The
   // vector that we are inserting into will be operand 0 of the shuffle, so
   // those elements are just 'i'. The inserted subvector is in the first
   // positions of operand 1 of the shuffle. Example:
   // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
   SmallVector<int, 16> Mask(NumMaskVals);
   for (unsigned i = 0; i != NumMaskVals; ++i) {
     if (i / NumSrcElts == InsIndex)
       Mask[i] = (i % NumSrcElts) + NumMaskVals;
     else
       Mask[i] = i;
   }
 
   // Bail out if the target can not handle the shuffle we want to create.
   EVT SubVecEltVT = SubVecVT.getVectorElementType();
   EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
   if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
     return SDValue();
 
   // Step 2: Create a wide vector from the inserted source vector by appending
   // undefined elements. This is the same size as our destination vector.
   SDLoc DL(N);
   SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
   ConcatOps[0] = SubVec;
   SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
 
   // Step 3: Shuffle in the padded subvector.
   SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
   SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
   // Queue the newly created nodes so they get visited by the combiner.
   AddToWorklist(PaddedSubV.getNode());
   AddToWorklist(DestVecBC.getNode());
   AddToWorklist(Shuf.getNode());
   return DAG.getBitcast(VT, Shuf);
 }
 
 /// Combine an INSERT_VECTOR_ELT node (operands: vector, value, index).
 ///
 /// Handles, in order: inserting undef, re-inserting an element that was
 /// extracted from the same vector at the same index, the bitcast-subvector
 /// shuffle fold, canonical ordering of chained constant-index inserts, and
 /// folding the insert into a BUILD_VECTOR (or UNDEF) source vector.
 ///
 /// \returns the replacement value, or an empty SDValue if nothing applies.
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue Vec = N->getOperand(0);
   SDValue Scalar = N->getOperand(1);
   SDValue IdxOp = N->getOperand(2);
   SDLoc DL(N);
 
   // Inserting UNDEF leaves the input vector as it is.
   if (Scalar.isUndef())
     return Vec;
 
   EVT VT = Vec.getValueType();
 
   // Remove redundant insertions:
   // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
   const bool ReinsertsOwnElt =
       Scalar.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
       Vec == Scalar.getOperand(0) && IdxOp == Scalar.getOperand(1);
   if (ReinsertsOwnElt)
     return Vec;
 
   // Everything below needs to know which element is being written.
   auto *IdxC = dyn_cast<ConstantSDNode>(IdxOp);
   if (!IdxC)
     return SDValue();
   unsigned Idx = IdxC->getZExtValue();
 
   if (SDValue Shuf = combineInsertEltToShuffle(N, Idx))
     return Shuf;
 
   // Canonicalize insert_vector_elt dag nodes.
   // Example:
   // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
   // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
   //
   // Only done when the inner insert has a single use, both indices are
   // constants, and Idx1 < Idx0.
   const bool InnerIsConstIdxInsert =
       Vec.getOpcode() == ISD::INSERT_VECTOR_ELT && Vec.hasOneUse() &&
       isa<ConstantSDNode>(Vec.getOperand(2));
   if (InnerIsConstIdxInsert) {
     unsigned InnerIdx = Vec.getConstantOperandVal(2);
     if (Idx < InnerIdx) {
       // Perform this node's insert first, then redo the inner insert on top.
       SDValue Inner = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
                                   Vec.getOperand(0), Scalar, IdxOp);
       AddToWorklist(Inner.getNode());
       return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Vec.getNode()), VT,
                          Inner, Vec.getOperand(1), Vec.getOperand(2));
     }
   }
 
   // Give up if a BUILD_VECTOR of this type would not be legal.
   if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
     return SDValue();
 
   // The source must be a BUILD_VECTOR (or UNDEF, which can essentially be
   // converted to a BUILD_VECTOR). A BUILD_VECTOR is only folded when the
   // output vector replaces it, i.e. when this node is its only user.
   const bool IsSingleUseBuildVec =
       Vec.getOpcode() == ISD::BUILD_VECTOR && Vec.hasOneUse();
   if (!IsSingleUseBuildVec && !Vec.isUndef())
     return SDValue();
 
   // Collect the element operands of the vector being built.
   SmallVector<SDValue, 8> Elts;
   if (IsSingleUseBuildVec)
     Elts.append(Vec.getNode()->op_begin(), Vec.getNode()->op_end());
   else
     Elts.append(VT.getVectorNumElements(),
                 DAG.getUNDEF(Scalar.getValueType()));
 
   // Write the new element, if the index is in range.
   if (Idx < Elts.size()) {
     // All the operands of BUILD_VECTOR must have the same type;
     // we enforce that here.
     EVT EltOpVT = Elts[0].getValueType();
     if (EltOpVT.isInteger())
       Elts[Idx] = DAG.getAnyExtOrTrunc(Scalar, DL, EltOpVT);
     else
       Elts[Idx] = Scalar;
   }
 
   // Return the new vector
   return DAG.getBuildVector(VT, DL, Elts);
 }
 
 // Replace an EXTRACT_VECTOR_ELT of a vector load with a scalar load of just
 // the addressed element.
 //
 //   EVE          - the EXTRACT_VECTOR_ELT node being rewritten.
 //   InVecVT      - vector type whose element type becomes the memory type of
 //                  the new load.
 //   EltNo        - element index; either a constant or a run-time value.
 //   OriginalLoad - the vector load being narrowed; must not be volatile.
 //
 // On success the uses of EVE and of the original load's chain are rewritten
 // in place and SDValue(EVE, 0) is returned. An empty SDValue is returned when
 // the narrowed load would be under-aligned, is not legal/custom for the
 // element type, or the target declines via shouldReduceLoadWidth.
 SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
     SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) {
   assert(!OriginalLoad->isVolatile());
 
   EVT ResultVT = EVE->getValueType(0);
   EVT VecEltVT = InVecVT.getVectorElementType();
   unsigned Align = OriginalLoad->getAlignment();
   unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
       VecEltVT.getTypeForEVT(*DAG.getContext()));
 
   // The scalar load needs at least its ABI alignment, and the target must be
   // able to select a load of the element type.
   if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
     return SDValue();
 
   // NOTE(review): this asks about NON_EXTLOAD when the result is wider than
   // the element and EXTLOAD otherwise, which reads as inverted relative to
   // the load actually emitted below - confirm against the target hooks before
   // changing; behavior is intentionally left as-is here.
   ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
     ISD::NON_EXTLOAD : ISD::EXTLOAD;
   if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
     return SDValue();
 
   Align = NewAlign;
 
   SDValue NewPtr = OriginalLoad->getBasePtr();
   SDValue Offset;
   EVT PtrType = NewPtr.getValueType();
   MachinePointerInfo MPI;
   SDLoc DL(EVE);
   if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
     // Constant index: the byte offset is known, so the pointer info can be
     // adjusted precisely.
     int Elt = ConstEltNo->getZExtValue();
     unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
     Offset = DAG.getConstant(PtrOff, DL, PtrType);
     MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
   } else {
     // Variable index: compute the byte offset at run time.
     Offset = DAG.getZExtOrTrunc(EltNo, DL, PtrType);
     Offset = DAG.getNode(
         ISD::MUL, DL, PtrType, Offset,
         DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType));
     // The original pointer info describes an access at the start of the
     // vector; reusing it for an access at an unknown offset would give the
     // memory operand a wrong location. Keep only the address space.
     MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
   }
   NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, NewPtr, Offset);
 
   // The replacement we need to do here is a little tricky: we need to
   // replace an extractelement of a load with a load.
   // Use ReplaceAllUsesOfValuesWith to do the replacement.
   // Note that this replacement assumes that the extractvalue is the only
   // use of the load; that's okay because we don't want to perform this
   // transformation in other cases anyway.
   SDValue Load;
   SDValue Chain;
   if (ResultVT.bitsGT(VecEltVT)) {
     // If the result type of vextract is wider than the load, then issue an
     // extending load instead.
     ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT,
                                                   VecEltVT)
                                    ? ISD::ZEXTLOAD
                                    : ISD::EXTLOAD;
     Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT,
                           OriginalLoad->getChain(), NewPtr, MPI, VecEltVT,
                           Align, OriginalLoad->getMemOperand()->getFlags(),
                           OriginalLoad->getAAInfo());
     Chain = Load.getValue(1);
   } else {
     Load = DAG.getLoad(VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr,
                        MPI, Align, OriginalLoad->getMemOperand()->getFlags(),
                        OriginalLoad->getAAInfo());
     // Capture the chain before Load is rebound to a truncate/bitcast, which
     // has no chain result.
     Chain = Load.getValue(1);
     if (ResultVT.bitsLT(VecEltVT))
       Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
     else
       Load = DAG.getBitcast(ResultVT, Load);
   }
   WorklistRemover DeadNodes(*this);
   // Redirect both the extracted value and the original load's chain output.
   SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
   SDValue To[] = { Load, Chain };
   DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
   // Since we're explicitly calling ReplaceAllUses, add the new node to the
   // worklist explicitly as well.
   AddToWorklist(Load.getNode());
   AddUsersToWorklist(Load.getNode()); // Add users too
   // Make sure to revisit this node to clean it up; it will usually be dead.
   AddToWorklist(EVE);
   ++OpsNarrowed;
   return SDValue(EVE, 0);
 }
 
 // Combine an EXTRACT_VECTOR_ELT node.
 //
 // Operand 0 is the source vector (type VT below) and operand 1 the element
 // index; the node's result type is NVT, which may differ from VT's element
 // type. The folds are tried in order; the first that applies returns the
 // replacement value. SDValue(N, 0) is returned when N (or its operand) was
 // updated in place, and an empty SDValue when nothing applies.
 SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
   // (vextract (scalar_to_vector val, 0) -> val
   SDValue InVec = N->getOperand(0);
   EVT VT = InVec.getValueType();
   EVT NVT = N->getValueType(0);
 
   if (InVec.isUndef())
     return DAG.getUNDEF(NVT);
 
   if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
     // Check if the result type doesn't match the inserted element type. A
     // SCALAR_TO_VECTOR may truncate the inserted element and the
     // EXTRACT_VECTOR_ELT may widen the extracted vector.
     SDValue InOp = InVec.getOperand(0);
     if (InOp.getValueType() != NVT) {
       assert(InOp.getValueType().isInteger() && NVT.isInteger());
       return DAG.getSExtOrTrunc(InOp, SDLoc(InVec), NVT);
     }
     return InOp;
   }
 
   SDValue EltNo = N->getOperand(1);
   ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
 
   // extract_vector_elt of out-of-bounds element -> UNDEF
   if (ConstEltNo && ConstEltNo->getAPIntValue().uge(VT.getVectorNumElements()))
     return DAG.getUNDEF(NVT);
 
   // extract_vector_elt (build_vector x, y), 1 -> y
   if (ConstEltNo &&
       InVec.getOpcode() == ISD::BUILD_VECTOR &&
       TLI.isTypeLegal(VT) &&
       (InVec.hasOneUse() ||
        TLI.aggressivelyPreferBuildVectorSources(VT))) {
     SDValue Elt = InVec.getOperand(ConstEltNo->getZExtValue());
     EVT InEltVT = Elt.getValueType();
 
     // Sometimes build_vector's scalar input types do not match result type.
     if (NVT == InEltVT)
       return Elt;
 
     // TODO: It may be useful to truncate if free if the build_vector implicitly
     // converts.
   }
 
   // extract_vector_elt (v2i32 (bitcast i64:x)), EltTrunc -> i32 (trunc i64:x)
   // The element holding the low bits of the scalar is lane 0 on little-endian
   // targets and the last lane on big-endian targets.
   bool isLE = DAG.getDataLayout().isLittleEndian();
   unsigned EltTrunc = isLE ? 0 : VT.getVectorNumElements() - 1;
   if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() &&
       ConstEltNo->getZExtValue() == EltTrunc && VT.isInteger()) {
     SDValue BCSrc = InVec.getOperand(0);
     if (BCSrc.getValueType().isScalarInteger())
       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
   }
 
   // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
   //
   // This only really matters if the index is non-constant since other combines
   // on the constant elements already work.
   if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT &&
       EltNo == InVec.getOperand(2)) {
     SDValue Elt = InVec.getOperand(1);
     return VT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, SDLoc(N), NVT) : Elt;
   }
 
   // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
   // We only perform this optimization before the op legalization phase because
   // we may introduce new vector instructions which are not backed by TD
   // patterns. For example on AVX, extracting elements from a wide vector
   // without using extract_subvector. However, if we can find an underlying
   // scalar value, then we can always use that.
   if (ConstEltNo && InVec.getOpcode() == ISD::VECTOR_SHUFFLE) {
     int NumElem = VT.getVectorNumElements();
     ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(InVec);
     // Find the new index to extract from.
     int OrigElt = SVOp->getMaskElt(ConstEltNo->getZExtValue());
 
     // Extracting an undef index is undef.
     if (OrigElt == -1)
       return DAG.getUNDEF(NVT);
 
     // Select the right vector half to extract from.
     SDValue SVInVec;
     if (OrigElt < NumElem) {
       SVInVec = InVec->getOperand(0);
     } else {
       SVInVec = InVec->getOperand(1);
       OrigElt -= NumElem;
     }
 
     if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) {
       SDValue InOp = SVInVec.getOperand(OrigElt);
       if (InOp.getValueType() != NVT) {
         assert(InOp.getValueType().isInteger() && NVT.isInteger());
         InOp = DAG.getSExtOrTrunc(InOp, SDLoc(SVInVec), NVT);
       }
 
       return InOp;
     }
 
     // FIXME: We should handle recursing on other vector shuffles and
     // scalar_to_vector here as well.
 
     if (!LegalOperations ||
         // FIXME: Should really be just isOperationLegalOrCustom.
         TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VT) ||
         TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VT)) {
       EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec,
                          DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy));
     }
   }
 
   // If only EXTRACT_VECTOR_ELT nodes use the source vector we can
   // simplify it based on the (valid) extraction indices.
   if (llvm::all_of(InVec->uses(), [&](SDNode *Use) {
         return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
                Use->getOperand(0) == InVec &&
                isa<ConstantSDNode>(Use->getOperand(1));
       })) {
     // Collect the in-range lanes that any user actually reads.
     APInt DemandedElts = APInt::getNullValue(VT.getVectorNumElements());
     for (SDNode *Use : InVec->uses()) {
       auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1));
       if (CstElt->getAPIntValue().ult(VT.getVectorNumElements()))
         DemandedElts.setBit(CstElt->getZExtValue());
     }
     if (SimplifyDemandedVectorElts(InVec, DemandedElts, true))
       return SDValue(N, 0);
   }
 
   // Everything below tries to turn the extract into a narrowed load.
   bool BCNumEltsChanged = false;
   EVT ExtVT = VT.getVectorElementType();
   EVT LVT = ExtVT;
 
   // If the result of load has to be truncated, then it's not necessarily
   // profitable.
   if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT))
     return SDValue();
 
   if (InVec.getOpcode() == ISD::BITCAST) {
     // Don't duplicate a load with other uses.
     if (!InVec.hasOneUse())
       return SDValue();
 
     EVT BCVT = InVec.getOperand(0).getValueType();
     if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
       return SDValue();
     if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
       BCNumEltsChanged = true;
     // Look through the bitcast; VT and LVT keep describing the original
     // (pre-look-through) vector type.
     InVec = InVec.getOperand(0);
     ExtVT = BCVT.getVectorElementType();
   }
 
   // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size)
   // The index must not depend on the load itself, or the rewritten load's
   // address would depend on the node it replaces.
   if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() &&
       ISD::isNormalLoad(InVec.getNode()) &&
       !N->getOperand(1)->hasPredecessor(InVec.getNode())) {
     SDValue Index = N->getOperand(1);
     if (LoadSDNode *OrigLoad = dyn_cast<LoadSDNode>(InVec)) {
       if (!OrigLoad->isVolatile()) {
         return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index,
                                                              OrigLoad);
       }
     }
   }
 
   // Perform only after legalization to ensure build_vector / vector_shuffle
   // optimizations have already been done.
   if (!LegalOperations) return SDValue();
 
   // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
   // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
   // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
 
   if (ConstEltNo) {
     int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
 
     LoadSDNode *LN0 = nullptr;
     const ShuffleVectorSDNode *SVN = nullptr;
     if (ISD::isNormalLoad(InVec.getNode())) {
       LN0 = cast<LoadSDNode>(InVec);
     } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR &&
                InVec.getOperand(0).getValueType() == ExtVT &&
                ISD::isNormalLoad(InVec.getOperand(0).getNode())) {
       // Don't duplicate a load with other uses.
       if (!InVec.hasOneUse())
         return SDValue();
 
       LN0 = cast<LoadSDNode>(InVec.getOperand(0));
     } else if ((SVN = dyn_cast<ShuffleVectorSDNode>(InVec))) {
       // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
       // =>
       // (load $addr+1*size)
 
       // Don't duplicate a load with other uses.
       if (!InVec.hasOneUse())
         return SDValue();
 
       // If the bit convert changed the number of elements, it is unsafe
       // to examine the mask.
       if (BCNumEltsChanged)
         return SDValue();
 
       // Select the input vector, guarding against out of range extract vector.
       // Valid mask positions are [0, NumElems), so NumElems itself is already
       // out of range.
       unsigned NumElems = VT.getVectorNumElements();
       int Idx = (Elt >= (int)NumElems) ? -1 : SVN->getMaskElt(Elt);
       InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1);
 
       if (InVec.getOpcode() == ISD::BITCAST) {
         // Don't duplicate a load with other uses.
         if (!InVec.hasOneUse())
           return SDValue();
 
         InVec = InVec.getOperand(0);
       }
       if (ISD::isNormalLoad(InVec.getNode())) {
         LN0 = cast<LoadSDNode>(InVec);
         // Rebase the index into whichever shuffle operand was selected.
         Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems;
         EltNo = DAG.getConstant(Elt, SDLoc(EltNo), EltNo.getValueType());
       }
     }
 
     // Make sure we found a non-volatile load and the extractelement is
     // the only use.
     if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
       return SDValue();
 
     // If Idx was -1 above, Elt is going to be -1, so just return undef.
     // The replacement must have the node's result type (NVT), which is not
     // necessarily the vector element type (LVT).
     if (Elt == -1)
       return DAG.getUNDEF(NVT);
 
     return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0);
   }
 
   return SDValue();
 }
 
 // Rewrite (build_vec (ext a), (ext b), ...) as
 // (bitcast (build_vec a, pad..., b, pad..., ...)), where every defined
 // operand is an any_extend or zero_extend from one common source type.
 // Returns an empty SDValue when the rewrite does not apply.
 SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
   // Only run between type legalization and operation legalization:
   //  - earlier, the bitcasts created here may be type-legalized into complex
   //    sequences (the type legalizer often scalarizes integer-promoted
   //    vectors);
   //  - later, we could introduce illegal operations.
   const bool InAllowedPhase =
       Level == AfterLegalizeTypes || Level == AfterLegalizeVectorOps;
   if (!InAllowedPhase)
     return SDValue();
 
   unsigned NumInScalars = N->getNumOperands();
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   // Scan the operands. Undef operands are ignored; everything else has to be
   // an any_extend or zero_extend, all from the same narrow type. Sign extends
   // are not handled because the sign bits cannot be filled in with a shuffle.
   EVT NarrowTy = MVT::Other;
   bool OnlyAnyExt = true;
   for (unsigned OpIdx = 0; OpIdx < NumInScalars; ++OpIdx) {
     SDValue Operand = N->getOperand(OpIdx);
     if (Operand.isUndef())
       continue;
 
     const unsigned Opc = Operand.getOpcode();
     if (Opc != ISD::ANY_EXTEND && Opc != ISD::ZERO_EXTEND)
       return SDValue();
 
     // Remember the first source type seen and require the rest to match it.
     EVT OperandSrcTy = Operand.getOperand(0).getValueType();
     if (NarrowTy == MVT::Other)
       NarrowTy = OperandSrcTy;
     else if (OperandSrcTy != NarrowTy)
       return SDValue();
 
     if (Opc == ISD::ZERO_EXTEND)
       OnlyAnyExt = false;
   }
 
   // We need at least one defined operand, and both scalar widths must be
   // powers of two.
   EVT WideScalarTy = VT.getScalarType();
   if (NarrowTy == MVT::Other ||
       !isPowerOf2_32(WideScalarTy.getSizeInBits()) ||
       !isPowerOf2_32(NarrowTy.getSizeInBits()))
     return SDValue();
 
   // Number of narrow elements that make up one wide element.
   const unsigned Ratio =
       WideScalarTy.getSizeInBits() / NarrowTy.getSizeInBits();
   assert(Ratio > 1 && "Invalid element size ratio");
 
   // The padding lanes are undef if every extend was an any_extend, and zero
   // as soon as one zero_extend is involved.
   SDValue Pad;
   if (OnlyAnyExt)
     Pad = DAG.getUNDEF(NarrowTy);
   else
     Pad = DAG.getConstant(0, DL, NarrowTy);
 
   const unsigned NumNarrowElts = Ratio * VT.getVectorNumElements();
   SmallVector<SDValue, 8> NarrowOps(NumNarrowElts, Pad);
 
   // Drop each source value into its wide lane: the first narrow slot of the
   // lane on little-endian targets, the last one on big-endian targets.
   const unsigned SlotInLane =
       DAG.getDataLayout().isLittleEndian() ? 0 : Ratio - 1;
   for (unsigned OpIdx = 0, NumOps = N->getNumOperands(); OpIdx < NumOps;
        ++OpIdx) {
     SDValue Ext = N->getOperand(OpIdx);
     assert((Ext.getOpcode() == ISD::ANY_EXTEND ||
             Ext.getOpcode() == ISD::ZERO_EXTEND ||
             Ext.isUndef()) && "Invalid cast opcode");
 
     SDValue Payload;
     if (Ext.isUndef())
       Payload = DAG.getUNDEF(NarrowTy);
     else
       Payload = Ext.getOperand(0);
 
     const unsigned Slot = OpIdx * Ratio + SlotInLane;
     assert(Slot < NarrowOps.size() && "Invalid index");
     NarrowOps[Slot] = Payload;
   }
 
   // Type of the replacement BUILD_VECTOR; same total width as VT.
   EVT NarrowVecVT =
       EVT::getVectorVT(*DAG.getContext(), NarrowTy, NumNarrowElts);
   assert(NarrowVecVT.getSizeInBits() == VT.getSizeInBits() &&
          "Invalid vector size");
 
   // Give up if the new type is illegal, or if we would trade a legal
   // BUILD_VECTOR for one that is not legal.
   if (!isTypeLegal(NarrowVecVT))
     return SDValue();
   if (!TLI.isOperationLegal(ISD::BUILD_VECTOR, NarrowVecVT) &&
       TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
     return SDValue();
 
   SDValue NarrowBV = DAG.getBuildVector(NarrowVecVT, DL, NarrowOps);
 
   // Queue the new node; it may be combined further.
   AddToWorklist(NarrowBV.getNode());
   return DAG.getBitcast(VT, NarrowBV);
 }
 
 // Turn a BUILD_VECTOR of scalar int-to-fp conversions into a single vector
 // conversion of a BUILD_VECTOR of the integer sources:
 //   (build_vec (sint_to_fp a), (sint_to_fp b), ...)
 //     -> (sint_to_fp (build_vec a, b, ...))
 // and likewise for uint_to_fp. Every defined operand must use the same
 // conversion opcode and the same source type, and at least two operands must
 // be defined. Returns an empty SDValue otherwise.
 SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
   EVT VT = N->getValueType(0);
 
   unsigned NumInScalars = N->getNumOperands();
   SDLoc DL(N);
 
   EVT SrcVT = MVT::Other;
   unsigned ConvOpc = ISD::DELETED_NODE; // "Not chosen yet" marker.
   unsigned NumDefined = 0;
 
   for (unsigned OpIdx = 0; OpIdx < NumInScalars; ++OpIdx) {
     SDValue Elt = N->getOperand(OpIdx);
     const unsigned EltOpc = Elt.getOpcode();
 
     // Undef operands neither help nor hurt.
     if (EltOpc == ISD::UNDEF)
       continue;
 
     // The first int-to-fp conversion we meet fixes the opcode for the rest.
     const bool IsIntToFP =
         EltOpc == ISD::UINT_TO_FP || EltOpc == ISD::SINT_TO_FP;
     if (IsIntToFP && ConvOpc == ISD::DELETED_NODE)
       ConvOpc = EltOpc;
 
     if (EltOpc != ConvOpc)
       return SDValue();
 
     // All conversions have to start from the same integer type.
     EVT EltSrcVT = Elt.getOperand(0).getValueType();
     if (SrcVT == MVT::Other)
       SrcVT = EltSrcVT;
     if (SrcVT != EltSrcVT)
       return SDValue();
 
     ++NumDefined;
   }
 
   // A single defined element is not worth vectorizing.
   if (NumDefined < 2)
     return SDValue();
 
   assert((ConvOpc == ISD::UINT_TO_FP || ConvOpc == ISD::SINT_TO_FP)
          && "Should only handle conversion from integer to float.");
   assert(SrcVT != MVT::Other && "Cannot determine source type!");
 
   EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);
 
   // The vector conversion must be selectable, and the integer vector type
   // must itself be legal - legality of the floating-point vector type does
   // not imply that.
   if (!TLI.isOperationLegalOrCustom(ConvOpc, NVT) || !isTypeLegal(NVT))
     return SDValue();
 
   // Gather the integer sources, keeping undef lanes undef.
   SmallVector<SDValue, 8> SrcElts;
   for (unsigned OpIdx = 0; OpIdx < NumInScalars; ++OpIdx) {
     SDValue Elt = N->getOperand(OpIdx);
     SrcElts.push_back(Elt.isUndef() ? DAG.getUNDEF(SrcVT)
                                     : Elt.getOperand(0));
   }
 
   SDValue SrcBV = DAG.getBuildVector(NVT, DL, SrcElts);
   AddToWorklist(SrcBV.getNode());
 
   return DAG.getNode(ConvOpc, DL, VT, SrcBV);
 }
 
 // Build one shuffle that places the elements N extracts from VecIn1 and
 // VecIn2 into their positions in the BUILD_VECTOR N.
 //
 //   DL         - debug location used for every node created here.
 //   N          - the BUILD_VECTOR; its operands supply the extract indices.
 //   VectorMask - per-element input-vector number. Entries equal to LeftIdx
 //                select VecIn1, entries equal to LeftIdx + 1 select VecIn2;
 //                entries <= 0 and all other values are left undef in the
 //                generated mask.
 //   VecIn1     - first input vector.
 //   VecIn2     - second input vector; may be an empty SDValue.
 //   LeftIdx    - vector number that VecIn1 has in VectorMask.
 //
 // Returns a value of N's type, or an empty SDValue when the input types
 // cannot be reconciled with the output type (or a required operation is not
 // cheap/legal).
 SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                            ArrayRef<int> VectorMask,
                                            SDValue VecIn1, SDValue VecIn2,
                                            unsigned LeftIdx) {
   MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
   SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy);
 
   EVT VT = N->getValueType(0);
   EVT InVT1 = VecIn1.getValueType();
   // With no second input, treat it as having the first input's type.
   EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
 
   // Vec2Offset is added to extract indices that refer to VecIn2 when forming
   // the shuffle mask. ShuffleNumElems is the width of the shuffle actually
   // emitted, which can exceed NumElems (see below).
   unsigned Vec2Offset = 0;
   unsigned NumElems = VT.getVectorNumElements();
   unsigned ShuffleNumElems = NumElems;
 
   // In case both the input vectors are extracted from same base
   // vector we do not need extra addend (Vec2Offset) while
   // computing shuffle mask.
   if (!VecIn2 || !(VecIn1.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
       !(VecIn2.getOpcode() == ISD::EXTRACT_SUBVECTOR) ||
       !(VecIn1.getOperand(0) == VecIn2.getOperand(0)))
     Vec2Offset = InVT1.getVectorNumElements();
 
   // We can't generate a shuffle node with mismatched input and output types.
   // Try to make the types match the type of the output.
   if (InVT1 != VT || InVT2 != VT) {
     if ((VT.getSizeInBits() % InVT1.getSizeInBits() == 0) && InVT1 == InVT2) {
       // If the output vector length is a multiple of both input lengths,
       // we can concatenate them and pad the rest with undefs.
       unsigned NumConcats = VT.getSizeInBits() / InVT1.getSizeInBits();
       assert(NumConcats >= 2 && "Concat needs at least two inputs!");
       SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1));
       ConcatOps[0] = VecIn1;
       ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1);
       // Both inputs now live in one VT-sized vector, so the shuffle becomes
       // single-input.
       VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
       VecIn2 = SDValue();
     } else if (InVT1.getSizeInBits() == VT.getSizeInBits() * 2) {
       // VecIn1 is exactly twice as wide as the output.
       if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))
         return SDValue();
 
       if (!VecIn2.getNode()) {
         // If we only have one input vector, and it's twice the size of the
         // output, split it in two.
         VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1,
                              DAG.getConstant(NumElems, DL, IdxTy));
         VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx);
         // Since we now have shorter input vectors, adjust the offset of the
         // second vector's start.
         Vec2Offset = NumElems;
       } else if (InVT2.getSizeInBits() <= InVT1.getSizeInBits()) {
         // VecIn1 is wider than the output, and we have another, possibly
         // smaller input. Pad the smaller input with undefs, shuffle at the
         // input vector width, and extract the output.
         // The shuffle type is different than VT, so check legality again.
         if (LegalOperations &&
             !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
           return SDValue();
 
         // Legalizing INSERT_SUBVECTOR is tricky - you basically have to
         // lower it back into a BUILD_VECTOR. So if the inserted type is
         // illegal, don't even try.
         if (InVT1 != InVT2) {
           if (!TLI.isTypeLegal(InVT2))
             return SDValue();
           VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1,
                                DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
         }
         // The shuffle is done at VecIn1's width; the low half is extracted
         // at the end of the function.
         ShuffleNumElems = NumElems * 2;
       } else {
         // Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider
         // than VecIn1. We can't handle this for now - this case will disappear
         // when we start sorting the vectors by type.
         return SDValue();
       }
     } else if (InVT2.getSizeInBits() * 2 == VT.getSizeInBits() &&
                InVT1.getSizeInBits() == VT.getSizeInBits()) {
       // VecIn1 already has the output width and VecIn2 is half of it: widen
       // VecIn2 by concatenating it with undef.
       SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2));
       ConcatOps[0] = VecIn2;
       VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
     } else {
       // TODO: Support cases where the length mismatch isn't exactly by a
       // factor of 2.
       // TODO: Move this check upwards, so that if we have bad type
       // mismatches, we don't create any DAG nodes.
       return SDValue();
     }
   }
 
   // Initialize mask to undef.
   SmallVector<int, 8> Mask(ShuffleNumElems, -1);
 
   // Only need to run up to the number of elements actually used, not the
   // total number of elements in the shuffle - if we are shuffling a wider
   // vector, the high lanes should be set to undef.
   for (unsigned i = 0; i != NumElems; ++i) {
     // Skip undef (-1) and zero-vector (0) elements.
     if (VectorMask[i] <= 0)
       continue;
 
     // Operand i is an extract with a constant index (operand 1 of that
     // extract); elements belonging to other input pairs stay undef.
     unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1);
     if (VectorMask[i] == (int)LeftIdx) {
       Mask[i] = ExtIndex;
     } else if (VectorMask[i] == (int)LeftIdx + 1) {
       Mask[i] = Vec2Offset + ExtIndex;
     }
   }
 
   // The type the input vectors may have changed above.
   InVT1 = VecIn1.getValueType();
 
   // If we already have a VecIn2, it should have the same type as VecIn1.
   // If we don't, get an undef/zero vector of the appropriate type.
   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1);
   assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type.");
 
   SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask);
   // A wider-than-output shuffle only carries results in its low lanes.
   if (ShuffleNumElems > NumElems)
     Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx);
 
   return Shuffle;
 }
 
 // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
 // operations. If the types of the vectors we're extracting from allow it,
 // turn this into a vector_shuffle node.
 //
 // Every operand of N must be undef, a zero constant, or an
 // EXTRACT_VECTOR_ELT with an in-range constant index from a vector whose
 // element type matches N's. The input vectors are shuffled pairwise via
 // createBuildVecShuffle and the partial results are then blended together.
 // Returns an empty SDValue if any step cannot be performed.
 SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
   if (!isTypeLegal(VT))
     return SDValue();
 
   // May only combine to shuffle after legalize if shuffle is legal.
   if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
     return SDValue();
 
   bool UsesZeroVector = false;
   unsigned NumElems = N->getNumOperands();
 
   // Record, for each element of the newly built vector, which input vector
   // that element comes from. -1 stands for undef, 0 for the zero vector,
   // and positive values for the input vectors.
   // VectorMask maps each element to its vector number, and VecIn maps vector
   // numbers to their initial SDValues.
 
   SmallVector<int, 8> VectorMask(NumElems, -1);
   SmallVector<SDValue, 8> VecIn;
   // Slot 0 is a placeholder so that real input vectors get numbers >= 1
   // (number 0 means "zero vector" in VectorMask).
   VecIn.push_back(SDValue());
 
   for (unsigned i = 0; i != NumElems; ++i) {
     SDValue Op = N->getOperand(i);
 
     if (Op.isUndef())
       continue;
 
     // See if we can use a blend with a zero vector.
     // TODO: Should we generalize this to a blend with an arbitrary constant
     // vector?
     if (isNullConstant(Op) || isNullFPConstant(Op)) {
       UsesZeroVector = true;
       VectorMask[i] = 0;
       continue;
     }
 
     // Not an undef or zero. If the input is something other than an
     // EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
     if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
         !isa<ConstantSDNode>(Op.getOperand(1)))
       return SDValue();
     SDValue ExtractedFromVec = Op.getOperand(0);
 
     APInt ExtractIdx = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
     if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
       return SDValue();
 
     // All inputs must have the same element type as the output.
     if (VT.getVectorElementType() !=
         ExtractedFromVec.getValueType().getVectorElementType())
       return SDValue();
 
     // Have we seen this input vector before?
     // The vectors are expected to be tiny (usually 1 or 2 elements), so using
     // a map back from SDValues to numbers isn't worth it.
     unsigned Idx = std::distance(
         VecIn.begin(), std::find(VecIn.begin(), VecIn.end(), ExtractedFromVec));
     if (Idx == VecIn.size())
       VecIn.push_back(ExtractedFromVec);
 
     VectorMask[i] = Idx;
   }
 
   // If we didn't find at least one input vector, bail out.
   // (VecIn always contains the placeholder, hence the comparison with 2.)
   if (VecIn.size() < 2)
     return SDValue();
 
   // If all the Operands of BUILD_VECTOR extract from same
   // vector, then split the vector efficiently based on the maximum
   // vector access index and adjust the VectorMask and
   // VecIn accordingly.
   if (VecIn.size() == 2) {
     unsigned MaxIndex = 0;
     unsigned NearestPow2 = 0;
     SDValue Vec = VecIn.back();
     EVT InVT = Vec.getValueType();
     MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
     // Extract index used by each element; stays 0 for undef/zero elements,
     // which are skipped again below.
     SmallVector<unsigned, 8> IndexVec(NumElems, 0);
 
     for (unsigned i = 0; i < NumElems; i++) {
       if (VectorMask[i] <= 0)
         continue;
       unsigned Index = N->getOperand(i).getConstantOperandVal(1);
       IndexVec[i] = Index;
       MaxIndex = std::max(MaxIndex, Index);
     }
 
     NearestPow2 = PowerOf2Ceil(MaxIndex);
     // Only split when the power-of-two prefix that covers every used index is
     // more than twice as long as the output vector.
     if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 &&
         NumElems * 2 < NearestPow2) {
       unsigned SplitSize = NearestPow2 / 2;
       EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
                                      InVT.getVectorElementType(), SplitSize);
       if (TLI.isTypeLegal(SplitVT)) {
         SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
                                      DAG.getConstant(SplitSize, DL, IdxTy));
         SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
                                      DAG.getConstant(0, DL, IdxTy));
         // Replace the single input with its two halves (vector numbers 1
         // and 2).
         VecIn.pop_back();
         VecIn.push_back(VecIn1);
         VecIn.push_back(VecIn2);
 
         // Re-point every extracted element at the half that holds it.
         for (unsigned i = 0; i < NumElems; i++) {
           if (VectorMask[i] <= 0)
             continue;
           VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
         }
       }
     }
   }
 
   // TODO: We want to sort the vectors by descending length, so that adjacent
   // pairs have similar length, and the longer vector is always first in the
   // pair.
 
   // TODO: Should this fire if some of the input vectors has illegal type (like
   // it does now), or should we let legalization run its course first?
 
   // Shuffle phase:
   // Take pairs of vectors, and shuffle them so that the result has elements
   // from these vectors in the correct places.
   // For example, given:
   // t10: i32 = extract_vector_elt t1, Constant:i64<0>
   // t11: i32 = extract_vector_elt t2, Constant:i64<0>
   // t12: i32 = extract_vector_elt t3, Constant:i64<0>
   // t13: i32 = extract_vector_elt t1, Constant:i64<1>
   // t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13
   // We will generate:
   // t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2
   // t21: v4i32 = vector_shuffle<u,u,0,u> t3, undef
   SmallVector<SDValue, 4> Shuffles;
   for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) {
     // Pair up inputs (1,2), (3,4), ...; index 0 is the placeholder. The last
     // pair has no right-hand vector when the number of inputs is odd.
     unsigned LeftIdx = 2 * In + 1;
     SDValue VecLeft = VecIn[LeftIdx];
     SDValue VecRight =
         (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();
 
     if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
                                                 VecRight, LeftIdx))
       Shuffles.push_back(Shuffle);
     else
       return SDValue();
   }
 
   // If we need the zero vector as an "ingredient" in the blend tree, add it
   // to the list of shuffles.
   if (UsesZeroVector)
     Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                       : DAG.getConstantFP(0.0, DL, VT));
 
   // If we only have one shuffle, we're done.
   if (Shuffles.size() == 1)
     return Shuffles[0];
 
   // Update the vector mask to point to the post-shuffle vectors.
   // Input vectors 2k+1 and 2k+2 were merged into Shuffles[k]; the zero vector
   // is the last entry. Undef entries (-1) map to -1 again.
   for (int &Vec : VectorMask)
     if (Vec == 0)
       Vec = Shuffles.size() - 1;
     else
       Vec = (Vec - 1) / 2;
 
   // More than one shuffle. Generate a binary tree of blends, e.g. if from
   // the previous step we got the set of shuffles t10, t11, t12, t13, we will
   // generate:
   // t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2
   // t11: v8i32 = vector_shuffle<u,u,0,8,u,u,u,u> t3, t4
   // t12: v8i32 = vector_shuffle<u,u,u,u,0,8,u,u> t5, t6
   // t13: v8i32 = vector_shuffle<u,u,u,u,u,u,0,8> t7, t8
   // t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11
   // t21: v8i32 = vector_shuffle<u,u,u,u,4,5,14,15> t12, t13
   // t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21
 
   // Make sure the initial size of the shuffle list is even.
   if (Shuffles.size() % 2)
     Shuffles.push_back(DAG.getUNDEF(VT));
 
   // Each round blends adjacent pairs in place, halving the number of live
   // entries at the front of Shuffles until only Shuffles[0] remains.
   for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) {
     if (CurSize % 2) {
       // Odd number of live entries: pad with undef. Slot CurSize is a
       // leftover from the previous, larger round, so it can be overwritten.
       Shuffles[CurSize] = DAG.getUNDEF(VT);
       CurSize++;
     }
     for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) {
       int Left = 2 * In;
       int Right = 2 * In + 1;
       SmallVector<int, 8> Mask(NumElems, -1);
       for (unsigned i = 0; i != NumElems; ++i) {
         // Elements keep their lane; only the source operand differs. After
         // the blend both sources are represented by entry In.
         if (VectorMask[i] == Left) {
           Mask[i] = i;
           VectorMask[i] = In;
         } else if (VectorMask[i] == Right) {
           Mask[i] = i + NumElems;
           VectorMask[i] = In;
         }
       }
 
       Shuffles[In] =
           DAG.getVectorShuffle(VT, DL, Shuffles[Left], Shuffles[Right], Mask);
     }
   }
   return Shuffles[0];
 }
 
 // Try to turn a build vector of zero extends of extract vector elts into a
 // vector zero extend and possibly an extract subvector:
 //   (build_vector (zext (extract_vector_elt X, C)),
 //                 (zext (extract_vector_elt X, C+1)), ...)
 //     --> (zext (extract_subvector X, C))
 // Returns an empty SDValue when the pattern does not match.
 // TODO: Support sign extend or any extend?
 // TODO: Allow undef elements?
 // TODO: Don't require the extracts to start at element 0.
 SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
   if (LegalOperations)
     return SDValue();
 
   SDValue FirstOp = N->getOperand(0);
 
   // Yields the constant extract index when Elt has the form
   // (zext (extract_vector_elt X, C)) and X is the same vector that feeds the
   // first build_vector operand; yields -1 for anything else.
   auto matchExtractIdx = [&](SDValue Elt) -> int64_t {
     if (Elt.getOpcode() != ISD::ZERO_EXTEND)
       return -1;
     SDValue Extract = Elt.getOperand(0);
     if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
       return -1;
     if (FirstOp.getOperand(0).getOperand(0) != Extract.getOperand(0))
       return -1;
     auto *IdxC = dyn_cast<ConstantSDNode>(Extract.getOperand(1));
     if (!IdxC)
       return -1;
     return IdxC->getZExtValue();
   };
 
   // The first element has to match before anything else is inspected.
   int64_t BaseIdx = matchExtractIdx(FirstOp);
   if (BaseIdx < 0)
     return SDValue();
 
   // Build the type of the subvector we would extract: same scalar type as the
   // source vector, one lane per build_vector operand.
   unsigned OperandCount = N->getNumOperands();
   SDValue SrcVec = FirstOp.getOperand(0).getOperand(0);
   EVT SubVecVT =
       EVT::getVectorVT(*DAG.getContext(),
                        SrcVec.getValueType().getScalarType(), OperandCount);
 
   // Don't create an illegal input type after type legalization.
   if (LegalTypes && !TLI.isTypeLegal(SubVecVT))
     return SDValue();
 
   // Every remaining operand must read the next consecutive lane of the same
   // source vector.
   for (unsigned Lane = 1; Lane != OperandCount; ++Lane)
     if (matchExtractIdx(N->getOperand(Lane)) != (BaseIdx + Lane))
       return SDValue();
 
   SDLoc DL(N);
   EVT ResultVT = N->getValueType(0);
   SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, SrcVec,
                                FirstOp.getOperand(0).getOperand(1));
   return DAG.getNode(ISD::ZERO_EXTEND, DL, ResultVT, SubVec);
 }
 
 /// Combine a BUILD_VECTOR node. The folds are tried in order:
 ///  - an all-undef build vector becomes UNDEF,
 ///  - a splat of a bitcast vector becomes a bitcast CONCAT_VECTORS,
 ///  - a run of consecutive EXTRACT_VECTOR_ELTs from one vector becomes that
 ///    vector or an EXTRACT_SUBVECTOR of it,
 ///  - the remaining helper-based folds (zext, ext, convert, shuffle).
 /// Returns an empty SDValue when nothing applies.
 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
   EVT VT = N->getValueType(0);
 
   // A vector built entirely of undefs is undef.
   if (ISD::allOperandsUndef(N))
     return DAG.getUNDEF(VT);
 
   // If this is a splat of a bitcast from another vector, change to a
   // concat_vector.
   // For example:
   //   (build_vector (i64 (bitcast (v2i32 X))), (i64 (bitcast (v2i32 X)))) ->
   //     (v2i64 (bitcast (concat_vectors (v2i32 X), (v2i32 X))))
   //
   // If X is a build_vector itself, the concat can become a larger build_vector.
   // TODO: Maybe this is useful for non-splat too?
   if (!LegalOperations) {
     if (SDValue Splat = cast<BuildVectorSDNode>(N)->getSplatValue()) {
       Splat = peekThroughBitcast(Splat);
       EVT SrcVT = Splat.getValueType();
       if (SrcVT.isVector()) {
         unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements();
         EVT NewVT = EVT::getVectorVT(*DAG.getContext(),
                                      SrcVT.getVectorElementType(), NumElts);
         if (!LegalTypes || TLI.isTypeLegal(NewVT)) {
           SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat);
           SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N),
                                        NewVT, Ops);
           return DAG.getBitcast(VT, Concat);
         }
       }
     }
   }
 
   // Check if we can express BUILD VECTOR via subvector extract.
   if (!LegalTypes && (N->getNumOperands() > 1)) {
     SDValue Op0 = N->getOperand(0);
 
     // Returns the constant index when Op is an EXTRACT_VECTOR_ELT reading from
     // the same vector as Op0, and -1 otherwise. The result is signed and
     // 64 bits wide so that the -1 sentinel is representable and is never
     // narrowed or compared across signedness.
     auto checkElem = [&](SDValue Op) -> int64_t {
       if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
           (Op0.getOperand(0) == Op.getOperand(0)))
         if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
           return CNode->getZExtValue();
       return -1;
     };
 
     // Offset is the index read by the first operand; it stays non-negative
     // only if every later operand reads the next consecutive index.
     int64_t Offset = checkElem(Op0);
     if (Offset >= 0) {
       for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i) {
         if (Offset + i != checkElem(N->getOperand(i))) {
           Offset = -1;
           break;
         }
       }
     }
 
     // Reading every element in order from a vector of the result type is just
     // that vector.
     if ((Offset == 0) &&
         (Op0.getOperand(0).getValueType() == N->getValueType(0)))
       return Op0.getOperand(0);
     if ((Offset >= 0) &&
         ((Offset % N->getValueType(0).getVectorNumElements()) ==
          0)) // IDX must be multiple of output size.
       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0),
                          Op0.getOperand(0), Op0.getOperand(1));
   }
 
   if (SDValue V = convertBuildVecZextToZext(N))
     return V;
 
   if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
     return V;
 
   if (SDValue V = reduceBuildVecConvertToConvertBuildVec(N))
     return V;
 
   if (SDValue V = reduceBuildVecToShuffle(N))
     return V;
 
   return SDValue();
 }
 
 // Fold a CONCAT_VECTORS whose operands are all scalars bitcast to an illegal
 // vector type (or UNDEF) into a bitcast of a single BUILD_VECTOR of those
 // scalars. Returns an empty SDValue if the operand type is legal or any
 // operand has another form.
 static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   EVT PartVT = N->getOperand(0).getValueType();
 
   // Legal vector operands are left untouched.
   if (TLI.isTypeLegal(PartVT))
     return SDValue();
 
   SDLoc DL(N);
   EVT ResultVT = N->getValueType(0);
   SmallVector<SDValue, 8> Scalars;
 
   // Start out assuming integer scalars as wide as one concat operand.
   EVT ScalarVT = EVT::getIntegerVT(*DAG.getContext(), PartVT.getSizeInBits());
   SDValue UndefScalar = DAG.getNode(ISD::UNDEF, DL, ScalarVT);
 
   // Collect the scalar behind each operand and record which scalar kinds
   // show up along the way.
   bool SawInteger = false;
   bool SawFP = false;
   for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) {
     const SDValue &Part = N->getOperand(I);
 
     SDValue Scalar;
     if (Part.getOpcode() == ISD::BITCAST &&
         !Part.getOperand(0).getValueType().isVector())
       Scalar = Part.getOperand(0);
     else if (Part.getOpcode() == ISD::UNDEF)
       Scalar = UndefScalar;
     else
       return SDValue();
     Scalars.push_back(Scalar);
 
     // Anything that is neither integer nor floating point (it could be
     // something weird like x86mmx) stops the fold.
     EVT ThisVT = Scalar.getValueType();
     if (ThisVT.isFloatingPoint())
       SawFP = true;
     else if (ThisVT.isInteger())
       SawInteger = true;
     else
       return SDValue();
   }
 
   // As soon as one floating point scalar is present, floating point becomes
   // the element type. Integer scalars are bitcast to it and integer UNDEFs are
   // replaced by an UNDEF of the floating point type.
   if (SawFP) {
     ScalarVT = EVT::getFloatingPointVT(PartVT.getSizeInBits());
     UndefScalar = DAG.getNode(ISD::UNDEF, DL, ScalarVT);
     if (SawInteger) {
       for (SDValue &Scalar : Scalars) {
         if (Scalar.getValueType() == ScalarVT)
           continue;
         Scalar = Scalar.isUndef() ? UndefScalar
                                   : DAG.getBitcast(ScalarVT, Scalar);
       }
     }
   }
 
   unsigned NumScalars = ResultVT.getSizeInBits() / ScalarVT.getSizeInBits();
   EVT BuildVT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumScalars);
   return DAG.getBitcast(ResultVT, DAG.getBuildVector(BuildVT, DL, Scalars));
 }
 
 // Fold a CONCAT_VECTORS whose operands are all (optionally bitcast)
 // EXTRACT_SUBVECTOR or UNDEF nodes into a single VECTOR_SHUFFLE. This is only
 // done when the extracts read from at most two distinct vectors, each the same
 // size as the result, and the target reports the resulting mask as legal.
 static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
   EVT ResultVT = N->getValueType(0);
   EVT PartVT = N->getOperand(0).getValueType();
   int ResultLanes = ResultVT.getVectorNumElements();
   int PartLanes = PartVT.getVectorNumElements();
 
   SDValue Src0 = DAG.getUNDEF(ResultVT), Src1 = DAG.getUNDEF(ResultVT);
   SmallVector<int, 8> ShufMask;
 
   // One concat operand worth of undefined lanes.
   auto appendUndefLanes = [&]() { ShufMask.append((unsigned)PartLanes, -1); };
   // One concat operand worth of consecutive lanes starting at First.
   auto appendLanesFrom = [&](int First) {
     for (int Lane = 0; Lane != PartLanes; ++Lane)
       ShufMask.push_back(Lane + First);
   };
 
   for (SDValue Part : N->ops()) {
     // Look through a bitcast on the concat operand itself.
     Part = peekThroughBitcast(Part);
 
     // An UNDEF operand contributes only undefined lanes.
     if (Part.isUndef()) {
       appendUndefLanes();
       continue;
     }
 
     if (Part.getOpcode() != ISD::EXTRACT_SUBVECTOR)
       return SDValue();
 
     // Remember the type the extract was performed on before looking through
     // any bitcast; the extract index is expressed in that type's lanes.
     SDValue Src = Part.getOperand(0);
     EVT ExtractedFromVT = Src.getValueType();
     Src = peekThroughBitcast(Src);
 
     // Extracting from UNDEF also contributes only undefined lanes.
     if (Src.isUndef()) {
       appendUndefLanes();
       continue;
     }
 
     auto *IdxC = dyn_cast<ConstantSDNode>(Part.getOperand(1));
     if (!IdxC)
       return SDValue();
     int FirstLane = IdxC->getZExtValue();
 
     // The extract has to read from a vector as wide as the result.
     if (ExtractedFromVT.getSizeInBits() != ResultVT.getSizeInBits())
       return SDValue();
 
     // Rescale the index from lanes of the extracted-from type to lanes of the
     // result type.
     int SrcLanes = ExtractedFromVT.getVectorNumElements();
     if ((SrcLanes % ResultLanes) == 0)
       FirstLane /= (SrcLanes / ResultLanes);
     else if ((ResultLanes % SrcLanes) == 0)
       FirstLane *= (ResultLanes / SrcLanes);
     else
       return SDValue();
 
     // A shuffle has two inputs, so a third distinct source ends the fold.
     if (Src0.isUndef() || Src0 == Src) {
       Src0 = Src;
       appendLanesFrom(FirstLane);
     } else if (Src1.isUndef() || Src1 == Src) {
       Src1 = Src;
       appendLanesFrom(FirstLane + ResultLanes);
     } else {
       return SDValue();
     }
   }
 
   if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(ShufMask, ResultVT))
     return SDValue();
 
   return DAG.getVectorShuffle(ResultVT, SDLoc(N),
                               DAG.getBitcast(ResultVT, Src0),
                               DAG.getBitcast(ResultVT, Src1), ShufMask);
 }
 
 /// Combine a CONCAT_VECTORS node. The folds below are tried in order and the
 /// first one that applies wins:
 ///  - a single-operand concat is its operand,
 ///  - an all-undef concat is UNDEF,
 ///  - concat(bitcast(scalar), undef...) becomes a bitcast SCALAR_TO_VECTOR,
 ///  - a concat of BUILD_VECTOR/UNDEF operands becomes one BUILD_VECTOR,
 ///  - a concat of bitcast scalars becomes a BUILD_VECTOR,
 ///  - a concat of EXTRACT_SUBVECTORs becomes a VECTOR_SHUFFLE,
 ///  - a concat that reassembles one vector from its own subvectors becomes
 ///    that vector.
 /// Returns an empty SDValue when nothing applies.
 SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
   // If we only have one input vector, we don't need to do any concatenation.
   if (N->getNumOperands() == 1)
     return N->getOperand(0);
 
   // Check if all of the operands are undefs.
   EVT VT = N->getValueType(0);
   if (ISD::allOperandsUndef(N))
     return DAG.getUNDEF(VT);
 
   // Optimize concat_vectors where all but the first of the vectors are undef.
   if (std::all_of(std::next(N->op_begin()), N->op_end(), [](const SDValue &Op) {
         return Op.isUndef();
       })) {
     SDValue In = N->getOperand(0);
     assert(In.getValueType().isVector() && "Must concat vectors");
 
     // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr).
     if (In->getOpcode() == ISD::BITCAST &&
         !In->getOperand(0).getValueType().isVector()) {
       SDValue Scalar = In->getOperand(0);
 
       // If the bitcast type isn't legal, it might be a trunc of a legal type;
       // look through the trunc so we can still do the transform:
       //   concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
       if (Scalar->getOpcode() == ISD::TRUNCATE &&
           !TLI.isTypeLegal(Scalar.getValueType()) &&
           TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
         Scalar = Scalar->getOperand(0);
 
       EVT SclTy = Scalar->getValueType(0);
 
       // Only integer and floating point scalars are handled.
       if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
         return SDValue();
 
       // Bail out if the vector size is not a multiple of the scalar size.
       if (VT.getSizeInBits() % SclTy.getSizeInBits())
         return SDValue();
 
       // The result must hold at least two scalars of this type.
       unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
       if (VNTNumElms < 2)
         return SDValue();
 
       // Both the intermediate vector type and the scalar type must be legal.
       EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
       if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
         return SDValue();
 
       SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar);
       return DAG.getBitcast(VT, Res);
     }
   }
 
   // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
   // We have already tested above for an UNDEF only concatenation.
   // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
   // -> (BUILD_VECTOR A, B, ..., C, D, ...)
   auto IsBuildVectorOrUndef = [](const SDValue &Op) {
     return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
   };
   if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
     SmallVector<SDValue, 8> Opnds;
     EVT SVT = VT.getScalarType();
 
     EVT MinVT = SVT;
     if (!SVT.isFloatingPoint()) {
       // If the BUILD_VECTORs are built from integers, they may have different
       // operand types. Get the smallest type and truncate all operands to it.
       bool FoundMinVT = false;
       for (const SDValue &Op : N->ops())
         if (ISD::BUILD_VECTOR == Op.getOpcode()) {
           EVT OpSVT = Op.getOperand(0).getValueType();
           MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
           FoundMinVT = true;
         }
       assert(FoundMinVT && "Concat vector type mismatch");
     }
 
     // Flatten the operands: each UNDEF contributes NumElts undef scalars and
     // each BUILD_VECTOR contributes its own scalar operands.
     for (const SDValue &Op : N->ops()) {
       EVT OpVT = Op.getValueType();
       unsigned NumElts = OpVT.getVectorNumElements();
 
       if (ISD::UNDEF == Op.getOpcode())
         Opnds.append(NumElts, DAG.getUNDEF(MinVT));
 
       if (ISD::BUILD_VECTOR == Op.getOpcode()) {
         if (SVT.isFloatingPoint()) {
           assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
           Opnds.append(Op->op_begin(), Op->op_begin() + NumElts);
         } else {
           // Integer operands are truncated to the common smallest type.
           for (unsigned i = 0; i != NumElts; ++i)
             Opnds.push_back(
                 DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
         }
       }
     }
 
     assert(VT.getVectorNumElements() == Opnds.size() &&
            "Concat vector type mismatch");
     return DAG.getBuildVector(VT, SDLoc(N), Opnds);
   }
 
   // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
   if (SDValue V = combineConcatVectorOfScalars(N, DAG))
     return V;
 
   // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
   if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
     if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
       return V;
 
   // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
   // nodes often generate nop CONCAT_VECTOR nodes.
   // Scan the CONCAT_VECTOR operands and look for CONCAT operations that
   // place the incoming vectors at the exact same location.
   SDValue SingleSource = SDValue();
   unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements();
 
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
     SDValue Op = N->getOperand(i);
 
     // Undef operands are compatible with any source vector.
     if (Op.isUndef())
       continue;
 
     // Check if this is the identity extract:
     if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
       return SDValue();
 
     // Find the single incoming vector for the extract_subvector.
     if (SingleSource.getNode()) {
       if (Op.getOperand(0) != SingleSource)
         return SDValue();
     } else {
       SingleSource = Op.getOperand(0);
 
       // Check the source type is the same as the type of the result.
       // If not, this concat may extend the vector, so we can not
       // optimize it away.
       if (SingleSource.getValueType() != N->getValueType(0))
         return SDValue();
     }
 
     // Operand i of the concat must be the subvector that starts at element
     // i * PartNumElem of the source.
     unsigned IdentityIndex = i * PartNumElem;
     ConstantSDNode *CS = dyn_cast<ConstantSDNode>(Op.getOperand(1));
     // The extract index must be constant.
     if (!CS)
       return SDValue();
 
     // Check that we are reading from the identity index.
     if (CS->getZExtValue() != IdentityIndex)
       return SDValue();
   }
 
   if (SingleSource.getNode())
     return SingleSource;
 
   return SDValue();
 }
 
 /// If we are extracting a subvector produced by a wide binary operator with
 /// at least one operand that was the result of a vector concatenation, then
 /// try to use the narrow vector operands directly to avoid the concatenation
 /// and extraction. Returns an empty SDValue when the pattern does not match.
 static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
   // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
   // some of these bailouts with other transforms.
 
   // Without a constant extract index we cannot tell which concat operand is
   // being read.
   auto *IdxC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
   if (!IdxC)
     return SDValue();
 
   // Only the "double, then halve" shape is handled. A larger ratio may need
   // more than two narrow binops to replace the wide one.
   EVT ResultVT = Extract->getValueType(0);
   unsigned ResultLanes = ResultVT.getVectorNumElements();
   assert((IdxC->getZExtValue() % ResultLanes) == 0 &&
          "Extract index is not a multiple of the vector length.");
   SDValue WideSrc = Extract->getOperand(0);
   if (WideSrc.getValueSizeInBits() != ResultVT.getSizeInBits() * 2)
     return SDValue();
 
   // The extract source may be a bitcast of the wide binary operator.
   SDValue WideOp = peekThroughBitcast(WideSrc);
 
   // TODO: The motivating case for this transform is an x86 AVX1 target. That
   // target has temptingly almost legal versions of bitwise logic ops in 256-bit
   // flavors, but no other 256-bit integer support. This could be extended to
   // handle any binop, but that may require fixing/adding other folds to avoid
   // codegen regressions.
   unsigned Opcode = WideOp.getOpcode();
   switch (Opcode) {
   case ISD::AND:
   case ISD::OR:
   case ISD::XOR:
     break;
   default:
     return SDValue();
   }
 
   // Halving only makes sense for a vector-typed binop.
   EVT WideVT = WideOp.getValueType();
   if (!WideVT.isVector())
     return SDValue();
 
   // The target has to support the half-width version of the operation.
   unsigned HalfLanes = WideVT.getVectorNumElements() / 2;
   EVT HalfVT =
       EVT::getVectorVT(*DAG.getContext(), WideVT.getScalarType(), HalfLanes);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!TLI.isOperationLegalOrCustomOrPromote(Opcode, HalfVT))
     return SDValue();
 
   // Look through bitcasts on the binop operands as well.
   SDValue PeekedL = peekThroughBitcast(WideOp.getOperand(0));
   SDValue PeekedR = peekThroughBitcast(WideOp.getOperand(1));
 
   // At least one operand must be a two-operand concat (one that doubles its
   // inputs), otherwise the transform is not worthwhile.
   // TODO: Should we also handle INSERT_SUBVECTOR patterns?
   auto isDoublingConcat = [](SDValue V) {
     return V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2;
   };
   bool LeftIsConcat = isDoublingConcat(PeekedL);
   bool RightIsConcat = isDoublingConcat(PeekedR);
   if (!(LeftIsConcat || RightIsConcat))
     return SDValue();
 
   // An operand that is not a concat needs its own half-sized extract. The
   // original index operand cannot be reused because of the possible bitcast,
   // so the index is recomputed in lanes of the half-width binop type.
   unsigned HalfNo = IdxC->getZExtValue() / ResultLanes;
   unsigned HalfStart = HalfNo * HalfVT.getVectorNumElements();
   EVT IdxVT = Extract->getOperand(1).getValueType();
   SDLoc DL(Extract);
 
   // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
   // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N)
   // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN
   auto narrowOperand = [&](bool IsConcat, SDValue Peeked,
                            SDValue Original) -> SDValue {
     if (IsConcat)
       return DAG.getBitcast(HalfVT, Peeked.getOperand(HalfNo));
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Original,
                        DAG.getConstant(HalfStart, DL, IdxVT));
   };
   SDValue X = narrowOperand(LeftIsConcat, PeekedL, WideOp.getOperand(0));
   SDValue Y = narrowOperand(RightIsConcat, PeekedR, WideOp.getOperand(1));
 
   SDValue HalfOp = DAG.getNode(Opcode, DL, HalfVT, X, Y);
   return DAG.getBitcast(ResultVT, HalfOp);
 }
 
 /// If we are extracting a subvector from a wide vector load, convert to a
 /// narrow load to eliminate the extraction:
 /// (extract_subvector (load wide vector)) --> (load narrow vector)
 ///
 /// Returns the narrow load, or an empty SDValue when the transform does not
 /// apply: big-endian targets, a non-load or multi-use source, extending or
 /// volatile loads, a non-constant index, or element types that do not occupy
 /// a whole number of bytes.
 static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
   // TODO: Add support for big-endian. The offset calculation must be adjusted.
   if (DAG.getDataLayout().isBigEndian())
     return SDValue();
 
   // TODO: The one-use check is overly conservative. Check the cost of the
   // extract instead or remove that condition entirely.
   auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
   auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
   if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() ||
       !ExtIdx)
     return SDValue();
 
   EVT VT = Extract->getValueType(0);
 
   // The byte offset below is "element index * element store size". A store
   // size is rounded up to whole bytes, so for elements narrower than a byte
   // (e.g. i1) that product does not address the requested element. Only
   // narrow loads whose elements are a whole number of bytes.
   if (VT.getScalarSizeInBits() % 8 != 0)
     return SDValue();
 
   // The narrow load will be offset from the base address of the old load if
   // we are extracting from something besides index 0 (little-endian).
   SDLoc DL(Extract);
   SDValue BaseAddr = Ld->getOperand(1);
   unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize();
 
   // TODO: Use "BaseIndexOffset" to make this more effective.
   SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
   MachineFunction &MF = DAG.getMachineFunction();
   MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset,
                                                    VT.getStoreSize());
   SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
   // Hand the original load and its replacement to the DAG so it can keep the
   // memory ordering equivalent.
   DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
   return NewLd;
 }
 
 /// Combine an EXTRACT_SUBVECTOR node. The folds are tried in order:
 ///  - extract from UNDEF is UNDEF,
 ///  - extract from a wide load becomes a narrow load,
 ///  - extract of a CONCAT_VECTORS operand becomes that operand,
 ///  - extract from a BUILD_VECTOR becomes a smaller BUILD_VECTOR (or a single
 ///    scalar),
 ///  - extract from an INSERT_SUBVECTOR of the same size becomes either the
 ///    inserted value or an extract from the vector that was inserted into,
 ///  - extract from a wide bitwise binop is narrowed,
 ///  - finally the demanded vector elements are simplified.
 /// Returns an empty SDValue when nothing applies.
 SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
   EVT NVT = N->getValueType(0);
   SDValue V = N->getOperand(0);
 
   // Extract from UNDEF is UNDEF.
   if (V.isUndef())
     return DAG.getUNDEF(NVT);
 
   if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT))
     if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
       return NarrowLoad;
 
   // Combine:
   //    (extract_subvec (concat V1, V2, ...), i)
   // Into:
   //    Vi if possible
   // Only operand 0 is checked as 'concat' assumes all inputs of the same
   // type.
   if (V->getOpcode() == ISD::CONCAT_VECTORS &&
       isa<ConstantSDNode>(N->getOperand(1)) &&
       V->getOperand(0).getValueType() == NVT) {
     unsigned Idx = N->getConstantOperandVal(1);
     unsigned NumElems = NVT.getVectorNumElements();
     assert((Idx % NumElems) == 0 &&
            "IDX in concat is not a multiple of the result vector length.");
     return V->getOperand(Idx / NumElems);
   }
 
   // Skip bitcasting
   V = peekThroughBitcast(V);
 
   // If the input is a build vector. Try to make a smaller build vector.
   if (V->getOpcode() == ISD::BUILD_VECTOR) {
     if (auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
       EVT InVT = V->getValueType(0);
       unsigned ExtractSize = NVT.getSizeInBits();
       unsigned EltSize = InVT.getScalarSizeInBits();
       // Only do this if we won't split any elements.
       if (ExtractSize % EltSize == 0) {
         unsigned NumElems = ExtractSize / EltSize;
         EVT EltVT = InVT.getVectorElementType();
         EVT ExtractVT = NumElems == 1 ? EltVT :
           EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems);
         if ((Level < AfterLegalizeDAG ||
              (NumElems == 1 ||
               TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) &&
             (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
           // Convert the extract index from result-type lanes to build_vector
           // operand numbers.
           unsigned IdxVal = (Idx->getZExtValue() * NVT.getScalarSizeInBits()) /
                             EltSize;
           if (NumElems == 1) {
             // A build_vector operand may be wider than the element type, in
             // which case it has to be truncated to the scalar element type
             // (not to the vector type) before the bitcast.
             SDValue Src = V->getOperand(IdxVal);
             if (EltVT != Src.getValueType())
               Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Src);
 
             return DAG.getBitcast(NVT, Src);
           }
 
           // Extract the pieces from the original build_vector.
           SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
                                             makeArrayRef(V->op_begin() + IdxVal,
                                                          NumElems));
           return DAG.getBitcast(NVT, BuildVec);
         }
       }
     }
   }
 
   if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
     // Handle only simple case where vector being inserted and vector
     // being extracted are of same size.
     EVT SmallVT = V->getOperand(1).getValueType();
     if (!NVT.bitsEq(SmallVT))
       return SDValue();
 
     // Only handle cases where both indexes are constants.
     ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
     ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));
 
     if (InsIdx && ExtIdx) {
       // Combine:
       //    (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
       // Into:
       //    indices are equal or bit offsets are equal => V2
       //    otherwise => (extract_subvec V1, ExtIdx)
       if (InsIdx->getZExtValue() * SmallVT.getScalarSizeInBits() ==
           ExtIdx->getZExtValue() * NVT.getScalarSizeInBits())
         return DAG.getBitcast(NVT, V->getOperand(1));
       return DAG.getNode(
           ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT,
           DAG.getBitcast(N->getOperand(0).getValueType(), V->getOperand(0)),
           N->getOperand(1));
     }
   }
 
   if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG))
     return NarrowBOp;
 
   // Returning N itself signals that the node was changed in place.
   if (SimplifyDemandedVectorElts(SDValue(N, 0)))
     return SDValue(N, 0);
 
   return SDValue();
 }
 
 // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
 // or turn a shuffle of a single concat into a simpler shuffle then concat.
 // Returns an empty SDValue when the mask does not copy whole concat operands.
 static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
   EVT ResultVT = N->getValueType(0);
   unsigned ResultLanes = ResultVT.getVectorNumElements();
 
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   ShuffleVectorSDNode *Shuf = cast<ShuffleVectorSDNode>(N);
   ArrayRef<int> Mask = Shuf->getMask();
 
   SmallVector<SDValue, 4> Pieces;
   EVT PieceVT = LHS.getOperand(0).getValueType();
   unsigned PieceLanes = PieceVT.getVectorNumElements();
   unsigned PieceCount = ResultLanes / PieceLanes;
 
   auto isUndefLane = [](int M) { return M == -1; };
 
   // Special case: shuffle(concat(A,B)) can be more efficiently represented
   // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high
   // half vector elements.
   if (PieceLanes * 2 == ResultLanes && RHS.isUndef() &&
       std::all_of(Mask.begin() + PieceLanes, Mask.end(), isUndefLane)) {
     SDValue LowHalf =
         DAG.getVectorShuffle(PieceVT, SDLoc(N), LHS.getOperand(0),
                              LHS.getOperand(1),
                              makeArrayRef(Mask.begin(), PieceLanes));
     SDValue HighHalf = DAG.getUNDEF(PieceVT);
     return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ResultVT, LowHalf,
                        HighHalf);
   }
 
   // Walk the mask one concat-operand-sized slice at a time. Each slice must be
   // either entirely undefined or an exact copy of one concat operand.
   for (unsigned Piece = 0; Piece != PieceCount; ++Piece) {
     unsigned First = Piece * PieceLanes;
 
     // Count how many lanes of this slice are defined.
     unsigned Defined = 0;
     for (unsigned Lane = First; Lane != First + PieceLanes; ++Lane)
       if (Mask[Lane] >= 0)
         ++Defined;
 
     if (Defined != PieceLanes) {
       // A mix of defined and undefined lanes cannot be optimized.
       if (Defined != 0)
         return SDValue();
       Pieces.push_back(DAG.getUNDEF(LHS.getOperand(0).getValueType()));
       continue;
     }
 
     // Fully defined slice: it has to start on a concat operand boundary...
     if (Mask[First] % PieceLanes != 0)
       return SDValue();
 
     // ...and its lanes have to be consecutive.
     for (unsigned Lane = 1; Lane != PieceLanes; ++Lane)
       if (Mask[First + Lane] != Mask[First + Lane - 1] + 1)
         return SDValue();
 
     // Map the starting lane to a concat operand of the first or second input.
     unsigned SrcPiece = Mask[First] / PieceLanes;
     unsigned LHSPieces = LHS.getNumOperands();
     Pieces.push_back(SrcPiece < LHSPieces
                          ? LHS.getOperand(SrcPiece)
                          : RHS.getOperand(SrcPiece - LHSPieces));
   }
 
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ResultVT, Pieces);
 }
 
 // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
 // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
 //
 // SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always
 // a simplification in some sense, but it isn't appropriate in general: some
 // BUILD_VECTORs are substantially cheaper than others. The general case
 // of a BUILD_VECTOR requires inserting each element individually (or
 // performing the equivalent in a temporary stack variable). A BUILD_VECTOR of
 // all constants is a single constant pool load.  A BUILD_VECTOR where each
 // element is identical is a splat.  A BUILD_VECTOR where most of the operands
 // are undef lowers to a small number of element insertions.
 //
 // To deal with this, we currently use a bunch of mostly arbitrary heuristics.
 // We don't fold shuffles where one side is a non-zero constant, and we don't
 // fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate
 // non-constant operands. This seems to work out reasonably well in practice.
 //
 // Returns the merged BUILD_VECTOR, or an empty SDValue if the shuffle inputs
 // cannot or should not be merged.
 static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
                                        SelectionDAG &DAG,
                                        const TargetLowering &TLI) {
   EVT VT = SVN->getValueType(0);
   unsigned NumElts = VT.getVectorNumElements();
   SDValue N0 = SVN->getOperand(0);
   SDValue N1 = SVN->getOperand(1);
 
   if (!N0->hasOneUse() || !N1->hasOneUse())
     return SDValue();
 
   // If only one of N0,N1 is constant, bail out if it is not ALL_ZEROS as
   // discussed above.
   if (!N1.isUndef()) {
     bool N0AnyConst = isAnyConstantBuildVector(N0.getNode());
     bool N1AnyConst = isAnyConstantBuildVector(N1.getNode());
     if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode()))
       return SDValue();
     if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode()))
       return SDValue();
   }
 
   // If both inputs are splats of the same value then we can safely merge this
   // to a single BUILD_VECTOR with undef elements based on the shuffle mask.
   bool IsSplat = false;
   auto *BV0 = dyn_cast<BuildVectorSDNode>(N0);
   auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
   if (BV0 && BV1)
     if (SDValue Splat0 = BV0->getSplatValue())
       IsSplat = (Splat0 == BV1->getSplatValue());
 
   SmallVector<SDValue, 8> Ops;
   SmallSet<SDValue, 16> DuplicateOps;
   for (int M : SVN->getMask()) {
     // Undefined mask elements produce an undef scalar.
     SDValue Op = DAG.getUNDEF(VT.getScalarType());
     if (M >= 0) {
       // Mask values below NumElts select from N0, the rest from N1.
       int Idx = M < (int)NumElts ? M : M - NumElts;
       SDValue &S = (M < (int)NumElts ? N0 : N1);
       if (S.getOpcode() == ISD::BUILD_VECTOR) {
         Op = S.getOperand(Idx);
       } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) {
         // Only element 0 of a SCALAR_TO_VECTOR is defined. A mask is allowed
         // to reference the remaining (undefined) elements, so map those to
         // UNDEF rather than asserting or reusing the scalar.
         SDValue Scalar = S.getOperand(0);
         Op = Idx == 0 ? Scalar : DAG.getUNDEF(Scalar.getValueType());
       } else {
         // Operand can't be combined - bail out.
         return SDValue();
       }
     }
 
     // Don't duplicate a non-constant BUILD_VECTOR operand unless we're
     // generating a splat; semantically, this is fine, but it's likely to
     // generate low-quality code if the target can't reconstruct an appropriate
     // shuffle.
     if (!Op.isUndef() && !isa<ConstantSDNode>(Op) && !isa<ConstantFPSDNode>(Op))
       if (!IsSplat && !DuplicateOps.insert(Op).second)
         return SDValue();
 
     Ops.push_back(Op);
   }
 
   // BUILD_VECTOR requires all inputs to be of the same type, find the
   // maximum type and extend them all.
   EVT SVT = VT.getScalarType();
   if (SVT.isInteger())
     for (SDValue &Op : Ops)
       SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
   if (SVT != VT.getScalarType())
     for (SDValue &Op : Ops)
       Op = TLI.isZExtFree(Op.getValueType(), SVT)
                ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT)
                : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT);
   return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
 }
 
 // Recognize shuffles that are equivalent to an any_extend_vector_inreg of the
 // first operand. Legalization frequently produces these.
 // e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src))
 // Only little-endian integer vectors are handled. Returns an empty SDValue
 // when no match is found or the widened type/operation is not permitted.
 // TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case.
 static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
                                             SelectionDAG &DAG,
                                             const TargetLowering &TLI,
                                             bool LegalOperations,
                                             bool LegalTypes) {
   EVT VT = SVN->getValueType(0);
 
   // TODO Add support for big-endian when we have a test case.
   if (!VT.isInteger() || DAG.getDataLayout().isBigEndian())
     return SDValue();
 
   SDValue Src = SVN->getOperand(0);
   ArrayRef<int> Mask = SVN->getMask();
   unsigned NumElts = VT.getVectorNumElements();
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
 
   // A mask matches an any-extend by 'Scale' when every defined lane sits at a
   // multiple of Scale and reads source element (lane / Scale), e.g.
   // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32)).
   auto MatchesAnyExtend = [&Mask, &NumElts](unsigned Scale) {
     for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
       int M = Mask[Lane];
       if (M < 0)
         continue;
       if ((Lane % Scale) != 0 || M != (int)(Lane / Scale))
         return false;
     }
     return true;
   };
 
   // Only power-of-2 extension factors are searched, as they are the most
   // likely to occur.
   for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
     // Skip factors that do not evenly divide a non power-of-2 element count,
     // and factors whose pattern the mask does not follow.
     if ((NumElts % Scale) != 0 || !MatchesAnyExtend(Scale))
       continue;
 
     EVT WideEltVT =
         EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
     EVT WideVT =
         EVT::getVectorVT(*DAG.getContext(), WideEltVT, NumElts / Scale);
 
     // Respect the current legalization phase.
     if (LegalTypes && !TLI.isTypeLegal(WideVT))
       continue;
     if (LegalOperations &&
         !TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, WideVT))
       continue;
 
     SDValue Ext = DAG.getAnyExtendVectorInReg(Src, SDLoc(SVN), WideVT);
     return DAG.getBitcast(VT, Ext);
   }
 
   return SDValue();
 }
 
 // Recognize 'truncate_vector_inreg' style shuffles, which pack the low part of
 // every element of a wide-element source into the lowest elements of a
 // narrower-element result. Legalization often creates these. When the source
 // is itself a '*_extend_vector_inreg' node of the same scale, the
 // extend/truncate pair cancels and only a bitcast of the extend's input
 // remains.
 static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
                                         SelectionDAG &DAG) {
   EVT VT = SVN->getValueType(0);
 
   // TODO Add support for big-endian when we have a test case.
   if (!VT.isInteger() || DAG.getDataLayout().isBigEndian())
     return SDValue();
 
   // The (possibly bitcasted) first operand must be an in-register extension.
   SDValue Ext = peekThroughBitcast(SVN->getOperand(0));
   switch (Ext.getOpcode()) {
   case ISD::ANY_EXTEND_VECTOR_INREG:
   case ISD::SIGN_EXTEND_VECTOR_INREG:
   case ISD::ZERO_EXTEND_VECTOR_INREG:
     break;
   default:
     return SDValue();
   }
 
   SDValue ExtSrc = Ext.getOperand(0);
   unsigned ExtSrcSizeInBits = ExtSrc.getScalarValueSizeInBits();
   unsigned ExtDstSizeInBits = Ext.getScalarValueSizeInBits();
   if ((ExtDstSizeInBits % ExtSrcSizeInBits) != 0)
     return SDValue();
 
   // At the moment we just handle the case where we've truncated back to the
   // same size as before the extension.
   // TODO: handle more extension/truncation cases as cases arise.
   if (VT.getScalarSizeInBits() != ExtSrcSizeInBits)
     return SDValue();
 
   ArrayRef<int> Mask = SVN->getMask();
   unsigned NumElts = VT.getVectorNumElements();
   unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits;
 
   // The extension can only be removed if the truncation happens at the same
   // scale: every defined lane i must read element (i * ExtScale), and that
   // element must lie within the vector.
   // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
   // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
   // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
   for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
     if (Mask[Lane] < 0)
       continue;
     unsigned SrcIdx = Lane * ExtScale;
     if (SrcIdx >= NumElts || Mask[Lane] != (int)SrcIdx)
       return SDValue();
   }
 
   return DAG.getBitcast(VT, ExtSrc);
 }
 
 // Fold a shuffle whose only defined input is a splat-shuffle:
 //   shuffle (shuffle V, undef, splat-mask), undef, M
 // The splat-mask may contain undef elements, so the fold must not expose an
 // undef lane to users unless composing the two masks would produce undef in
 // that lane anyway.
 static SDValue combineShuffleOfSplat(ArrayRef<int> UserMask,
                                      ShuffleVectorSDNode *Splat,
                                      SelectionDAG &DAG) {
   ArrayRef<int> SplatMask = Splat->getMask();
   assert(UserMask.size() == SplatMask.size() && "Mask length mismatch");
 
   // First choice: reuse the existing splat-shuffle unchanged. That is legal
   // when each undef lane of the splat-shuffle is either undef in the user mask
   // too, or would be undef after composing the masks.
   // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
   // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
   //   Not legal: lane 1 would become undef for the users although the
   //   composed mask defines it.
   // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
   //   Legal: the composition is exactly SplatMask.
   // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
   //   Legal: the composition has every undef of SplatMask plus one more in
   //   lane 0.
   bool ReuseSplat = true;
   for (unsigned Lane = 0, NumLanes = UserMask.size(); Lane != NumLanes;
        ++Lane) {
     int UserIdx = UserMask[Lane];
     // Lanes that are undef for the user, or defined in the splat, are fine.
     if (UserIdx == -1 || SplatMask[Lane] != -1)
       continue;
     // The splat lane is undef but the composed lane would be defined.
     if (SplatMask[UserIdx] != -1) {
       ReuseSplat = false;
       break;
     }
   }
   if (ReuseSplat)
     return SDValue(Splat, 0);
 
   // Otherwise build a new shuffle of the splat's operands using the composed
   // mask.
   SmallVector<int, 32> ComposedMask;
   for (int UserIdx : UserMask) {
     if (UserIdx == -1)
       ComposedMask.push_back(-1);
     else
       ComposedMask.push_back(SplatMask[UserIdx]);
   }
 
   return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
                               Splat->getOperand(0), Splat->getOperand(1),
                               ComposedMask);
 }
 
 /// Returns the mask position that selects from the first vector operand when
 /// the mask takes exactly one element from operand 0 and passes every other
 /// lane straight through (unmoved) from operand 1. Returns -1 for any other
 /// mask, including an empty one.
 static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
   const int NumMaskElts = Mask.size();
   int Op0Lane = -1;
   // TODO: This does not match if there are undef elements in the shuffle mask.
   // Should we ignore undefs in the shuffle mask instead? The trade-off is
   // removing an instruction (a shuffle), but losing the knowledge that some
   // vector lanes are not needed.
   for (int Lane = 0; Lane != NumMaskElts; ++Lane) {
     const int M = Mask[Lane];
     const bool SelectsOp0 = M >= 0 && M < NumMaskElts;
     if (!SelectsOp0) {
       // Anything not from operand 0 must be operand 1's element in this very
       // lane; undef (-1) therefore fails the match as well.
       if (M != Lane + NumMaskElts)
         return -1;
       continue;
     }
     // A second element from operand 0 disqualifies the mask.
     if (Op0Lane != -1)
       return -1;
     Op0Lane = Lane;
   }
   return Op0Lane;
 }
 
 /// When a shuffle moves exactly one element of one operand into the other
 /// operand, and that element is available as the scalar of an
 /// INSERT_VECTOR_ELT with a matching constant index, replace the shuffle with
 /// a single INSERT_VECTOR_ELT into the pass-through operand:
 ///   shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
 /// Returns an empty SDValue if the pattern does not match.
 static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
                                       SelectionDAG &DAG) {
   ArrayRef<int> Mask = Shuf->getMask();
   // Kept at function scope because Mask may be re-pointed at it below.
   SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
   SDValue FromVec = Shuf->getOperand(0);
   SDValue IntoVec = Shuf->getOperand(1);
 
   // Look for "one element of operand 0 into operand 1"; if that fails, try
   // the commuted form of the shuffle.
   int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
   if (ShufOp0Index == -1) {
     ShuffleVectorSDNode::commuteMask(CommutedMask);
     ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
     if (ShufOp0Index == -1)
       return SDValue();
     // Keep the operands in step with the commuted mask.
     std::swap(FromVec, IntoVec);
     Mask = CommutedMask;
   }
 
   // Exactly one element of FromVec is inserted into IntoVec. Now check that
   // the element is reachable as a scalar through a real insert element node.
   // TODO: We can try harder to locate the element as a scalar. Examples: it
   // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
   assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
          "Shuffle mask value must be from operand 0");
   if (FromVec.getOpcode() != ISD::INSERT_VECTOR_ELT)
     return SDValue();
 
   // The insert must have written precisely the lane the shuffle reads.
   SDValue OldInsIndex = FromVec.getOperand(2);
   auto *InsIndexC = dyn_cast<ConstantSDNode>(OldInsIndex);
   if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
     return SDValue();
 
   // An insertelement with a constant index already exists, so a replacement
   // that differs at most in that constant needs no legality/profitability
   // check: the target should lower it similarly, and otherwise legalization
   // expands it to a scalar-to-vector plus shuffle.
   //
   // The shuffle may relocate the scalar, so the new insert uses the shuffle's
   // mask position (ShufOp0Index) rather than the original insert index.
   SDLoc DL(Shuf);
   SDValue NewInsIndex =
       DAG.getConstant(ShufOp0Index, DL, OldInsIndex.getValueType());
   return DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, FromVec.getValueType(),
                      IntoVec, FromVec.getOperand(1), NewInsIndex);
 }
 
 /// Combine a VECTOR_SHUFFLE node. The folds are attempted in a fixed order:
 /// canonicalization of undef/duplicate operands, replacing a one-element
 /// shuffle of an insert with INSERT_VECTOR_ELT, folding shuffles of splats,
 /// demanded-element simplification, extend/truncate-in-register patterns,
 /// shuffles of concats / scalar sources / bitcasted shuffles, and finally
 /// merging a shuffle of a shuffle into a single shuffle.
 /// Returns the replacement value, SDValue(N, 0) when
 /// SimplifyDemandedVectorElts reports a change, or an empty SDValue when no
 /// fold applies.
 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
   EVT VT = N->getValueType(0);
   unsigned NumElts = VT.getVectorNumElements();
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
   assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");
 
   // Canonicalize shuffle undef, undef -> undef
   if (N0.isUndef() && N1.isUndef())
     return DAG.getUNDEF(VT);
 
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
 
   // Canonicalize shuffle v, v -> v, undef
   if (N0 == N1) {
     SmallVector<int, 8> NewMask;
     for (unsigned i = 0; i != NumElts; ++i) {
       int Idx = SVN->getMaskElt(i);
       // Redirect references to the second operand onto the first.
       if (Idx >= (int)NumElts) Idx -= NumElts;
       NewMask.push_back(Idx);
     }
     return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask);
   }
 
   // Canonicalize shuffle undef, v -> v, undef.  Commute the shuffle mask.
   if (N0.isUndef())
     return DAG.getCommutedVectorShuffle(*SVN);
 
   // Remove references to rhs if it is undef
   if (N1.isUndef()) {
     bool Changed = false;
     SmallVector<int, 8> NewMask;
     for (unsigned i = 0; i != NumElts; ++i) {
       int Idx = SVN->getMaskElt(i);
       if (Idx >= (int)NumElts) {
         Idx = -1;
         Changed = true;
       }
       NewMask.push_back(Idx);
     }
     // Only build a new node if at least one mask element was rewritten.
     if (Changed)
       return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
   }
 
   // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
   if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
     return InsElt;
 
   // A shuffle of a single vector that is a splat can always be folded.
   if (auto *N0Shuf = dyn_cast<ShuffleVectorSDNode>(N0))
     if (N1->isUndef() && N0Shuf->isSplat())
       return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG);
 
   // If it is a splat, check if the argument vector is another splat or a
   // build_vector.
   if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
     SDNode *V = N0.getNode();
 
     // If this is a bit convert that changes the element type of the vector but
     // not the number of vector elements, look through it.  Be careful not to
     // look though conversions that change things like v4f32 to v2f64.
     if (V->getOpcode() == ISD::BITCAST) {
       SDValue ConvInput = V->getOperand(0);
       if (ConvInput.getValueType().isVector() &&
           ConvInput.getValueType().getVectorNumElements() == NumElts)
         V = ConvInput.getNode();
     }
 
     if (V->getOpcode() == ISD::BUILD_VECTOR) {
       assert(V->getNumOperands() == NumElts &&
              "BUILD_VECTOR has wrong number of operands");
       SDValue Base;
       bool AllSame = true;
       // Pick the first non-undef operand as the reference value.
       for (unsigned i = 0; i != NumElts; ++i) {
         if (!V->getOperand(i).isUndef()) {
           Base = V->getOperand(i);
           break;
         }
       }
       // Splat of <u, u, u, u>, return <u, u, u, u>
       if (!Base.getNode())
         return N0;
       // Note: an undef operand differs from Base, so it clears AllSame.
       for (unsigned i = 0; i != NumElts; ++i) {
         if (V->getOperand(i) != Base) {
           AllSame = false;
           break;
         }
       }
       // Splat of <x, x, x, x>, return <x, x, x, x>
       if (AllSame)
         return N0;
 
       // Canonicalize any other splat as a build_vector.
       const SDValue &Splatted = V->getOperand(SVN->getSplatIndex());
       SmallVector<SDValue, 8> Ops(NumElts, Splatted);
       SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
 
       // We may have jumped through bitcasts, so the type of the
       // BUILD_VECTOR may not match the type of the shuffle.
       if (V->getValueType(0) != VT)
         NewBV = DAG.getBitcast(VT, NewBV);
       return NewBV;
     }
   }
 
   // Simplify source operands based on shuffle mask.
   if (SimplifyDemandedVectorElts(SDValue(N, 0)))
     return SDValue(N, 0);
 
   // Match shuffles that can be converted to any_vector_extend_in_reg.
   if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations, LegalTypes))
     return V;
 
   // Combine "truncate_vector_in_reg" style shuffles.
   if (SDValue V = combineTruncationShuffle(SVN, DAG))
     return V;
 
   // Shuffle of CONCAT_VECTORS (second operand undef or a CONCAT_VECTORS with
   // the same piece type); only attempted before vector-op legalization.
   if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
       Level < AfterLegalizeVectorOps &&
       (N1.isUndef() ||
       (N1.getOpcode() == ISD::CONCAT_VECTORS &&
        N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
     if (SDValue V = partitionShuffleOfConcats(N, DAG))
       return V;
   }
 
   // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
   // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
   if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
     if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))
       return Res;
 
   // If this shuffle only has a single input that is a bitcasted shuffle,
   // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
   // back to their original types.
   if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
       N1.isUndef() && Level < AfterLegalizeVectorOps &&
       TLI.isTypeLegal(VT)) {
 
     // Peek through the bitcast only if there is one user.
     SDValue BC0 = N0;
     while (BC0.getOpcode() == ISD::BITCAST) {
       if (!BC0.hasOneUse())
         break;
       BC0 = BC0.getOperand(0);
     }
 
     // Rewrite a mask for elements that are 'Scale' times narrower: each mask
     // element expands to Scale consecutive elements, undef stays undef.
     auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) {
       if (Scale == 1)
         return SmallVector<int, 8>(Mask.begin(), Mask.end());
 
       SmallVector<int, 8> NewMask;
       for (int M : Mask)
         for (int s = 0; s != Scale; ++s)
           NewMask.push_back(M < 0 ? -1 : Scale * M + s);
       return NewMask;
     };
 
     if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
       EVT SVT = VT.getScalarType();
       EVT InnerVT = BC0->getValueType(0);
       EVT InnerSVT = InnerVT.getScalarType();
 
       // Determine which shuffle works with the smaller scalar type.
       EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
       EVT ScaleSVT = ScaleVT.getScalarType();
 
       // Both element sizes must be whole multiples of the smaller one.
       if (TLI.isTypeLegal(ScaleVT) &&
           0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
           0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
         int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
         int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();
 
         // Scale the shuffle masks to the smaller scalar type.
         ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
         SmallVector<int, 8> InnerMask =
             ScaleShuffleMask(InnerSVN->getMask(), InnerScale);
         SmallVector<int, 8> OuterMask =
             ScaleShuffleMask(SVN->getMask(), OuterScale);
 
         // Merge the shuffle masks.
         SmallVector<int, 8> NewMask;
         for (int M : OuterMask)
           NewMask.push_back(M < 0 ? -1 : InnerMask[M]);
 
         // Test for shuffle mask legality over both commutations.
         SDValue SV0 = BC0->getOperand(0);
         SDValue SV1 = BC0->getOperand(1);
         bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
         if (!LegalMask) {
           std::swap(SV0, SV1);
           ShuffleVectorSDNode::commuteMask(NewMask);
           LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
         }
 
         if (LegalMask) {
           SV0 = DAG.getBitcast(ScaleVT, SV0);
           SV1 = DAG.getBitcast(ScaleVT, SV1);
           return DAG.getBitcast(
               VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
         }
       }
     }
   }
 
   // Canonicalize shuffles according to rules:
   //  shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
   //  shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
   //  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
   if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
       N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
       TLI.isTypeLegal(VT)) {
     // The incoming shuffle must be of the same type as the result of the
     // current shuffle.
     assert(N1->getOperand(0).getValueType() == VT &&
            "Shuffle types don't match");
 
     SDValue SV0 = N1->getOperand(0);
     SDValue SV1 = N1->getOperand(1);
     bool HasSameOp0 = N0 == SV0;
     bool IsSV1Undef = SV1.isUndef();
     if (HasSameOp0 || IsSV1Undef || N0 == SV1)
       // Commute the operands of this shuffle so that next rule
       // will trigger.
       return DAG.getCommutedVectorShuffle(*SVN);
   }
 
   // Try to fold according to rules:
   //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
   //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
   //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
   // Don't try to fold shuffles with illegal type.
   // Only fold if this shuffle is the only user of the other shuffle.
   if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) &&
       Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
     ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
 
     // Don't try to fold splats; they're likely to simplify somehow, or they
     // might be free.
     if (OtherSV->isSplat())
       return SDValue();
 
     // The incoming shuffle must be of the same type as the result of the
     // current shuffle.
     assert(OtherSV->getOperand(0).getValueType() == VT &&
            "Shuffle types don't match");
 
     // SV0/SV1 are assigned lazily, in the order distinct source vectors are
     // first referenced by the combined mask.
     SDValue SV0, SV1;
     SmallVector<int, 4> Mask;
     // Compute the combined shuffle mask for a shuffle with SV0 as the first
     // operand, and SV1 as the second operand.
     for (unsigned i = 0; i != NumElts; ++i) {
       int Idx = SVN->getMaskElt(i);
       if (Idx < 0) {
         // Propagate Undef.
         Mask.push_back(Idx);
         continue;
       }
 
       SDValue CurrentVec;
       if (Idx < (int)NumElts) {
         // This shuffle index refers to the inner shuffle N0. Lookup the inner
         // shuffle mask to identify which vector is actually referenced.
         Idx = OtherSV->getMaskElt(Idx);
         if (Idx < 0) {
           // Propagate Undef.
           Mask.push_back(Idx);
           continue;
         }
 
         CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0)
                                            : OtherSV->getOperand(1);
       } else {
         // This shuffle index references an element within N1.
         CurrentVec = N1;
       }
 
       // Simple case where 'CurrentVec' is UNDEF.
       if (CurrentVec.isUndef()) {
         Mask.push_back(-1);
         continue;
       }
 
       // Canonicalize the shuffle index. We don't know yet if CurrentVec
       // will be the first or second operand of the combined shuffle.
       Idx = Idx % NumElts;
       if (!SV0.getNode() || SV0 == CurrentVec) {
         // Ok. CurrentVec is the left hand side.
         // Update the mask accordingly.
         SV0 = CurrentVec;
         Mask.push_back(Idx);
         continue;
       }
 
       // Bail out if we cannot convert the shuffle pair into a single shuffle.
       if (SV1.getNode() && SV1 != CurrentVec)
         return SDValue();
 
       // Ok. CurrentVec is the right hand side.
       // Update the mask accordingly.
       SV1 = CurrentVec;
       Mask.push_back(Idx + NumElts);
     }
 
     // Check if all indices in Mask are Undef. In case, propagate Undef.
     bool isUndefMask = true;
     for (unsigned i = 0; i != NumElts && isUndefMask; ++i)
       isUndefMask &= Mask[i] < 0;
 
     if (isUndefMask)
       return DAG.getUNDEF(VT);
 
     // Any operand slot that was never referenced becomes undef.
     if (!SV0.getNode())
       SV0 = DAG.getUNDEF(VT);
     if (!SV1.getNode())
       SV1 = DAG.getUNDEF(VT);
 
     // Avoid introducing shuffles with illegal mask.
     if (!TLI.isShuffleMaskLegal(Mask, VT)) {
       ShuffleVectorSDNode::commuteMask(Mask);
 
       if (!TLI.isShuffleMaskLegal(Mask, VT))
         return SDValue();
 
       //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
       //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
       //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
       std::swap(SV0, SV1);
     }
 
     //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
     //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
     //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
     return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
   }
 
   return SDValue();
 }
 
 /// Combine a SCALAR_TO_VECTOR node whose scalar operand is an
 /// EXTRACT_VECTOR_ELT with a constant index. The result is either a
 /// SCALAR_TO_VECTOR of an explicitly truncated scalar, or a shuffle of the
 /// extract's source vector that moves the element into lane 0 (followed by
 /// an EXTRACT_SUBVECTOR when the result has fewer elements than the source).
 /// Returns an empty SDValue when no fold applies.
 SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
   SDValue InVal = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
   // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
   // with a VECTOR_SHUFFLE and possible truncate.
   if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
     SDValue InVec = InVal->getOperand(0);
     SDValue EltNo = InVal->getOperand(1);
     auto InVecT = InVec.getValueType();
     if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) {
       // Shuffle mask over the source vector type: lane 0 takes the extracted
       // element, every other lane is undef.
       SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);
       int Elt = C0->getZExtValue();
       NewMask[0] = Elt;
       SDValue Val;
       // If the scalar type differs from the result element type (an implicit
       // truncate), make the truncate explicit as long as the result element
       // type is legal. If it is not legal, fall through to the shuffle-based
       // fold below.
       if (VT.getScalarType() != InVal.getValueType() &&
           InVal.getValueType().isScalarInteger() &&
           isTypeLegal(VT.getScalarType())) {
         Val =
             DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
         return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
       }
       // The shuffle is created with type InVecT and NewMask has InVecT's
       // element count, so legality must be queried for InVecT. VT may have
       // fewer elements than the mask.
       if (VT.getScalarType() == InVecT.getScalarType() &&
           VT.getVectorNumElements() <= InVecT.getVectorNumElements() &&
           TLI.isShuffleMaskLegal(NewMask, InVecT)) {
         Val = DAG.getVectorShuffle(InVecT, SDLoc(N), InVec,
                                    DAG.getUNDEF(InVecT), NewMask);
         // If the initial vector is the correct size this shuffle is a
         // valid result.
         if (VT == InVecT)
           return Val;
         // If not we must truncate the vector.
         if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
           MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
           SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy);
           EVT SubVT =
               EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(),
                                VT.getVectorNumElements());
           Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, Val,
                             ZeroIdx);
           return Val;
         }
       }
     }
   }
 
   return SDValue();
 }
 
 /// Combine an INSERT_SUBVECTOR node (base vector, subvector, index). The
 /// folds are tried in order and the first match wins; an empty SDValue is
 /// returned when none applies.
 SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
   SDValue Vec = N->getOperand(0);
   SDValue Sub = N->getOperand(1);
   SDValue Idx = N->getOperand(2);
   EVT VT = N->getValueType(0);
 
   // Inserting UNDEF leaves the base vector unchanged.
   if (Sub.isUndef())
     return Vec;
 
   // For nested INSERT_SUBVECTORs, combine the inner node first so that
   // BITCASTs can be pulled from the input to the output.
   if (Vec.hasOneUse() && Vec->getOpcode() == ISD::INSERT_SUBVECTOR) {
     SDValue NewVec = visitINSERT_SUBVECTOR(Vec.getNode());
     if (NewVec)
       return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, NewVec, Sub,
                          Idx);
   }
 
   if (Vec.isUndef()) {
     if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
       // Inserting an extracted subvector back into undef at the same index:
       // the extract's source can be used directly if it has the result type.
       SDValue Src = Sub.getOperand(0);
       if (Sub.getOperand(1) == Idx && Src.getValueType() == VT)
         return Src;
     } else if (Sub.getOpcode() == ISD::BITCAST) {
       // Same as above, but the extracted subvector was bitcast first. If the
       // extract's source has the same element count and total size as the
       // result, a bitcast of that source is enough.
       // i.e. INSERT_SUBVECTOR UNDEF (BITCAST (EXTRACT_SUBVECTOR X, Idx)) Idx
       //        -> BITCAST X
       SDValue Ext = Sub.getOperand(0);
       if (Ext.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
           Ext.getOperand(1) == Idx) {
         SDValue Src = Ext.getOperand(0);
         EVT SrcVT = Src.getValueType();
         if (SrcVT.getVectorNumElements() == VT.getVectorNumElements() &&
             SrcVT.getSizeInBits() == VT.getSizeInBits())
           return DAG.getBitcast(VT, Src);
       }
     }
   }
 
   // If both the base vector and the subvector are bitcasts of vectors on
   // which the insert still makes sense, pull the bitcast through.
   // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 ->
   //        BITCAST (INSERT_SUBVECTOR N0 N1 N2)
   if (Vec.getOpcode() == ISD::BITCAST && Sub.getOpcode() == ISD::BITCAST) {
     SDValue VecSrc = Vec.getOperand(0);
     SDValue SubSrc = Sub.getOperand(0);
     EVT VecSrcVT = VecSrc.getValueType();
     EVT SubSrcVT = SubSrc.getValueType();
     bool Compatible =
         VecSrcVT.isVector() && SubSrcVT.isVector() &&
         VecSrcVT.getVectorElementType() == SubSrcVT.getVectorElementType() &&
         VecSrcVT.getVectorNumElements() == VT.getVectorNumElements();
     if (Compatible) {
       SDValue NewInsert = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
                                       VecSrcVT, VecSrc, SubSrc, Idx);
       return DAG.getBitcast(VT, NewInsert);
     }
   }
 
   // An insert that overwrites a previous insert of the same type at the same
   // index makes the previous one dead.
   // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
   // --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
   if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
       Vec.getOperand(1).getValueType() == Sub.getValueType() &&
       Vec.getOperand(2) == Idx)
     return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, Vec.getOperand(0),
                        Sub, Idx);
 
   // The remaining folds need a constant insertion index.
   auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
   if (!IdxC)
     return SDValue();
 
   unsigned InsIdx = IdxC->getZExtValue();
 
   // Canonicalize chains of inserts so the lower index is inserted first.
   // Example:
   // (insert_subvector (insert_subvector A, Idx0), Idx1)
   // -> (insert_subvector (insert_subvector A, Idx1), Idx0)
   if (Vec.getOpcode() == ISD::INSERT_SUBVECTOR && Vec.hasOneUse() &&
       Sub.getValueType() == Vec.getOperand(1).getValueType() &&
       isa<ConstantSDNode>(Vec.getOperand(2))) {
     unsigned OtherIdx = Vec.getConstantOperandVal(2);
     if (InsIdx < OtherIdx) {
       // Swap nodes.
       SDValue Inner = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
                                   Vec.getOperand(0), Sub, Idx);
       AddToWorklist(Inner.getNode());
       return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(Vec.getNode()), VT,
                          Inner, Vec.getOperand(1), Vec.getOperand(2));
     }
   }
 
   // If the base vector is a concatenation whose pieces have the subvector's
   // type, the insert replaces one piece: emit a single concat_vectors.
   if (Vec.getOpcode() == ISD::CONCAT_VECTORS && Vec.hasOneUse() &&
       Vec.getOperand(0).getValueType() == Sub.getValueType()) {
     unsigned Factor = Sub.getValueType().getVectorNumElements();
 
     SmallVector<SDValue, 8> Pieces(Vec->op_begin(), Vec->op_end());
     Pieces[IdxC->getZExtValue() / Factor] = Sub;
 
     return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Pieces);
   }
 
   return SDValue();
 }
 
 /// Combine FP_TO_FP16: a conversion to fp16 of a value that was just converted
 /// from fp16 yields the original value.
 SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
   SDValue Src = N->getOperand(0);
 
   // fold (fp_to_fp16 (fp16_to_fp op)) -> op
   if (Src->getOpcode() != ISD::FP16_TO_FP)
     return SDValue();
 
   return Src->getOperand(0);
 }
 
 /// Combine FP16_TO_FP: drop an AND with the non-opaque constant 0xffff that
 /// feeds the conversion.
 SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
   SDValue Src = N->getOperand(0);
 
   // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
   if (Src->getOpcode() != ISD::AND)
     return SDValue();
 
   ConstantSDNode *MaskC = getAsNonOpaqueConstant(Src.getOperand(1));
   if (!MaskC || !(MaskC->getAPIntValue() == 0xffff))
     return SDValue();
 
   return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
                      Src.getOperand(0));
 }
 
 /// Try to rewrite an AND of a vector with a constant BUILD_VECTOR mask as a
 /// vector_shuffle of the other operand with a zero vector. Returns the
 /// (bitcast) shuffle on success and an empty SDValue otherwise.
 /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
 ///      vector_shuffle V, Zero, <0, 4, 2, 4>
 SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
   assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
 
   // Make sure we're not running after operation legalization where it
   // may have custom lowered the vector shuffles.
   if (LegalOperations)
     return SDValue();
 
   // The mask operand must be a BUILD_VECTOR, possibly behind a bitcast.
   SDValue RHS = peekThroughBitcast(N->getOperand(1));
   if (RHS.getOpcode() != ISD::BUILD_VECTOR)
     return SDValue();
 
   EVT VT = N->getValueType(0);
   SDValue LHS = N->getOperand(0);
   SDLoc DL(N);
   EVT RVT = RHS.getValueType();
   unsigned NumElts = RHS.getNumOperands();
   unsigned EltBits = RVT.getScalarSizeInBits();
   bool IsBigEndian = DAG.getDataLayout().isBigEndian();
 
   // Try to build a clear mask with every mask element divided into 'Split'
   // equally sized sub-elements. Each sub-element must be all ones (keep the
   // LHS sub-element), all zeros (take it from the zero vector) or come from
   // an undef element (undef mask index); anything else fails the attempt.
   auto BuildClearMask = [&](int Split) {
     int NumSubElts = NumElts * Split;
     int NumSubBits = EltBits / Split;
 
     SmallVector<int, 8> Indices;
     for (int i = 0; i != NumSubElts; ++i) {
       SDValue Elt = RHS.getOperand(i / Split);
       if (Elt.isUndef()) {
         Indices.push_back(-1);
         continue;
       }
 
       // Only integer and FP constants have a known bit pattern.
       APInt Bits;
       if (auto *CInt = dyn_cast<ConstantSDNode>(Elt))
         Bits = CInt->getAPIntValue();
       else if (auto *CFP = dyn_cast<ConstantFPSDNode>(Elt))
         Bits = CFP->getValueAPF().bitcastToAPInt();
       else
         return SDValue();
 
       // Extract the sub element from the constant bit mask; sub-elements are
       // numbered from the most significant end on big-endian targets.
       int SubIdx = i % Split;
       int Shift = IsBigEndian ? (Split - SubIdx - 1) * NumSubBits
                               : SubIdx * NumSubBits;
       Bits.lshrInPlace(Shift);
 
       if (Split > 1)
         Bits = Bits.trunc(NumSubBits);
 
       if (Bits.isAllOnesValue())
         Indices.push_back(i);
       else if (Bits == 0)
         Indices.push_back(i + NumSubElts);
       else
         return SDValue();
     }
 
     // Let's see if the target supports this vector_shuffle.
     EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
     EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
     if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
       return SDValue();
 
     SDValue Zero = DAG.getConstant(0, DL, ClearVT);
     SDValue Shuf = DAG.getVectorShuffle(ClearVT, DL,
                                         DAG.getBitcast(ClearVT, LHS), Zero,
                                         Indices);
     return DAG.getBitcast(VT, Shuf);
   };
 
   // Determine maximum split level (byte level masking).
   int MaxSplit = 1;
   if (EltBits % 8 == 0)
     MaxSplit = EltBits / 8;
 
   // Try the coarsest split first; only splits that divide the element size
   // evenly are considered.
   for (int Split = 1; Split <= MaxSplit; ++Split) {
     if (EltBits % Split != 0)
       continue;
     if (SDValue S = BuildClearMask(Split))
       return S;
   }
 
   return SDValue();
 }
 
 /// Common simplifications for a binary operation on vectors (e.g. ADD):
 /// constant folding, and hoisting the operation above a pair of identical
 /// single-input shuffles. Returns an empty SDValue if nothing applies.
 SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
   assert(N->getValueType(0).isVector() &&
          "SimplifyVBinOp only works on vectors!");
 
   const unsigned Opcode = N->getOpcode();
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
 
   // First try to fold the whole operation to a constant vector.
   SDValue FoldOps[] = {Op0, Op1};
   SDValue Folded = DAG.FoldConstantVectorArithmetic(
       Opcode, SDLoc(Op0), Op0.getValueType(), FoldOps, N->getFlags());
   if (Folded)
     return Folded;
 
   // Type legalization might introduce new shuffles in the DAG.
   // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
   //   -> (shuffle (VBinOp (A, B)), Undef, Mask).
   if (!LegalTypes)
     return SDValue();
 
   auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Op0);
   auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(Op1);
   if (!Shuf0 || !Shuf1)
     return SDValue();
 
   // Both shuffles must be used only here and must have an undef second input.
   if (!Op0.hasOneUse() || !Op1.hasOneUse())
     return SDValue();
   if (!Op0.getOperand(1).isUndef() || !Op1.getOperand(1).isUndef())
     return SDValue();
 
   ArrayRef<int> Mask = Shuf0->getMask();
   if (!Mask.equals(Shuf1->getMask()))
     return SDValue();
 
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   SDValue Undef = Op0.getOperand(1);
   SDValue Unshuffled = DAG.getNode(Opcode, DL, VT, Op0.getOperand(0),
                                    Op1.getOperand(0), N->getFlags());
   AddUsersToWorklist(N);
   return DAG.getVectorShuffle(VT, DL, Unshuffled, Undef, Mask);
 }
 
 /// Simplify (select N0, N1, N2) where N0 is a SETCC by delegating to
 /// SimplifySelectCC. If that hands back a SELECT_CC it is split again into a
 /// SETCC feeding a SELECT, because the caller started from a SELECT; any other
 /// result is returned unchanged. Returns an empty SDValue if nothing was
 /// simplified.
 SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
                                     SDValue N2) {
   assert(N0.getOpcode() == ISD::SETCC &&
          "First argument must be a SetCC node!");
 
   ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
   SDValue Simplified =
       SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2, CC);
   if (!Simplified.getNode())
     return SDValue();
 
   // Anything that is not a select_cc (e.g. fabs) can be used as is.
   if (Simplified.getOpcode() != ISD::SELECT_CC)
     return Simplified;
 
   // Break the select_cc down into a new SETCC node and a new SELECT node.
   SDValue NewCond =
       DAG.getNode(ISD::SETCC, SDLoc(N0), N0.getValueType(),
                   Simplified.getOperand(0), Simplified.getOperand(1),
                   Simplified.getOperand(4));
   AddToWorklist(NewCond.getNode());
   return DAG.getSelect(SDLoc(Simplified), Simplified.getValueType(), NewCond,
                        Simplified.getOperand(2), Simplified.getOperand(3));
 }
 
 /// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
 /// being selected between, see if we can simplify the select.  Callers of this
 /// should assume that TheSelect is deleted if this returns true.  As such, they
 /// should return the appropriate thing (e.g. the node) back to the top-level of
 /// the DAG combiner loop to avoid it being looked at.
 ///
 /// Two folds are attempted:
 ///  * (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x)) -> (fsqrt x)
 ///  * a select between two compatible loads on the same chain -> a single
 ///    load through a select of the two addresses.
 ///
 /// \param TheSelect the SELECT, VSELECT or SELECT_CC node being simplified.
 /// \param LHS the value selected when the condition is true.
 /// \param RHS the value selected when the condition is false.
 /// \returns true if TheSelect was replaced (via CombineTo), false otherwise.
 bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
                                     SDValue RHS) {
   // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
   // The select + setcc is redundant, because fsqrt returns NaN for X < 0.
   if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
     if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
       // We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
       SDValue Sqrt = RHS;
       // Give CC a defined value; it is only meaningful when Zero is non-null.
       ISD::CondCode CC = ISD::SETCC_INVALID;
       SDValue CmpLHS;
       const ConstantFPSDNode *Zero = nullptr;
 
       if (TheSelect->getOpcode() == ISD::SELECT_CC) {
         CC = cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
         CmpLHS = TheSelect->getOperand(0);
         Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
       } else {
         // SELECT or VSELECT
         SDValue Cmp = TheSelect->getOperand(0);
         if (Cmp.getOpcode() == ISD::SETCC) {
           CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
           CmpLHS = Cmp.getOperand(0);
           Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
         }
       }
       if (Zero && Zero->isZero() &&
           Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT ||
           CC == ISD::SETULT || CC == ISD::SETLT)) {
         // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
         CombineTo(TheSelect, Sqrt);
         return true;
       }
     }
   }
   // Cannot simplify select with vector condition
   if (TheSelect->getOperand(0).getValueType().isVector()) return false;
 
   // If this is a select from two identical things, try to pull the operation
   // through the select.
   if (LHS.getOpcode() != RHS.getOpcode() ||
       !LHS.hasOneUse() || !RHS.hasOneUse())
     return false;
 
   // If this is a load and the token chain is identical, replace the select
   // of two loads with a load through a select of the address to load from.
   // This triggers in things like "select bool X, 10.0, 123.0" after the FP
   // constants have been dropped into the constant pool.
   if (LHS.getOpcode() == ISD::LOAD) {
     LoadSDNode *LLD = cast<LoadSDNode>(LHS);
     LoadSDNode *RLD = cast<LoadSDNode>(RHS);
 
     // Token chains must be identical.
     if (LHS.getOperand(0) != RHS.getOperand(0) ||
         // Do not let this transformation reduce the number of volatile loads.
         LLD->isVolatile() || RLD->isVolatile() ||
         // FIXME: If either is a pre/post inc/dec load,
         // we'd need to split out the address adjustment.
         LLD->isIndexed() || RLD->isIndexed() ||
         // If this is an EXTLOAD, the VT's must match.
         LLD->getMemoryVT() != RLD->getMemoryVT() ||
         // If this is an EXTLOAD, the kind of extension must match.
         (LLD->getExtensionType() != RLD->getExtensionType() &&
          // The only exception is if one of the extensions is anyext.
          LLD->getExtensionType() != ISD::EXTLOAD &&
          RLD->getExtensionType() != ISD::EXTLOAD) ||
         // FIXME: this discards src value information.  This is
         // over-conservative. It would be beneficial to be able to remember
         // both potential memory locations.  Since we are discarding
         // src value info, don't do the transformation if the memory
         // locations are not in the default address space.
         LLD->getPointerInfo().getAddrSpace() != 0 ||
         RLD->getPointerInfo().getAddrSpace() != 0 ||
         !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(),
                                       LLD->getBasePtr().getValueType()))
       return false;
 
     // The loads must not depend on one another: if the address of one load is
     // computed from the value of the other, the merged load would end up
     // feeding its own address and create a cycle in the DAG. This has to hold
     // for SELECT and SELECT_CC alike, so check it before looking at the kind
     // of select.
     if (LLD->isPredecessorOf(RLD) ||
         RLD->isPredecessorOf(LLD))
       return false;
 
     // Check that the select condition doesn't reach either load.  If so,
     // folding this will induce a cycle into the DAG.  If not, this is safe to
     // xform, so create a select of the addresses.
     SDValue Addr;
     if (TheSelect->getOpcode() == ISD::SELECT) {
       SDNode *CondNode = TheSelect->getOperand(0).getNode();
       if ((LLD->hasAnyUseOfValue(1) && LLD->isPredecessorOf(CondNode)) ||
           (RLD->hasAnyUseOfValue(1) && RLD->isPredecessorOf(CondNode)))
         return false;
       Addr = DAG.getSelect(SDLoc(TheSelect),
                            LLD->getBasePtr().getValueType(),
                            TheSelect->getOperand(0), LLD->getBasePtr(),
                            RLD->getBasePtr());
     } else {  // Otherwise SELECT_CC
       SDNode *CondLHS = TheSelect->getOperand(0).getNode();
       SDNode *CondRHS = TheSelect->getOperand(1).getNode();
 
       if ((LLD->hasAnyUseOfValue(1) &&
            (LLD->isPredecessorOf(CondLHS) || LLD->isPredecessorOf(CondRHS))) ||
           (RLD->hasAnyUseOfValue(1) &&
            (RLD->isPredecessorOf(CondLHS) || RLD->isPredecessorOf(CondRHS))))
         return false;
 
       Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
                          LLD->getBasePtr().getValueType(),
                          TheSelect->getOperand(0),
                          TheSelect->getOperand(1),
                          LLD->getBasePtr(), RLD->getBasePtr(),
                          TheSelect->getOperand(4));
     }
 
     SDValue Load;
     // It is safe to replace the two loads if they have different alignments,
     // but the new load must be the minimum (most restrictive) alignment of the
     // inputs.
     unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment());
     // Start from the left load's flags and keep 'invariant' and
     // 'dereferenceable' only if the right load has them as well.
     MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
     if (!RLD->isInvariant())
       MMOFlags &= ~MachineMemOperand::MOInvariant;
     if (!RLD->isDereferenceable())
       MMOFlags &= ~MachineMemOperand::MODereferenceable;
     if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
       // FIXME: Discards pointer and AA info.
       Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
                          LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
                          MMOFlags);
     } else {
       // If the left load is only an any-extend, use the right load's
       // extension kind for the merged load.
       // FIXME: Discards pointer and AA info.
       Load = DAG.getExtLoad(
           LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
                                                   : LLD->getExtensionType(),
           SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
           MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
     }
 
     // Users of the select now use the result of the load.
     CombineTo(TheSelect, Load);
 
     // Users of the old loads now use the new load's chain.  We know the
     // old-load value is dead now.
     CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
     CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
     return true;
   }
 
   return false;
 }
 
 /// Try to turn (N0 cond N1) ? N2 : 0 into a shift of N0 followed by a bitwise
 /// 'and' with N2, for comparisons that only test the sign bit of N0.
 /// Returns an empty SDValue if the pattern does not match.
 SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
                                             SDValue N1, SDValue N2, SDValue N3,
                                             ISD::CondCode CC) {
   // If this is a select where the false operand is zero and the compare is a
   // check of the sign bit, see if we can perform the "gzip trick":
   // select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A
   // select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A
   EVT XType = N0.getValueType();
   EVT AType = N2.getValueType();
   if (!(isNullConstant(N3) && XType.bitsGE(AType)))
     return SDValue();
 
   bool IsSignBitTest = false;
   switch (CC) {
   case ISD::SETGT:
     // Testing for a non-negative value requires inverting the sign bit mask,
     // so only do that if the target has a bitwise 'and not' instruction (the
     // invert is then free).
     if (!TLI.hasAndNot(N2))
       return SDValue();
     // (X > -1) ? A : 0
     // (X >  0) ? X : 0 <-- This is canonical signed max.
     IsSignBitTest =
         isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2);
     break;
   case ISD::SETLT:
     // (X <  0) ? A : 0
     // (X <  1) ? X : 0 <-- This is un-canonicalized signed min.
     IsSignBitTest = isNullConstant(N1) || (isOneConstant(N1) && N0 == N2);
     break;
   default:
     return SDValue();
   }
   if (!IsSignBitTest)
     return SDValue();
 
   EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
   auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
 
   // By default smear the sign bit over the whole value with an arithmetic
   // shift. If A is a constant with at most one bit set, a logical shift that
   // moves the sign bit onto that bit is enough:
   // and (sra X, size(X)-1), A -> "and (srl X, C2), A"
   unsigned ShiftOpcode = ISD::SRA;
   unsigned ShCt = XType.getSizeInBits() - 1;
   if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) {
     ShiftOpcode = ISD::SRL;
     ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1;
   }
 
   SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
   SDValue Mask = DAG.getNode(ShiftOpcode, DL, XType, N0, ShiftAmt);
   AddToWorklist(Mask.getNode());
 
   // Narrow the mask to the type of A if X is wider.
   if (XType.bitsGT(AType)) {
     Mask = DAG.getNode(ISD::TRUNCATE, DL, AType, Mask);
     AddToWorklist(Mask.getNode());
   }
 
   // For the 'greater than' form the mask has to be inverted.
   if (CC == ISD::SETGT)
     Mask = DAG.getNOT(DL, Mask, AType);
 
   return DAG.getNode(ISD::AND, DL, AType, Mask, N2);
 }
 
 /// Simplify an expression of the form (N0 cond N1) ? N2 : N3
 /// where 'cond' is the comparison specified by CC.
 ///
 /// The folds are tried in this order:
 ///  1. identical true/false values;
 ///  2. a condition that SimplifySetCC folds to a constant;
 ///  3. two FP constants -> one load from a two-element constant pool array;
 ///  4. sign-bit tests -> shift + and (foldSelectCCToShiftAnd);
 ///  5. single-bit test selecting between 0 and A -> shl/sra/and;
 ///  6. power-of-two constant or zero -> zero-extended setcc, shifted left;
 ///  7. integer abs idioms -> sra/add/xor;
 ///  8. "zero ? bitwidth : ctlz/cttz" idioms -> plain ctlz/cttz.
 ///
 /// \param NotExtCompare if true, the fold in step 6 is skipped when N2 is the
 ///        constant 1, i.e. when the result would be a plain zext of a compare.
 /// \returns the simplified value, or an empty SDValue if no fold applied.
 SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
                                       SDValue N2, SDValue N3, ISD::CondCode CC,
                                       bool NotExtCompare) {
   // (x ? y : y) -> y.
   if (N2 == N3) return N2;
 
   EVT VT = N2.getValueType();
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
   ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
 
   // Determine if the condition we're dealing with is constant
   SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()),
                               N0, N1, CC, DL, false);
   if (SCC.getNode()) AddToWorklist(SCC.getNode());
 
   if (ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode())) {
     // fold select_cc true, x, y -> x
     // fold select_cc false, x, y -> y
     return !SCCC->isNullValue() ? N2 : N3;
   }
 
   // Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
   // where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
   // in it.  This is a win when the constant is not otherwise available because
   // it replaces two constant pool loads with one.  We only do this if the FP
   // type is known to be legal, because if it isn't, then we are before legalize
   // types and we want the other legalization to happen first (e.g. to avoid
   // messing with soft float) and if the ConstantFP is not legal, because if
   // it is legal, we may not need to store the FP constant in a constant pool.
   if (ConstantFPSDNode *TV = dyn_cast<ConstantFPSDNode>(N2))
     if (ConstantFPSDNode *FV = dyn_cast<ConstantFPSDNode>(N3)) {
       if (TLI.isTypeLegal(N2.getValueType()) &&
           (TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) !=
                TargetLowering::Legal &&
            !TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) &&
            !TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) &&
           // If both constants have multiple uses, then we won't need to do an
           // extra load, they are likely around in registers for other users.
           (TV->hasOneUse() || FV->hasOneUse())) {
         // The false value is stored first (offset 0) and the true value
         // second (offset EltSize), matching the offset select built below.
         Constant *Elts[] = {
           const_cast<ConstantFP*>(FV->getConstantFPValue()),
           const_cast<ConstantFP*>(TV->getConstantFPValue())
         };
         Type *FPTy = Elts[0]->getType();
         const DataLayout &TD = DAG.getDataLayout();
 
         // Create a ConstantArray of the two constants.
         Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
         SDValue CPIdx =
             DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
                                 TD.getPrefTypeAlignment(FPTy));
         unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
 
         // Get the offsets to the 0 and 1 element of the array so that we can
         // select between them.
         SDValue Zero = DAG.getIntPtrConstant(0, DL);
         unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
         SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));
 
         // Offset = cond ? EltSize : 0, i.e. the true value when cond holds.
         SDValue Cond = DAG.getSetCC(DL,
                                     getSetCCResultType(N0.getValueType()),
                                     N0, N1, CC);
         AddToWorklist(Cond.getNode());
         SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(),
                                           Cond, One, Zero);
         AddToWorklist(CstOffset.getNode());
         CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx,
                             CstOffset);
         AddToWorklist(CPIdx.getNode());
         return DAG.getLoad(
             TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
             MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
             Alignment);
       }
     }
 
   // Sign-bit tests with a zero false value become a shift and an 'and'.
   if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
     return V;
 
   // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A)
   // where y has a single bit set.
   // A plaintext description would be, we can turn the SELECT_CC into an AND
   // when the condition can be materialized as an all-ones register.  Any
   // single bit-test can be materialized as an all-ones register with
   // shift-left and shift-right-arith.
   if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
       N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
     SDValue AndLHS = N0->getOperand(0);
     ConstantSDNode *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
     if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
       // Shift the tested bit over the sign bit.
       const APInt &AndMask = ConstAndRHS->getAPIntValue();
       SDValue ShlAmt =
         DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
                         getShiftAmountTy(AndLHS.getValueType()));
       SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);
 
       // Now arithmetic right shift it all the way over, so the result is either
       // all-ones, or zero.
       SDValue ShrAmt =
         DAG.getConstant(AndMask.getBitWidth() - 1, SDLoc(Shl),
                         getShiftAmountTy(Shl.getValueType()));
       SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);
 
       return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
     }
   }
 
   // fold select C, 16, 0 -> shl C, 4
   // Only done when a true setcc result is exactly 1 for this operand type.
   if (N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2() &&
       TLI.getBooleanContents(N0.getValueType()) ==
           TargetLowering::ZeroOrOneBooleanContent) {
 
     // If the caller doesn't want us to simplify this into a zext of a compare,
     // don't do it.
     if (NotExtCompare && N2C->isOne())
       return SDValue();
 
     // Get a SetCC of the condition
     // NOTE: Don't create a SETCC if it's not legal on this target.
     if (!LegalOperations ||
         TLI.isOperationLegal(ISD::SETCC, N0.getValueType())) {
       SDValue Temp, SCC;
       // cast from setcc result type to select result type
       if (LegalTypes) {
         SCC  = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()),
                             N0, N1, CC);
         // The setcc result type may be wider than the select result type.
         if (N2.getValueType().bitsLT(SCC.getValueType()))
           Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2),
                                         N2.getValueType());
         else
           Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
                              N2.getValueType(), SCC);
       } else {
         // Before type legalization an i1 setcc can be used directly.
         SCC  = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
         Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
                            N2.getValueType(), SCC);
       }
 
       AddToWorklist(SCC.getNode());
       AddToWorklist(Temp.getNode());
 
       // select C, 1, 0 is just the zero-extended compare.
       if (N2C->isOne())
         return Temp;
 
       // shl setcc result by log2 n2c
       return DAG.getNode(
           ISD::SHL, DL, N2.getValueType(), Temp,
           DAG.getConstant(N2C->getAPIntValue().logBase2(), SDLoc(Temp),
                           getShiftAmountTy(Temp.getValueType())));
     }
   }
 
   // Check to see if this is an integer abs.
   // select_cc setg[te] X,  0,  X, -X ->
   // select_cc setgt    X, -1,  X, -X ->
   // select_cc setl[te] X,  0, -X,  X ->
   // select_cc setlt    X,  1, -X,  X ->
   // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
   if (N1C) {
     // SubC ends up as the constant left operand of the (sub C, X) that forms
     // the negated arm; the idiom only matches when that constant is zero.
     ConstantSDNode *SubC = nullptr;
     if (((N1C->isNullValue() && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
          (N1C->isAllOnesValue() && CC == ISD::SETGT)) &&
         N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1))
       SubC = dyn_cast<ConstantSDNode>(N3.getOperand(0));
     else if (((N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE)) ||
               (N1C->isOne() && CC == ISD::SETLT)) &&
              N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1))
       SubC = dyn_cast<ConstantSDNode>(N2.getOperand(0));
 
     EVT XType = N0.getValueType();
     if (SubC && SubC->isNullValue() && XType.isInteger()) {
       // NOTE: this local DL shadows the function parameter; the nodes built
       // for the abs expansion carry the location of N0.
       SDLoc DL(N0);
       SDValue Shift = DAG.getNode(ISD::SRA, DL, XType,
                                   N0,
                                   DAG.getConstant(XType.getSizeInBits() - 1, DL,
                                          getShiftAmountTy(N0.getValueType())));
       SDValue Add = DAG.getNode(ISD::ADD, DL,
                                 XType, N0, Shift);
       AddToWorklist(Shift.getNode());
       AddToWorklist(Add.getNode());
       return DAG.getNode(ISD::XOR, DL, XType, Add, Shift);
     }
   }
 
   // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
   // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
   // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
   // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
   // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
   // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
   // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
   // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
   // (sizeof(X) above stands for the bit width of the result type.)
   if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
     SDValue ValueOnZero = N2;
     SDValue Count = N3;
     // If the condition is NE instead of E, swap the operands.
     if (CC == ISD::SETNE)
       std::swap(ValueOnZero, Count);
     // Check if the value on zero is a constant equal to the bits in the type.
     if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
       if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
         // If the other operand is cttz/cttz_zero_undef of N0, and cttz is
         // legal, combine to just cttz.
         if ((Count.getOpcode() == ISD::CTTZ ||
              Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
             N0 == Count.getOperand(0) &&
             (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT)))
           return DAG.getNode(ISD::CTTZ, DL, VT, N0);
         // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
         // legal, combine to just ctlz.
         if ((Count.getOpcode() == ISD::CTLZ ||
              Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
             N0 == Count.getOperand(0) &&
             (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT)))
           return DAG.getNode(ISD::CTLZ, DL, VT, N0);
       }
     }
   }
 
   return SDValue();
 }
 
 /// Thin forwarder to TargetLowering::SimplifySetCC: wraps the combiner state
 /// (DAG, current combine level, this combiner) in a DAGCombinerInfo and lets
 /// the target-independent lowering code do the actual simplification.
 SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
                                    ISD::CondCode Cond, const SDLoc &DL,
                                    bool foldBooleans) {
   TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this);
   SDValue Simplified =
       TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DCI, DL);
   return Simplified;
 }
 
 /// Expand an ISD::SDIV by a non-zero constant (or constant splat) into the
 /// target's multiply-by-magic-number sequence.
 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
 /// Returns an empty SDValue when the divisor is not a suitable constant, when
 /// optimising for minimum size, or when the target declines.
 SDValue DAGCombiner::BuildSDIV(SDNode *N) {
   // A mul plus shifts is larger than a single div, so leave the node alone
   // when optimising for minimum size.
   if (DAG.getMachineFunction().getFunction().optForMinSize())
     return SDValue();
 
   // The divisor has to be a known constant, and never zero.
   ConstantSDNode *Divisor = isConstOrConstSplat(N->getOperand(1));
   if (!Divisor || Divisor->isNullValue())
     return SDValue();
 
   SmallVector<SDNode *, 8> Created;
   SDValue Result = TLI.BuildSDIV(N, Divisor->getAPIntValue(), DAG,
                                  LegalOperations, Created);
 
   // Make sure every freshly built node gets visited by the combiner.
   for (SDNode *NewNode : Created)
     AddToWorklist(NewNode);
   return Result;
 }
 
 /// Expand an ISD::SDIV by a constant power of 2 into the target's
 /// right-shift based sequence. Returns an empty SDValue when the divisor is
 /// not a non-zero constant (or constant splat) or when the target declines.
 SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
   // The divisor has to be a known constant, and never zero.
   ConstantSDNode *Divisor = isConstOrConstSplat(N->getOperand(1));
   if (!Divisor || Divisor->isNullValue())
     return SDValue();
 
   SmallVector<SDNode *, 8> Created;
   SDValue Result =
       TLI.BuildSDIVPow2(N, Divisor->getAPIntValue(), DAG, Created);
 
   // Make sure every freshly built node gets visited by the combiner.
   for (SDNode *NewNode : Created)
     AddToWorklist(NewNode);
   return Result;
 }
 
 /// Expand an ISD::UDIV by a non-zero constant (or constant splat) into the
 /// target's multiply-by-magic-number sequence.
 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
 /// Returns an empty SDValue when the divisor is not a suitable constant, when
 /// optimising for minimum size, or when the target declines.
 SDValue DAGCombiner::BuildUDIV(SDNode *N) {
   // A mul plus shifts is larger than a single div, so leave the node alone
   // when optimising for minimum size.
   if (DAG.getMachineFunction().getFunction().optForMinSize())
     return SDValue();
 
   // The divisor has to be a known constant, and never zero.
   ConstantSDNode *Divisor = isConstOrConstSplat(N->getOperand(1));
   if (!Divisor || Divisor->isNullValue())
     return SDValue();
 
   SmallVector<SDNode *, 8> Created;
   SDValue Result = TLI.BuildUDIV(N, Divisor->getAPIntValue(), DAG,
                                  LegalOperations, Created);
 
   // Make sure every freshly built node gets visited by the combiner.
   for (SDNode *NewNode : Created)
     AddToWorklist(NewNode);
   return Result;
 }
 
 /// Build the DAG nodes computing log2 of a non-null value V via
 ///   LogBase2(V) = (EltBits - 1) - ctlz(V)
 /// where EltBits is the scalar bit width of V's type.
 SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
   EVT Ty = V.getValueType();
   SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, DL, Ty, V);
   SDValue TopBitIndex = DAG.getConstant(Ty.getScalarSizeInBits() - 1, DL, Ty);
   return DAG.getNode(ISD::SUB, DL, Ty, TopBitIndex, LeadingZeros);
 }
 
 /// Build an estimate of 1/Op from the target's reciprocal estimate, refined
 /// with Newton-Raphson steps.
 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
 /// For the reciprocal, we need to find the zero of the function:
 ///   F(X) = A X - 1 [which has a zero at X = 1/A]
 ///     =>
 ///   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
 ///     does not require additional intermediate precision]
 /// Returns an empty SDValue if no estimate is available or allowed.
 SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) {
   if (Level >= AfterLegalizeDAG)
     return SDValue();
 
   // TODO: Handle half and/or extended types?
   EVT VT = Op.getValueType();
   EVT ScalarVT = VT.getScalarType();
   if (ScalarVT != MVT::f32 && ScalarVT != MVT::f64)
     return SDValue();
 
   // Nothing to do if estimates are explicitly disabled for this function.
   MachineFunction &MF = DAG.getMachineFunction();
   int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF);
   if (Enabled == TLI.ReciprocalEstimate::Disabled)
     return SDValue();
 
   // Estimates may be explicitly enabled for this type with a custom number of
   // refinement steps.
   int Iterations = TLI.getDivRefinementSteps(VT, MF);
   SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations);
   if (!Est)
     return SDValue();
   AddToWorklist(Est.getNode());
 
   // Without refinement steps the raw estimate is the answer.
   if (!Iterations)
     return Est;
 
   SDLoc DL(Op);
   SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
 
   // Newton iterations: Est = Est + Est (1 - Arg * Est)
   for (int Step = 0; Step < Iterations; ++Step) {
     SDValue ArgTimesEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags);
     AddToWorklist(ArgTimesEst.getNode());
 
     SDValue Residual = DAG.getNode(ISD::FSUB, DL, VT, FPOne, ArgTimesEst, Flags);
     AddToWorklist(Residual.getNode());
 
     SDValue Correction = DAG.getNode(ISD::FMUL, DL, VT, Est, Residual, Flags);
     AddToWorklist(Correction.getNode());
 
     Est = DAG.getNode(ISD::FADD, DL, VT, Est, Correction, Flags);
     AddToWorklist(Est.getNode());
   }
 
   return Est;
 }
 
 /// Refine a reciprocal square root estimate using only one FP constant (1.5).
 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
 /// For the reciprocal sqrt, we need to find the zero of the function:
 ///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
 ///     =>
 ///   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
 /// A/2 is computed once, ahead of the iteration loop. When \p Reciprocal is
 /// false the refined estimate is multiplied by Arg to obtain sqrt(Arg).
 SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
                                          unsigned Iterations,
                                          SDNodeFlags Flags, bool Reciprocal) {
   EVT VT = Arg.getValueType();
   SDLoc DL(Arg);
   SDValue OnePointFive = DAG.getConstantFP(1.5, DL, VT);
 
   // 0.5 * Arg is written as (1.5 * Arg - Arg) so that the whole sequence
   // needs just the single FP constant above.
   SDValue ScaledArg = DAG.getNode(ISD::FMUL, DL, VT, OnePointFive, Arg, Flags);
   AddToWorklist(ScaledArg.getNode());
 
   SDValue HalfArg = DAG.getNode(ISD::FSUB, DL, VT, ScaledArg, Arg, Flags);
   AddToWorklist(HalfArg.getNode());
 
   // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
   for (unsigned Step = 0; Step != Iterations; ++Step) {
     SDValue EstSquared = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
     AddToWorklist(EstSquared.getNode());
 
     SDValue HalfArgEstSq =
         DAG.getNode(ISD::FMUL, DL, VT, HalfArg, EstSquared, Flags);
     AddToWorklist(HalfArgEstSq.getNode());
 
     SDValue Factor =
         DAG.getNode(ISD::FSUB, DL, VT, OnePointFive, HalfArgEstSq, Flags);
     AddToWorklist(Factor.getNode());
 
     Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Factor, Flags);
     AddToWorklist(Est.getNode());
   }
 
   // A plain square root was requested: multiply the result by Arg.
   if (!Reciprocal) {
     Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
     AddToWorklist(Est.getNode());
   }
 
   return Est;
 }
 
 /// Refine a reciprocal square root estimate using two FP constants (-3.0 and
 /// -0.5).
 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
 /// For the reciprocal sqrt, we need to find the zero of the function:
 ///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
 ///     =>
 ///   X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
 /// When \p Reciprocal is false, the final iteration is adjusted so that the
 /// result is sqrt(Arg) instead of 1/sqrt(Arg).
 SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
                                          unsigned Iterations,
                                          SDNodeFlags Flags, bool Reciprocal) {
   EVT VT = Arg.getValueType();
   SDLoc DL(Arg);
   SDValue NegThree = DAG.getConstantFP(-3.0, DL, VT);
   SDValue NegHalf = DAG.getConstantFP(-0.5, DL, VT);
 
   // The square-root form is only produced inside the loop below, so at least
   // one iteration is required for (Reciprocal == false) to work correctly.
   assert(Iterations > 0);
 
   // Newton iterations for reciprocal square root:
   // E = (E * -0.5) * ((A * E) * E + -3.0)
   for (unsigned Step = 0; Step != Iterations; ++Step) {
     SDValue ArgEst = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
     AddToWorklist(ArgEst.getNode());
 
     SDValue ArgEstEst = DAG.getNode(ISD::FMUL, DL, VT, ArgEst, Est, Flags);
     AddToWorklist(ArgEstEst.getNode());
 
     SDValue Sum = DAG.getNode(ISD::FADD, DL, VT, ArgEstEst, NegThree, Flags);
     AddToWorklist(Sum.getNode());
 
     // When calculating a square root at the last iteration build:
     // S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
     // (notice the common subexpression A * E); otherwise scale E itself.
     const bool IsLastStep = (Step + 1) >= Iterations;
     SDValue ToScale = (!Reciprocal && IsLastStep) ? ArgEst : Est;
     SDValue Scaled = DAG.getNode(ISD::FMUL, DL, VT, ToScale, NegHalf, Flags);
     AddToWorklist(Scaled.getNode());
 
     Est = DAG.getNode(ISD::FMUL, DL, VT, Scaled, Sum, Flags);
     AddToWorklist(Est.getNode());
   }
 
   return Est;
 }
 
 /// Emit nodes that estimate rsqrt(Op) (\p Reciprocal == true) or sqrt(Op)
 /// (\p Reciprocal == false). The sqrt form is computed as Op*rsqrt(Op), so
 /// after refinement the result is explicitly forced to 0.0 for inputs where
 /// that product is wrong (zero, or denormals when the function uses IEEE
 /// denormal handling).
 ///
 /// Returns a null SDValue when the combine level is too late, the scalar type
 /// is neither f32 nor f64, estimates are disabled for the function, or the
 /// target does not provide an initial estimate.
 SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
                                            bool Reciprocal) {
   if (Level >= AfterLegalizeDAG)
     return SDValue();
 
   // TODO: Handle half and/or extended types?
   EVT VT = Op.getValueType();
   EVT EltVT = VT.getScalarType();
   if (EltVT != MVT::f32 && EltVT != MVT::f64)
     return SDValue();
 
   // Bail out if estimates are explicitly disabled for this function.
   MachineFunction &MF = DAG.getMachineFunction();
   int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF);
   if (Enabled == TLI.ReciprocalEstimate::Disabled)
     return SDValue();
 
   // The number of refinement steps may have been customized for this type;
   // the target hook below receives it by reference together with its choice
   // of refinement sequence.
   int Iterations = TLI.getSqrtRefinementSteps(VT, MF);
   bool UseOneConstNR = false;
 
   SDValue Est = TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations,
                                     UseOneConstNR, Reciprocal);
   if (!Est)
     return SDValue();
   AddToWorklist(Est.getNode());
 
   // With no refinement steps the target's estimate is returned untouched.
   if (!Iterations)
     return Est;
 
   Est = UseOneConstNR
             ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
             : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
 
   if (Reciprocal)
     return Est;
 
   // The estimate is now completely wrong if the input was exactly 0.0 or
   // possibly a denormal. Force the answer to 0.0 for those cases.
   SDLoc DL(Op);
   EVT CCVT = getSetCCResultType(VT);
   ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
   const Function &F = MF.getFunction();
   Attribute Denorms = F.getFnAttribute("denormal-fp-math");
   if (Denorms.getValueAsString().equals("ieee")) {
     // fabs(X) < SmallestNormal ? 0.0 : Est
     const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
     APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
     SDValue NormC = DAG.getConstantFP(SmallestNorm, DL, VT);
     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
     SDValue AbsOp = DAG.getNode(ISD::FABS, DL, VT, Op);
     SDValue IsDenorm = DAG.getSetCC(DL, CCVT, AbsOp, NormC, ISD::SETLT);
     Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, Zero, Est);
     AddToWorklist(AbsOp.getNode());
     AddToWorklist(IsDenorm.getNode());
     AddToWorklist(Est.getNode());
     return Est;
   }
 
   // X == 0.0 ? 0.0 : Est
   SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
   SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, Zero, ISD::SETEQ);
   Est = DAG.getNode(SelOpcode, DL, VT, IsZero, Zero, Est);
   AddToWorklist(IsZero.getNode());
   AddToWorklist(Est.getNode());
   return Est;
 }
 
 /// Build a reciprocal square root estimate of \p Op; a thin wrapper that
 /// forwards to buildSqrtEstimateImpl with the reciprocal form selected.
 SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
   return buildSqrtEstimateImpl(Op, Flags, /*Reciprocal=*/true);
 }
 
 /// Build a square root estimate of \p Op; a thin wrapper that forwards to
 /// buildSqrtEstimateImpl with the non-reciprocal form selected.
 SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
   return buildSqrtEstimateImpl(Op, Flags, /*Reciprocal=*/false);
 }
 
 /// Return true if there is any possibility that the two addresses overlap.
 ///
 /// The checks run from cheapest to most expensive: identical base pointers,
 /// volatility, invariant memory, base+index+offset decomposition, relative
 /// alignment of the source value offsets and, when enabled, a query to alias
 /// analysis. Anything that cannot be proven disjoint is reported as aliasing.
 bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
   // If they are the same then they must be aliases.
   if (Op0->getBasePtr() == Op1->getBasePtr()) return true;
 
   // If they are both volatile then they cannot be reordered.
   if (Op0->isVolatile() && Op1->isVolatile()) return true;
 
   // If one operation reads from invariant memory, and the other may store, they
   // cannot alias. These should really be checking the equivalent of mayWrite,
   // but it only matters for memory nodes other than load/store.
   if (Op0->isInvariant() && Op1->writeMem())
     return false;
 
   if (Op1->isInvariant() && Op0->writeMem())
     return false;
 
   // Number of bytes each access touches, taken from its in-memory type.
   unsigned NumBytes0 = Op0->getMemoryVT().getStoreSize();
   unsigned NumBytes1 = Op1->getMemoryVT().getStoreSize();
 
   // Check for BaseIndexOffset matching.
   BaseIndexOffset BasePtr0 = BaseIndexOffset::match(Op0, DAG);
   BaseIndexOffset BasePtr1 = BaseIndexOffset::match(Op1, DAG);
   int64_t PtrDiff;
   if (BasePtr0.getBase().getNode() && BasePtr1.getBase().getNode()) {
     // Same base and index: the accesses overlap unless one byte range ends at
     // or before the start of the other.
     // NOTE(review): PtrDiff is presumably the offset of Op1 relative to Op0;
     // inferred from the interval test below, confirm against equalBaseIndex.
     if (BasePtr0.equalBaseIndex(BasePtr1, DAG, PtrDiff))
       return !((NumBytes0 <= PtrDiff) || (PtrDiff + NumBytes1 <= 0));
 
     // If both BasePtr0 and BasePtr1 are FrameIndexes, we will not be
     // able to calculate their relative offset if at least one arises
     // from an alloca. However, these allocas cannot overlap and we
     // can infer there is no alias.
     if (auto *A = dyn_cast<FrameIndexSDNode>(BasePtr0.getBase()))
       if (auto *B = dyn_cast<FrameIndexSDNode>(BasePtr1.getBase())) {
         MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
         // Only conclude "no alias" for two distinct frame index nodes of
         // which at least one is not a fixed object. If the bases are the
         // same frame index but we couldn't find a constant offset above
         // (the index parts are different), be conservative and fall through.
         if (A != B && (!MFI.isFixedObjectIndex(A->getIndex()) ||
                        !MFI.isFixedObjectIndex(B->getIndex())))
           return false;
       }
 
     // Classify each base as a frame index, global address or constant pool
     // entry.
     bool IsFI0 = isa<FrameIndexSDNode>(BasePtr0.getBase());
     bool IsFI1 = isa<FrameIndexSDNode>(BasePtr1.getBase());
     bool IsGV0 = isa<GlobalAddressSDNode>(BasePtr0.getBase());
     bool IsGV1 = isa<GlobalAddressSDNode>(BasePtr1.getBase());
     bool IsCV0 = isa<ConstantPoolSDNode>(BasePtr0.getBase());
     bool IsCV1 = isa<ConstantPoolSDNode>(BasePtr1.getBase());
 
     // If of mismatched base types or checkable indices we can check
     // they do not alias. Both bases must be one of the three kinds above, and
     // either the index parts are equal or the base kinds differ.
     if ((BasePtr0.getIndex() == BasePtr1.getIndex() || (IsFI0 != IsFI1) ||
          (IsGV0 != IsGV1) || (IsCV0 != IsCV1)) &&
         (IsFI0 || IsGV0 || IsCV0) && (IsFI1 || IsGV1 || IsCV1))
       return false;
   }
 
   // If we know required SrcValue1 and SrcValue2 have relatively large
   // alignment compared to the size and offset of the access, we may be able
   // to prove they do not alias. This check is conservative for now to catch
   // cases created by splitting vector types.
   int64_t SrcValOffset0 = Op0->getSrcValueOffset();
   int64_t SrcValOffset1 = Op1->getSrcValueOffset();
   unsigned OrigAlignment0 = Op0->getOriginalAlignment();
   unsigned OrigAlignment1 = Op1->getOriginalAlignment();
   // Only applies to equally sized accesses with equal original alignment that
   // is larger than the access size, at different source value offsets.
   if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
       NumBytes0 == NumBytes1 && OrigAlignment0 > NumBytes0) {
     // Position of each access within its alignment-sized window.
     int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0;
     int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1;
 
     // There is no overlap between these relatively aligned accesses of
     // similar size. Return no alias.
     if ((OffAlign0 + NumBytes0) <= OffAlign1 ||
         (OffAlign1 + NumBytes1) <= OffAlign0)
       return false;
   }
 
   // An explicit CombinerGlobalAA command-line setting takes precedence over
   // the subtarget's preference for using alias analysis.
   bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0
                    ? CombinerGlobalAA
                    : DAG.getSubtarget().useAA();
 #ifndef NDEBUG
   // Debugging aid (asserts builds only): when CombinerAAOnlyFunc is given,
   // alias analysis is used only in the function with that name.
   if (CombinerAAOnlyFunc.getNumOccurrences() &&
       CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
     UseAA = false;
 #endif
 
   // Alias analysis needs an IR value for both memory operands.
   if (UseAA && AA &&
       Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) {
     // Use alias analysis information. The location sizes are measured from
     // the smaller of the two source value offsets, so each covers its access
     // plus the gap from that common starting point.
     int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
     int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset;
     int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset;
     AliasResult AAResult =
         AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0,
                                  UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
                   MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1,
                                  UseTBAA ? Op1->getAAInfo() : AAMDNodes()) );
     if (AAResult == NoAlias)
       return false;
   }
 
   // Otherwise we have to assume they alias.
   return true;
 }
 
 /// Walk up the chain starting at \p OriginalChain, stepping over memory nodes
 /// that do not alias \p N, and append to \p Aliases every chain value at which
 /// the walk has to stop. If the walk gets deeper than the target's limit,
 /// \p Aliases is reset to hold just \p OriginalChain.
 void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
                                    SmallVectorImpl<SDValue> &Aliases) {
   SmallVector<SDValue, 8> Pending;  // Chains still to examine (used as stack).
   SmallPtrSet<SDNode *, 16> Seen;   // Nodes that were already examined.
 
   // A pair of non-volatile loads is always walked past without consulting
   // isAlias, so record whether N is such a load.
   bool IsLoad = isa<LoadSDNode>(N) && !cast<LSBaseSDNode>(N)->isVolatile();
 
   Pending.push_back(OriginalChain);
   unsigned Depth = 0;
 
   // Examine each pending chain: if it may alias N it goes on the aliases list,
   // otherwise keep walking up from it looking for the next candidate.
   while (!Pending.empty()) {
     SDValue Chain = Pending.pop_back_val();
 
     // Give up once the walk has become too deep and fall back to the chain we
     // started from.
     //
     // FIXME: The depth check could be made to return the last non-aliasing
     // chain we found before we hit a tokenfactor rather than the original
     // chain.
     if (Depth > TLI.getGatherAllAliasesMaxDepth()) {
       Aliases.clear();
       Aliases.push_back(OriginalChain);
       return;
     }
 
     // Skip nodes that were reached before.
     if (!Seen.insert(Chain.getNode()).second)
       continue;
 
     switch (Chain.getOpcode()) {
     case ISD::EntryToken:
       // Entry token is ideal chain operand, but handled in FindBetterChain.
       break;
 
     case ISD::LOAD:
     case ISD::STORE: {
       SDNode *ChainNode = Chain.getNode();
       bool IsOpLoad = isa<LoadSDNode>(ChainNode) &&
                       !cast<LSBaseSDNode>(ChainNode)->isVolatile();
 
       // Two non-volatile loads, or accesses proven not to alias, do not stop
       // the walk: continue with the node's own chain operand.
       if ((IsLoad && IsOpLoad) ||
           !isAlias(cast<LSBaseSDNode>(N), cast<LSBaseSDNode>(ChainNode))) {
         Pending.push_back(Chain.getOperand(0));
         ++Depth;
       } else {
         // This node may alias N, so the walk stops here.
         Aliases.push_back(Chain);
       }
       break;
     }
 
     case ISD::TokenFactor: {
       // Only "small" token factors are looked through; larger ones are taken
       // as they are.
       unsigned NumOps = Chain.getNumOperands();
       if (NumOps > 16) {
         Aliases.push_back(Chain);
         break;
       }
       // Pushing the operands in reverse keeps their original order when they
       // are popped off the stack, which increases the likelihood that getNode
       // will find a matching token factor (CSE.)
       for (unsigned Idx = NumOps; Idx != 0; --Idx)
         Pending.push_back(Chain.getOperand(Idx - 1));
       ++Depth;
       break;
     }
 
     case ISD::CopyFromReg:
       // Forward past CopyFromReg.
       Pending.push_back(Chain.getOperand(0));
       ++Depth;
       break;
 
     default:
       // For all other instructions we will just have to take what we can get.
       Aliases.push_back(Chain);
       break;
     }
   }
 }
 
 /// Compute a chain for \p N that depends only on the nodes found by
 /// GatherAllAliases when walking up from \p OldChain. Returns \p OldChain
 /// unchanged when optimizations are disabled.
 SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
   if (OptLevel == CodeGenOpt::None)
     return OldChain;
 
   // Collect every chain the node still has to be ordered after.
   SmallVector<SDValue, 8> Aliases;
   GatherAllAliases(N, OldChain, Aliases);
 
   switch (Aliases.size()) {
   case 0:
     // Nothing to order against: chain directly to the entry token.
     return DAG.getEntryNode();
   case 1:
     // A single dependency is used as-is.  We don't need to revisit it.
     return Aliases[0];
   default:
     // Several dependencies: join them with a custom tailored token factor.
     return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases);
   }
 }
 
 // Collect a group of potentially interesting stores reachable from St and
 // improve all of their chains in one go. This may look redundant, since the
 // function runs for every store node that is visited, so the work could be
 // left to each store's own visit.
 //
 // The believed reason for doing it up-front is that MergeConsecutiveStores
 // is unable to deal with merging stores of different sizes: unless the
 // chains of all potential candidates are improved before it runs, it might
 // only see some of the nodes that will eventually be candidates, and then
 // not be able to go from a partially-merged state to the desired final
 // fully-merged state.
 //
 // Returns true if the chain of St itself was replaced.
 bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
   if (OptLevel == CodeGenOpt::None)
     return false;
 
   // Decompose the store address into base pointer, index and byte offset.
   BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
 
   // A base is required, and it must not be undef.
   if (!BasePtr.getBase().getNode())
     return false;
   if (BasePtr.getBase().isUndef())
     return false;
 
   SmallVector<StoreSDNode *, 8> ChainedStores;
   ChainedStores.push_back(St);
 
   // Walk up the chain collecting stores. The walk continues from a store only
   // while it shares St's base and index; it stops at any other kind of node.
   StoreSDNode *Cur = St;
   while (Cur) {
     // If the chain has more than one use, then we can't reorder the mem ops.
     if (Cur != St && !SDValue(Cur, 0)->hasOneUse())
       break;
 
     if (Cur->isVolatile() || Cur->isIndexed())
       break;
 
     // The address of this store has to match the original base and index.
     BaseIndexOffset Ptr = BaseIndexOffset::match(Cur, DAG);
     if (!BasePtr.equalBaseIndex(Ptr, DAG))
       break;
 
     // Step over any intermediate loads on the way up the chain.
     SDNode *Up = Cur->getChain().getNode();
     while (auto *Ld = dyn_cast<LoadSDNode>(Up))
       Up = Ld->getChain().getNode();
 
     // Anything other than a plain (non-volatile, unindexed) store ends the
     // walk.
     auto *PrevStore = dyn_cast<StoreSDNode>(Up);
     if (!PrevStore || PrevStore->isVolatile() || PrevStore->isIndexed())
       break;
 
     ChainedStores.push_back(PrevStore);
     Cur = PrevStore;
   }
 
   // ChainedStores now holds every store reached by the walk above. For each
   // of them, try to find an earlier chain to attach the store to which won't
   // violate the required ordering.
   bool MadeChangeToSt = false;
   SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains;
 
   for (StoreSDNode *Candidate : ChainedStores) {
     SDValue OldChain = Candidate->getChain();
     SDValue NewChain = FindBetterChain(Candidate, OldChain);
     if (OldChain == NewChain)
       continue;
 
     MadeChangeToSt |= (Candidate == St);
     BetterChains.emplace_back(Candidate, NewChain);
   }
 
   // Apply the replacements only after all of them have been computed, to
   // avoid making the chains more complicated by introducing new TokenFactors
   // while still searching.
   for (const auto &Replacement : BetterChains)
     replaceStoreChain(Replacement.first, Replacement.second);
 
   return MadeChangeToSt;
 }
 
 /// Entry point of this file: run the DAG combiner over this SelectionDAG at
 /// the given combine level.
 void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
                            CodeGenOpt::Level OptLevel) {
   // Build a combiner for this DAG and hand control to its main driver.
   DAGCombiner Combiner(*this, AA, OptLevel);
   Combiner.Run(Level);
 }
Index: head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp	(revision 344055)
+++ head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp	(revision 344056)
@@ -1,11574 +1,11576 @@
 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file implements the AArch64TargetLowering class.
 //
 //===----------------------------------------------------------------------===//
 
 #include "AArch64ISelLowering.h"
 #include "AArch64CallingConvention.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64PerfectShuffle.h"
 #include "AArch64RegisterInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetCallingConv.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/OperandTraits.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
 #include "llvm/IR/Value.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MachineValueType.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <algorithm>
 #include <bitset>
 #include <cassert>
 #include <cctype>
 #include <cstdint>
 #include <cstdlib>
 #include <iterator>
 #include <limits>
 #include <tuple>
 #include <utility>
 #include <vector>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "aarch64-lower"
 
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
 
 // Hidden command-line flag, off by default, that allows formation of AArch64
 // SLI/SRI instructions.
 static cl::opt<bool>
 EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
                            cl::desc("Allow AArch64 SLI/SRI formation"),
                            cl::init(false));
 
 // Hidden command-line flag, off by default, that allows Local Dynamic TLS
 // code generation. Unlike the other options here it is not declared static.
 //
 // FIXME: The necessary dtprel relocations don't seem to be supported
 // well in the GNU bfd and gold linkers at the moment. Therefore, by
 // default, for now, fall back to GeneralDynamic code generation.
 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
     "aarch64-elf-ldtls-generation", cl::Hidden,
     cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
     cl::init(false));
 
 // Hidden command-line flag, on by default, that enables the logical
 // immediate instruction optimization.
 static cl::opt<bool>
 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
                          cl::desc("Enable AArch64 logical imm instruction "
                                   "optimization"),
                          cl::init(true));
 
 /// Value type used for condition codes.
 static const MVT MVT_CC = MVT::i32;
 
 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
                                              const AArch64Subtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
   // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
   // we have to make something up. Arbitrarily, choose ZeroOrOne.
   setBooleanContents(ZeroOrOneBooleanContent);
   // When comparing vectors the result sets the different elements in the
   // vector to all-one or all-zero.
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
   // Set up the register classes.
   addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
   addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
 
   if (Subtarget->hasFPARMv8()) {
     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
     addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
     addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
     addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
   }
 
   if (Subtarget->hasNEON()) {
     addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
     addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
     // Someone set us up the NEON.
     addDRTypeForNEON(MVT::v2f32);
     addDRTypeForNEON(MVT::v8i8);
     addDRTypeForNEON(MVT::v4i16);
     addDRTypeForNEON(MVT::v2i32);
     addDRTypeForNEON(MVT::v1i64);
     addDRTypeForNEON(MVT::v1f64);
     addDRTypeForNEON(MVT::v4f16);
 
     addQRTypeForNEON(MVT::v4f32);
     addQRTypeForNEON(MVT::v2f64);
     addQRTypeForNEON(MVT::v16i8);
     addQRTypeForNEON(MVT::v8i16);
     addQRTypeForNEON(MVT::v4i32);
     addQRTypeForNEON(MVT::v2i64);
     addQRTypeForNEON(MVT::v8f16);
   }
 
   // Compute derived properties from the register classes
   computeRegisterProperties(Subtarget->getRegisterInfo());
 
   // Provide all sorts of operation actions
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
   setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
   setOperationAction(ISD::SETCC, MVT::i32, Custom);
   setOperationAction(ISD::SETCC, MVT::i64, Custom);
   setOperationAction(ISD::SETCC, MVT::f16, Custom);
   setOperationAction(ISD::SETCC, MVT::f32, Custom);
   setOperationAction(ISD::SETCC, MVT::f64, Custom);
   setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
   setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
   setOperationAction(ISD::BR_CC, MVT::f16, Custom);
   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
   setOperationAction(ISD::SELECT, MVT::i32, Custom);
   setOperationAction(ISD::SELECT, MVT::i64, Custom);
   setOperationAction(ISD::SELECT, MVT::f16, Custom);
   setOperationAction(ISD::SELECT, MVT::f32, Custom);
   setOperationAction(ISD::SELECT, MVT::f64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
 
   setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
   setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
   setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
 
   setOperationAction(ISD::FREM, MVT::f32, Expand);
   setOperationAction(ISD::FREM, MVT::f64, Expand);
   setOperationAction(ISD::FREM, MVT::f80, Expand);
 
   setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
 
   // Custom lowering hooks are needed for XOR
   // to fold it into CSINC/CSINV.
   setOperationAction(ISD::XOR, MVT::i32, Custom);
   setOperationAction(ISD::XOR, MVT::i64, Custom);
 
   // Virtually no operation on f128 is legal, but LLVM can't expand them when
   // there's a valid register class, so we need custom operations in most cases.
   setOperationAction(ISD::FABS, MVT::f128, Expand);
   setOperationAction(ISD::FADD, MVT::f128, Custom);
   setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
   setOperationAction(ISD::FCOS, MVT::f128, Expand);
   setOperationAction(ISD::FDIV, MVT::f128, Custom);
   setOperationAction(ISD::FMA, MVT::f128, Expand);
   setOperationAction(ISD::FMUL, MVT::f128, Custom);
   setOperationAction(ISD::FNEG, MVT::f128, Expand);
   setOperationAction(ISD::FPOW, MVT::f128, Expand);
   setOperationAction(ISD::FREM, MVT::f128, Expand);
   setOperationAction(ISD::FRINT, MVT::f128, Expand);
   setOperationAction(ISD::FSIN, MVT::f128, Expand);
   setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
   setOperationAction(ISD::FSQRT, MVT::f128, Expand);
   setOperationAction(ISD::FSUB, MVT::f128, Custom);
   setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
   setOperationAction(ISD::SETCC, MVT::f128, Custom);
   setOperationAction(ISD::BR_CC, MVT::f128, Custom);
   setOperationAction(ISD::SELECT, MVT::f128, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
   setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
 
   // Lowering for many of the conversions is actually specified by the non-f128
   // type. The LowerXXX function will be trivial when f128 isn't involved.
   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
   setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
   setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
   setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
   setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
   setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
   setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
   setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
   setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
   setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
   setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
 
   // Variable arguments.
   setOperationAction(ISD::VASTART, MVT::Other, Custom);
   setOperationAction(ISD::VAARG, MVT::Other, Custom);
   setOperationAction(ISD::VACOPY, MVT::Other, Custom);
   setOperationAction(ISD::VAEND, MVT::Other, Expand);
 
   // Variable-sized objects.
   setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
 
   if (Subtarget->isTargetWindows())
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
   else
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
 
   // Constant pool entries
   setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
 
   // BlockAddress
   setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
 
   // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
   setOperationAction(ISD::ADDC, MVT::i32, Custom);
   setOperationAction(ISD::ADDE, MVT::i32, Custom);
   setOperationAction(ISD::SUBC, MVT::i32, Custom);
   setOperationAction(ISD::SUBE, MVT::i32, Custom);
   setOperationAction(ISD::ADDC, MVT::i64, Custom);
   setOperationAction(ISD::ADDE, MVT::i64, Custom);
   setOperationAction(ISD::SUBC, MVT::i64, Custom);
   setOperationAction(ISD::SUBE, MVT::i64, Custom);
 
   // AArch64 lacks both left-rotate and popcount instructions.
   setOperationAction(ISD::ROTL, MVT::i32, Expand);
   setOperationAction(ISD::ROTL, MVT::i64, Expand);
   for (MVT VT : MVT::vector_valuetypes()) {
     setOperationAction(ISD::ROTL, VT, Expand);
     setOperationAction(ISD::ROTR, VT, Expand);
   }
 
   // AArch64 doesn't have {U|S}MUL_LOHI.
   setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 
   setOperationAction(ISD::CTPOP, MVT::i32, Custom);
   setOperationAction(ISD::CTPOP, MVT::i64, Custom);
 
   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
   setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
   for (MVT VT : MVT::vector_valuetypes()) {
     setOperationAction(ISD::SDIVREM, VT, Expand);
     setOperationAction(ISD::UDIVREM, VT, Expand);
   }
   setOperationAction(ISD::SREM, MVT::i32, Expand);
   setOperationAction(ISD::SREM, MVT::i64, Expand);
   setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
   setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
   setOperationAction(ISD::UREM, MVT::i32, Expand);
   setOperationAction(ISD::UREM, MVT::i64, Expand);
 
   // Custom lower Add/Sub/Mul with overflow.
   setOperationAction(ISD::SADDO, MVT::i32, Custom);
   setOperationAction(ISD::SADDO, MVT::i64, Custom);
   setOperationAction(ISD::UADDO, MVT::i32, Custom);
   setOperationAction(ISD::UADDO, MVT::i64, Custom);
   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
   setOperationAction(ISD::SSUBO, MVT::i64, Custom);
   setOperationAction(ISD::USUBO, MVT::i32, Custom);
   setOperationAction(ISD::USUBO, MVT::i64, Custom);
   setOperationAction(ISD::SMULO, MVT::i32, Custom);
   setOperationAction(ISD::SMULO, MVT::i64, Custom);
   setOperationAction(ISD::UMULO, MVT::i32, Custom);
   setOperationAction(ISD::UMULO, MVT::i64, Custom);
 
   setOperationAction(ISD::FSIN, MVT::f32, Expand);
   setOperationAction(ISD::FSIN, MVT::f64, Expand);
   setOperationAction(ISD::FCOS, MVT::f32, Expand);
   setOperationAction(ISD::FCOS, MVT::f64, Expand);
   setOperationAction(ISD::FPOW, MVT::f32, Expand);
   setOperationAction(ISD::FPOW, MVT::f64, Expand);
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
   if (Subtarget->hasFullFP16())
     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
   else
     setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
 
   setOperationAction(ISD::FREM,    MVT::f16,   Promote);
   setOperationAction(ISD::FREM,    MVT::v4f16, Promote);
   setOperationAction(ISD::FREM,    MVT::v8f16, Promote);
   setOperationAction(ISD::FPOW,    MVT::f16,   Promote);
   setOperationAction(ISD::FPOW,    MVT::v4f16, Promote);
   setOperationAction(ISD::FPOW,    MVT::v8f16, Promote);
   setOperationAction(ISD::FPOWI,   MVT::f16,   Promote);
   setOperationAction(ISD::FCOS,    MVT::f16,   Promote);
   setOperationAction(ISD::FCOS,    MVT::v4f16, Promote);
   setOperationAction(ISD::FCOS,    MVT::v8f16, Promote);
   setOperationAction(ISD::FSIN,    MVT::f16,   Promote);
   setOperationAction(ISD::FSIN,    MVT::v4f16, Promote);
   setOperationAction(ISD::FSIN,    MVT::v8f16, Promote);
   setOperationAction(ISD::FSINCOS, MVT::f16,   Promote);
   setOperationAction(ISD::FSINCOS, MVT::v4f16, Promote);
   setOperationAction(ISD::FSINCOS, MVT::v8f16, Promote);
   setOperationAction(ISD::FEXP,    MVT::f16,   Promote);
   setOperationAction(ISD::FEXP,    MVT::v4f16, Promote);
   setOperationAction(ISD::FEXP,    MVT::v8f16, Promote);
   setOperationAction(ISD::FEXP2,   MVT::f16,   Promote);
   setOperationAction(ISD::FEXP2,   MVT::v4f16, Promote);
   setOperationAction(ISD::FEXP2,   MVT::v8f16, Promote);
   setOperationAction(ISD::FLOG,    MVT::f16,   Promote);
   setOperationAction(ISD::FLOG,    MVT::v4f16, Promote);
   setOperationAction(ISD::FLOG,    MVT::v8f16, Promote);
   setOperationAction(ISD::FLOG2,   MVT::f16,   Promote);
   setOperationAction(ISD::FLOG2,   MVT::v4f16, Promote);
   setOperationAction(ISD::FLOG2,   MVT::v8f16, Promote);
   setOperationAction(ISD::FLOG10,  MVT::f16,   Promote);
   setOperationAction(ISD::FLOG10,  MVT::v4f16, Promote);
   setOperationAction(ISD::FLOG10,  MVT::v8f16, Promote);
 
   if (!Subtarget->hasFullFP16()) {
     setOperationAction(ISD::SELECT,      MVT::f16,  Promote);
     setOperationAction(ISD::SELECT_CC,   MVT::f16,  Promote);
     setOperationAction(ISD::SETCC,       MVT::f16,  Promote);
     setOperationAction(ISD::BR_CC,       MVT::f16,  Promote);
     setOperationAction(ISD::FADD,        MVT::f16,  Promote);
     setOperationAction(ISD::FSUB,        MVT::f16,  Promote);
     setOperationAction(ISD::FMUL,        MVT::f16,  Promote);
     setOperationAction(ISD::FDIV,        MVT::f16,  Promote);
     setOperationAction(ISD::FMA,         MVT::f16,  Promote);
     setOperationAction(ISD::FNEG,        MVT::f16,  Promote);
     setOperationAction(ISD::FABS,        MVT::f16,  Promote);
     setOperationAction(ISD::FCEIL,       MVT::f16,  Promote);
     setOperationAction(ISD::FSQRT,       MVT::f16,  Promote);
     setOperationAction(ISD::FFLOOR,      MVT::f16,  Promote);
     setOperationAction(ISD::FNEARBYINT,  MVT::f16,  Promote);
     setOperationAction(ISD::FRINT,       MVT::f16,  Promote);
     setOperationAction(ISD::FROUND,      MVT::f16,  Promote);
     setOperationAction(ISD::FTRUNC,      MVT::f16,  Promote);
     setOperationAction(ISD::FMINNUM,     MVT::f16,  Promote);
     setOperationAction(ISD::FMAXNUM,     MVT::f16,  Promote);
     setOperationAction(ISD::FMINNAN,     MVT::f16,  Promote);
     setOperationAction(ISD::FMAXNAN,     MVT::f16,  Promote);
 
     // promote v4f16 to v4f32 when that is known to be safe.
     setOperationAction(ISD::FADD,        MVT::v4f16, Promote);
     setOperationAction(ISD::FSUB,        MVT::v4f16, Promote);
     setOperationAction(ISD::FMUL,        MVT::v4f16, Promote);
     setOperationAction(ISD::FDIV,        MVT::v4f16, Promote);
     setOperationAction(ISD::FP_EXTEND,   MVT::v4f16, Promote);
     setOperationAction(ISD::FP_ROUND,    MVT::v4f16, Promote);
     AddPromotedToType(ISD::FADD,         MVT::v4f16, MVT::v4f32);
     AddPromotedToType(ISD::FSUB,         MVT::v4f16, MVT::v4f32);
     AddPromotedToType(ISD::FMUL,         MVT::v4f16, MVT::v4f32);
     AddPromotedToType(ISD::FDIV,         MVT::v4f16, MVT::v4f32);
     AddPromotedToType(ISD::FP_EXTEND,    MVT::v4f16, MVT::v4f32);
     AddPromotedToType(ISD::FP_ROUND,     MVT::v4f16, MVT::v4f32);
 
     setOperationAction(ISD::FABS,        MVT::v4f16, Expand);
     setOperationAction(ISD::FNEG,        MVT::v4f16, Expand);
     setOperationAction(ISD::FROUND,      MVT::v4f16, Expand);
     setOperationAction(ISD::FMA,         MVT::v4f16, Expand);
     setOperationAction(ISD::SETCC,       MVT::v4f16, Expand);
     setOperationAction(ISD::BR_CC,       MVT::v4f16, Expand);
     setOperationAction(ISD::SELECT,      MVT::v4f16, Expand);
     setOperationAction(ISD::SELECT_CC,   MVT::v4f16, Expand);
     setOperationAction(ISD::FTRUNC,      MVT::v4f16, Expand);
     setOperationAction(ISD::FCOPYSIGN,   MVT::v4f16, Expand);
     setOperationAction(ISD::FFLOOR,      MVT::v4f16, Expand);
     setOperationAction(ISD::FCEIL,       MVT::v4f16, Expand);
     setOperationAction(ISD::FRINT,       MVT::v4f16, Expand);
     setOperationAction(ISD::FNEARBYINT,  MVT::v4f16, Expand);
     setOperationAction(ISD::FSQRT,       MVT::v4f16, Expand);
 
     setOperationAction(ISD::FABS,        MVT::v8f16, Expand);
     setOperationAction(ISD::FADD,        MVT::v8f16, Expand);
     setOperationAction(ISD::FCEIL,       MVT::v8f16, Expand);
     setOperationAction(ISD::FCOPYSIGN,   MVT::v8f16, Expand);
     setOperationAction(ISD::FDIV,        MVT::v8f16, Expand);
     setOperationAction(ISD::FFLOOR,      MVT::v8f16, Expand);
     setOperationAction(ISD::FMA,         MVT::v8f16, Expand);
     setOperationAction(ISD::FMUL,        MVT::v8f16, Expand);
     setOperationAction(ISD::FNEARBYINT,  MVT::v8f16, Expand);
     setOperationAction(ISD::FNEG,        MVT::v8f16, Expand);
     setOperationAction(ISD::FROUND,      MVT::v8f16, Expand);
     setOperationAction(ISD::FRINT,       MVT::v8f16, Expand);
     setOperationAction(ISD::FSQRT,       MVT::v8f16, Expand);
     setOperationAction(ISD::FSUB,        MVT::v8f16, Expand);
     setOperationAction(ISD::FTRUNC,      MVT::v8f16, Expand);
     setOperationAction(ISD::SETCC,       MVT::v8f16, Expand);
     setOperationAction(ISD::BR_CC,       MVT::v8f16, Expand);
     setOperationAction(ISD::SELECT,      MVT::v8f16, Expand);
     setOperationAction(ISD::SELECT_CC,   MVT::v8f16, Expand);
     setOperationAction(ISD::FP_EXTEND,   MVT::v8f16, Expand);
   }
 
   // AArch64 has implementations of a lot of rounding-like FP operations.
   for (MVT Ty : {MVT::f32, MVT::f64}) {
     setOperationAction(ISD::FFLOOR, Ty, Legal);
     setOperationAction(ISD::FNEARBYINT, Ty, Legal);
     setOperationAction(ISD::FCEIL, Ty, Legal);
     setOperationAction(ISD::FRINT, Ty, Legal);
     setOperationAction(ISD::FTRUNC, Ty, Legal);
     setOperationAction(ISD::FROUND, Ty, Legal);
     setOperationAction(ISD::FMINNUM, Ty, Legal);
     setOperationAction(ISD::FMAXNUM, Ty, Legal);
     setOperationAction(ISD::FMINNAN, Ty, Legal);
     setOperationAction(ISD::FMAXNAN, Ty, Legal);
   }
 
   if (Subtarget->hasFullFP16()) {
     setOperationAction(ISD::FNEARBYINT, MVT::f16, Legal);
     setOperationAction(ISD::FFLOOR,  MVT::f16, Legal);
     setOperationAction(ISD::FCEIL,   MVT::f16, Legal);
     setOperationAction(ISD::FRINT,   MVT::f16, Legal);
     setOperationAction(ISD::FTRUNC,  MVT::f16, Legal);
     setOperationAction(ISD::FROUND,  MVT::f16, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
     setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
     setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
   }
 
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
   setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
 
   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
   setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
   setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
 
   // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
   // This requires the Performance Monitors extension.
   if (Subtarget->hasPerfMon())
     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
 
   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
     // Issue __sincos_stret if available.
     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   } else {
     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
   }
 
   // Make floating-point constants legal for the large code model, so they don't
   // become loads from the constant pool.
   if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
     setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
     setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
   }
 
   // AArch64 does not have floating-point extending loads, i1 sign-extending
   // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
   for (MVT VT : MVT::fp_valuetypes()) {
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
   }
   for (MVT VT : MVT::integer_valuetypes())
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
 
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
   setTruncStoreAction(MVT::f128, MVT::f80, Expand);
   setTruncStoreAction(MVT::f128, MVT::f64, Expand);
   setTruncStoreAction(MVT::f128, MVT::f32, Expand);
   setTruncStoreAction(MVT::f128, MVT::f16, Expand);
 
   setOperationAction(ISD::BITCAST, MVT::i16, Custom);
   setOperationAction(ISD::BITCAST, MVT::f16, Custom);
 
   // Indexed loads and stores are supported.
   for (unsigned im = (unsigned)ISD::PRE_INC;
        im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
     setIndexedLoadAction(im, MVT::i8, Legal);
     setIndexedLoadAction(im, MVT::i16, Legal);
     setIndexedLoadAction(im, MVT::i32, Legal);
     setIndexedLoadAction(im, MVT::i64, Legal);
     setIndexedLoadAction(im, MVT::f64, Legal);
     setIndexedLoadAction(im, MVT::f32, Legal);
     setIndexedLoadAction(im, MVT::f16, Legal);
     setIndexedStoreAction(im, MVT::i8, Legal);
     setIndexedStoreAction(im, MVT::i16, Legal);
     setIndexedStoreAction(im, MVT::i32, Legal);
     setIndexedStoreAction(im, MVT::i64, Legal);
     setIndexedStoreAction(im, MVT::f64, Legal);
     setIndexedStoreAction(im, MVT::f32, Legal);
     setIndexedStoreAction(im, MVT::f16, Legal);
   }
 
   // Trap.
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
   // We combine OR nodes for bitfield operations.
   setTargetDAGCombine(ISD::OR);
 
   // Vector add and sub nodes may conceal a high-half opportunity.
   // Also, try to fold ADD into CSINC/CSINV.
   setTargetDAGCombine(ISD::ADD);
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::SRL);
   setTargetDAGCombine(ISD::XOR);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::UINT_TO_FP);
 
   setTargetDAGCombine(ISD::FP_TO_SINT);
   setTargetDAGCombine(ISD::FP_TO_UINT);
   setTargetDAGCombine(ISD::FDIV);
 
   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::BITCAST);
   setTargetDAGCombine(ISD::CONCAT_VECTORS);
   setTargetDAGCombine(ISD::STORE);
   if (Subtarget->supportsAddressTopByteIgnored())
     setTargetDAGCombine(ISD::LOAD);
 
   setTargetDAGCombine(ISD::MUL);
 
   setTargetDAGCombine(ISD::SELECT);
   setTargetDAGCombine(ISD::VSELECT);
 
   setTargetDAGCombine(ISD::INTRINSIC_VOID);
   setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 
   setTargetDAGCombine(ISD::GlobalAddress);
 
   // In case of strict alignment, avoid an excessive number of byte wide stores.
   MaxStoresPerMemsetOptSize = 8;
   MaxStoresPerMemset = Subtarget->requiresStrictAlign()
                        ? MaxStoresPerMemsetOptSize : 32;
 
   MaxGluedStoresPerMemcpy = 4;
   MaxStoresPerMemcpyOptSize = 4;
   MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
                        ? MaxStoresPerMemcpyOptSize : 16;
 
   MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
 
   setStackPointerRegisterToSaveRestore(AArch64::SP);
 
   setSchedulingPreference(Sched::Hybrid);
 
   EnableExtLdPromotion = true;
 
   // Set required alignment.
   setMinFunctionAlignment(2);
   // Set preferred alignments.
   setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
   setPrefLoopAlignment(STI.getPrefLoopAlignment());
 
   // Only change the limit for entries in a jump table if specified by
   // the subtarget, but not at the command line.
   unsigned MaxJT = STI.getMaximumJumpTableSize();
   if (MaxJT && getMaximumJumpTableSize() == 0)
     setMaximumJumpTableSize(MaxJT);
 
   setHasExtractBitsInsn(true);
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 
   if (Subtarget->hasNEON()) {
     // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
     // silliness like this:
     setOperationAction(ISD::FABS, MVT::v1f64, Expand);
     setOperationAction(ISD::FADD, MVT::v1f64, Expand);
     setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
     setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
     setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
     setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
     setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
     setOperationAction(ISD::FMA, MVT::v1f64, Expand);
     setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
     setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
     setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
     setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
     setOperationAction(ISD::FREM, MVT::v1f64, Expand);
     setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
     setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
     setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
     setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
     setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
     setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
     setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
     setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
     setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
     setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
     setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
     setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
 
     setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
     setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
     setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
     setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
     setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
 
     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
 
     // AArch64 doesn't have direct vector -> f32 conversion instructions for
     // elements smaller than i32, so promote the input to i32 first.
     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
     // i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
     // -> v8f16 conversions.
     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
     setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
     setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
     // Similarly, there is no direct i32 -> f64 vector conversion instruction.
     setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
     setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
     // Nor is there a direct i32 -> f16 vector conversion.  Set it to custom, so
     // the conversion happens in two steps: v4i32 -> v4f32 -> v4f16
     setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
 
     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
 
     setOperationAction(ISD::CTTZ,       MVT::v2i8,  Expand);
     setOperationAction(ISD::CTTZ,       MVT::v4i16, Expand);
     setOperationAction(ISD::CTTZ,       MVT::v2i32, Expand);
     setOperationAction(ISD::CTTZ,       MVT::v1i64, Expand);
     setOperationAction(ISD::CTTZ,       MVT::v16i8, Expand);
     setOperationAction(ISD::CTTZ,       MVT::v8i16, Expand);
     setOperationAction(ISD::CTTZ,       MVT::v4i32, Expand);
     setOperationAction(ISD::CTTZ,       MVT::v2i64, Expand);
 
     // AArch64 doesn't have MUL.2d:
     setOperationAction(ISD::MUL, MVT::v2i64, Expand);
     // Custom handling for some quad-vector types to detect MULL.
     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
 
     // Vector reductions
     for (MVT VT : MVT::integer_valuetypes()) {
       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
       setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
     }
     for (MVT VT : MVT::fp_valuetypes()) {
       setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
     }
 
     setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
     setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
     // Likewise, narrowing and extending vector loads/stores aren't handled
     // directly.
     for (MVT VT : MVT::vector_valuetypes()) {
       setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
 
       if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
         setOperationAction(ISD::MULHS, VT, Custom);
         setOperationAction(ISD::MULHU, VT, Custom);
       } else {
         setOperationAction(ISD::MULHS, VT, Expand);
         setOperationAction(ISD::MULHU, VT, Expand);
       }
       setOperationAction(ISD::SMUL_LOHI, VT, Expand);
       setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 
       setOperationAction(ISD::BSWAP, VT, Expand);
 
       for (MVT InnerVT : MVT::vector_valuetypes()) {
         setTruncStoreAction(VT, InnerVT, Expand);
         setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
         setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
         setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
       }
     }
 
     // AArch64 has implementations of a lot of rounding-like FP operations.
     for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
       setOperationAction(ISD::FFLOOR, Ty, Legal);
       setOperationAction(ISD::FNEARBYINT, Ty, Legal);
       setOperationAction(ISD::FCEIL, Ty, Legal);
       setOperationAction(ISD::FRINT, Ty, Legal);
       setOperationAction(ISD::FTRUNC, Ty, Legal);
       setOperationAction(ISD::FROUND, Ty, Legal);
     }
 
     setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 }
 
 /// Register the operation actions that apply to the NEON vector type \p VT.
 ///
 /// \param VT                 The vector type being configured; must be a
 ///                           vector (asserted).
 /// \param PromotedBitwiseVT  Not referenced anywhere in this function body.
 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
   assert(VT.isVector() && "VT should be a vector type");
 
   const bool IsFP = VT.isFloatingPoint();
 
   // Floating-point vector loads and stores are promoted to the integer vector
   // type with the same element count and width.
   if (IsFP) {
     const MVT IntVT = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
     for (unsigned Opcode : {ISD::LOAD, ISD::STORE})
       setOperationPromotedToType(Opcode, VT, IntVT);
   }
 
   // Vector float intrinsics are expanded for the f32/f64 element vectors,
   // with the exception of FCOPYSIGN, which is custom-lowered.
   const bool IsF32OrF64Vector =
       VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64;
   if (IsF32OrF64Vector) {
     for (unsigned Opcode : {ISD::FSIN, ISD::FCOS, ISD::FPOW, ISD::FLOG,
                             ISD::FLOG2, ISD::FLOG10, ISD::FEXP, ISD::FEXP2})
       setOperationAction(Opcode, VT, Expand);
 
     setOperationAction(ISD::FCOPYSIGN, VT, Custom);
   }
 
   // Element access, shuffles, shifts, bitwise ops and compares all go through
   // custom lowering.
   for (unsigned Opcode :
        {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT, ISD::BUILD_VECTOR,
         ISD::VECTOR_SHUFFLE, ISD::EXTRACT_SUBVECTOR, ISD::SRA, ISD::SRL,
         ISD::SHL, ISD::AND, ISD::OR, ISD::SETCC})
     setOperationAction(Opcode, VT, Custom);
 
   setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
 
   // Every flavour of select on this type is expanded.
   for (unsigned Opcode : {ISD::SELECT, ISD::SELECT_CC, ISD::VSELECT})
     setOperationAction(Opcode, VT, Expand);
 
   // Any-extending loads that use VT as the memory type are expanded for every
   // possible result type.
   for (MVT ResultVT : MVT::all_valuetypes())
     setLoadExtAction(ISD::EXTLOAD, ResultVT, VT, Expand);
 
   // CNT supports only B element sizes.
   if (!(VT == MVT::v8i8 || VT == MVT::v16i8))
     setOperationAction(ISD::CTPOP, VT, Expand);
 
   // Division and remainder are expanded.
   for (unsigned Opcode :
        {ISD::UDIV, ISD::SDIV, ISD::UREM, ISD::SREM, ISD::FREM})
     setOperationAction(Opcode, VT, Expand);
 
   for (unsigned Opcode : {ISD::FP_TO_SINT, ISD::FP_TO_UINT})
     setOperationAction(Opcode, VT, Custom);
 
   if (!IsFP) {
     setOperationAction(ISD::ABS, VT, Legal);
 
     // [SU][MIN|MAX] are available for all NEON types apart from i64.
     if (VT != MVT::v2i64 && VT != MVT::v1i64)
       for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
         setOperationAction(Opcode, VT, Legal);
   } else if (VT.getVectorElementType() != MVT::f16 ||
              Subtarget->hasFullFP16()) {
     // F[MIN|MAX][NUM|NAN] are legal for FP vectors; f16 element vectors
     // additionally require the subtarget to report full FP16 support.
     for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
                             ISD::FMINNUM, ISD::FMAXNUM})
       setOperationAction(Opcode, VT, Legal);
   }
 
   // Indexed loads and stores are only marked legal on little-endian
   // subtargets.
   if (!Subtarget->isLittleEndian())
     return;
 
   for (unsigned Mode = (unsigned)ISD::PRE_INC;
        Mode != (unsigned)ISD::LAST_INDEXED_MODE; ++Mode) {
     setIndexedLoadAction(Mode, VT, Legal);
     setIndexedStoreAction(Mode, VT, Legal);
   }
 }
 
 /// Register \p VT in the FPR64 register class and apply the common NEON
 /// configuration to it, passing MVT::v2i32 as the promoted bitwise type.
 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
   const auto *RegClass = &AArch64::FPR64RegClass;
   const MVT BitwiseVT = MVT::v2i32;
 
   addRegisterClass(VT, RegClass);
   addTypeForNEON(VT, BitwiseVT);
 }
 
 /// Register \p VT in the FPR128 register class and apply the common NEON
 /// configuration to it, passing MVT::v4i32 as the promoted bitwise type.
 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
   const auto *RegClass = &AArch64::FPR128RegClass;
   const MVT BitwiseVT = MVT::v4i32;
 
   addRegisterClass(VT, RegClass);
   addTypeForNEON(VT, BitwiseVT);
 }
 
 /// Return the result type of a SETCC on operands of type \p VT: the
 /// same-shaped integer vector for vector operands, MVT::i32 for everything
 /// else. The DataLayout and LLVMContext arguments are ignored.
 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
                                               EVT VT) const {
   if (VT.isVector())
     return VT.changeVectorElementTypeToInteger();
 
   // Scalar comparisons produce an i32.
   return MVT::i32;
 }
 
 /// Try to replace the immediate operand of the logical operation \p Op with a
 /// value that AArch64_AM::encodeLogicalImmediate can encode, by choosing
 /// convenient values for the bits that are not set in \p Demanded.
 ///
 /// \param Op        Node being optimized; its operand 0 is reused unchanged in
 ///                  the replacement node.
 /// \param Size      Bit width of the operation. The caller visible in this
 ///                  file asserts it is 32 or 64.
 /// \param Imm       Current value of the immediate operand.
 /// \param Demanded  Bits of the result whose value must not change.
 /// \param TLO       Supplies the DAG used to build the replacement and records
 ///                  the combine.
 /// \param NewOpc    Machine opcode used for the replacement when the new
 ///                  immediate is neither all-zeros nor all-ones.
 /// \returns false when the immediate is already all-zeros, all-ones or a valid
 ///          logical immediate, or when no suitable replacement is found;
 ///          otherwise the result of TLO.CombineTo(Op, New).
 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
                                const APInt &Demanded,
                                TargetLowering::TargetLoweringOpt &TLO,
                                unsigned NewOpc) {
   uint64_t OldImm = Imm, NewImm, Enc;
   // Mask has the low Size bits set; it is narrowed as the element size is
   // halved below, while OrigMask keeps the full-width value.
   uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
 
   // Return if the immediate is already all zeros, all ones, a bimm32 or a
   // bimm64.
   if (Imm == 0 || Imm == Mask ||
       AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
     return false;
 
   // Start the search with a single element as wide as the whole operation.
   unsigned EltSize = Size;
   uint64_t DemandedBits = Demanded.getZExtValue();
 
   // Clear bits that are not demanded.
   Imm &= DemandedBits;
 
   while (true) {
     // The goal here is to set the non-demanded bits in a way that minimizes
     // the number of switching between 0 and 1. In order to achieve this goal,
     // we set the non-demanded bits to the value of the preceding demanded bits.
     // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
     // non-demanded bit), we copy bit0 (1) to the least significant 'x',
     // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
     // The final result is 0b11000011.
     uint64_t NonDemandedBits = ~DemandedBits;
     uint64_t InvertedImm = ~Imm & DemandedBits;
     // Rotate the inverted demanded bits left by one within the element and
     // keep only the positions that are not demanded.
     uint64_t RotatedImm =
         ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
         NonDemandedBits;
     uint64_t Sum = RotatedImm + NonDemandedBits;
     // Carry records whether the addition carried out of the element's top
     // bit; it is added back in at bit 0 on the next line.
     bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
     uint64_t Ones = (Sum + Carry) & NonDemandedBits;
     NewImm = (Imm | Ones) & Mask;
 
     // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
     // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
     // we halve the element size and continue the search.
     if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
       break;
 
     // We cannot shrink the element size any further if it is 2-bits.
     if (EltSize == 2)
       return false;
 
     EltSize /= 2;
     Mask >>= EltSize;
     uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
 
     // Return if there is mismatch in any of the demanded bits of Imm and Hi.
     if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
       return false;
 
     // Merge the upper and lower halves of Imm and DemandedBits.
     Imm |= Hi;
     DemandedBits |= DemandedBitsHi;
   }
 
   // Statistic counter: one more immediate successfully optimized.
   ++NumOptimizedImms;
 
   // Replicate the element across the register width.
   while (EltSize < Size) {
     NewImm |= NewImm << EltSize;
     EltSize *= 2;
   }
 
   // OldImm is only read by the asserts below; the cast keeps it "used" when
   // asserts are compiled out.
   (void)OldImm;
   assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
          "demanded bits should never be altered");
   assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
 
   // Create the new constant immediate node.
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
   SDValue New;
 
   // If the new constant immediate is all-zeros or all-ones, let the target
   // independent DAG combine optimize this node.
   if (NewImm == 0 || NewImm == OrigMask) {
     New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
                           TLO.DAG.getConstant(NewImm, DL, VT));
   // Otherwise, create a machine node so that target independent DAG combine
   // doesn't undo this optimization.
   } else {
     Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
     SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
     New = SDValue(
         TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
   }
 
   return TLO.CombineTo(Op, New);
 }
 
 /// Target hook that tries to re-encode the constant operand of a scalar
 /// AND/OR/XOR as a logical immediate, exploiting bits outside \p Demanded.
 ///
 /// \param Op        The node to inspect; operand 1 must be a ConstantSDNode
 ///                  for anything to happen.
 /// \param Demanded  Bits of the result that are actually used.
 /// \param TLO       Lowering state; the optimization only runs once
 ///                  TLO.LegalOps is set.
 /// \returns the result of optimizeLogicalImm, or false when the optimization
 ///          is disabled, not yet applicable, or \p Op is not a candidate.
 bool AArch64TargetLowering::targetShrinkDemandedConstant(
     SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
   // Delay this optimization to as late as possible, and honour the
   // EnableOptimizeLogicalImm switch.
   if (!TLO.LegalOps || !EnableOptimizeLogicalImm)
     return false;
 
   const EVT VT = Op.getValueType();
   if (VT.isVector())
     return false;
 
   const unsigned Size = VT.getSizeInBits();
   assert((Size == 32 || Size == 64) &&
          "i32 or i64 is expected after legalization.");
 
   // Nothing can be gained when every bit is demanded.
   if (Demanded.countPopulation() == Size)
     return false;
 
   // Choose the immediate-form machine opcode matching the operation and width.
   const bool Is32Bit = Size == 32;
   unsigned NewOpc;
   switch (Op.getOpcode()) {
   case ISD::AND:
     NewOpc = Is32Bit ? AArch64::ANDWri : AArch64::ANDXri;
     break;
   case ISD::OR:
     NewOpc = Is32Bit ? AArch64::ORRWri : AArch64::ORRXri;
     break;
   case ISD::XOR:
     NewOpc = Is32Bit ? AArch64::EORWri : AArch64::EORXri;
     break;
   default:
     return false;
   }
 
   // Only a constant right-hand operand can be rewritten.
   if (ConstantSDNode *ImmNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
     return optimizeLogicalImm(Op, Size, ImmNode->getZExtValue(), Demanded, TLO,
                               NewOpc);
   return false;
 }
 
 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
 /// Mask are known to be either zero or one and return them Known.
 void AArch64TargetLowering::computeKnownBitsForTargetNode(
     const SDValue Op, KnownBits &Known,
     const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
   switch (Op.getOpcode()) {
   default:
     break;
   case AArch64ISD::CSEL: {
     KnownBits Known2;
     DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
     DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
     Known.Zero &= Known2.Zero;
     Known.One &= Known2.One;
     break;
   }
   case ISD::INTRINSIC_W_CHAIN: {
     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
     switch (IntID) {
     default: return;
     case Intrinsic::aarch64_ldaxr:
     case Intrinsic::aarch64_ldxr: {
       unsigned BitWidth = Known.getBitWidth();
       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
       unsigned MemBits = VT.getScalarSizeInBits();
       Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
       return;
     }
     }
     break;
   }
   case ISD::INTRINSIC_WO_CHAIN:
   case ISD::INTRINSIC_VOID: {
     unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     switch (IntNo) {
     default:
       break;
     case Intrinsic::aarch64_neon_umaxv:
     case Intrinsic::aarch64_neon_uminv: {
       // Figure out the datatype of the vector operand. The UMINV instruction
       // will zero extend the result, so we can mark as known zero all the
       // bits larger than the element datatype. 32-bit or larget doesn't need
       // this as those are legal types and will be handled by isel directly.
       MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
       unsigned BitWidth = Known.getBitWidth();
       if (VT == MVT::v8i8 || VT == MVT::v16i8) {
         assert(BitWidth >= 8 && "Unexpected width!");
         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
         Known.Zero |= Mask;
       } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
         assert(BitWidth >= 16 && "Unexpected width!");
         APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
         Known.Zero |= Mask;
       }
       break;
     } break;
     }
   }
   }
 }
 
 /// Return the type used for scalar shift amounts. Both parameters are
 /// ignored: the answer is always MVT::i64.
 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
                                                   EVT) const {
   const MVT ShiftAmountTy = MVT::i64;
   return ShiftAmountTy;
 }
 
 /// Report whether misaligned memory accesses of type \p VT are permitted.
 ///
 /// \param VT         Type of the access.
 /// \param AddrSpace  Not consulted by this implementation.
 /// \param Align      Alignment of the access, as specified by the caller.
 /// \param Fast       Optional out-parameter; when non-null and the access is
 ///                   allowed, receives whether it is also considered fast.
 /// \returns false when the subtarget requires strict alignment (in which case
 ///          \p Fast is left unwritten), true otherwise.
 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                            unsigned AddrSpace,
                                                            unsigned Align,
                                                            bool *Fast) const {
   if (Subtarget->requiresStrictAlign())
     return false;
 
   if (!Fast)
     return true;
 
   // Some CPUs are fine with unaligned stores except for 128-bit ones. The
   // access is treated as slow only when all of the following hold (see
   // comments in performSTORECombine() for more details):
   //  - the subtarget reports misaligned 128-bit stores as slow;
   //  - the access stores exactly 16 bytes;
   //  - the alignment is greater than 2. Code that uses clang vector
   //    extensions can mark that it wants unaligned accesses to be treated as
   //    fast by underspecifying alignment to be 1 or 2;
   //  - the type is not v2i64. Memcpy lowering produces those and splitting
   //    them regresses performance on micro-benchmarks and olden/bh.
   const bool IsSlow = Subtarget->isMisaligned128StoreSlow() &&
                       VT.getStoreSize() == 16 && Align > 2 &&
                       VT != MVT::v2i64;
   *Fast = !IsSlow;
   return true;
 }
 
 /// Create a FastISel instance by forwarding \p funcInfo and \p libInfo to
 /// AArch64::createFastISel and returning its result.
 FastISel *
 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                       const TargetLibraryInfo *libInfo) const {
   FastISel *const Selector = AArch64::createFastISel(funcInfo, libInfo);
   return Selector;
 }
 
 /// Return the printable name of the AArch64-specific DAG node \p Opcode.
 ///
 /// The name is the fully qualified enumerator spelling (for example
 /// "AArch64ISD::CALL"). nullptr is returned for AArch64ISD::FIRST_NUMBER and
 /// for any value that is not listed below.
 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
 // Expands to a case label that returns the enumerator's own spelling, so the
 // label and the returned string cannot drift apart.
 #define AARCH64_NODE_NAME_CASE(V)                                              \
   case V:                                                                      \
     return #V;
 
   switch ((AArch64ISD::NodeType)Opcode) {
   case AArch64ISD::FIRST_NUMBER:
     break;
     AARCH64_NODE_NAME_CASE(AArch64ISD::CALL)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ADRP)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ADDlow)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LOADgot)
     AARCH64_NODE_NAME_CASE(AArch64ISD::RET_FLAG)
     AARCH64_NODE_NAME_CASE(AArch64ISD::BRCOND)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CSEL)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FCSEL)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CSINV)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CSNEG)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CSINC)
     AARCH64_NODE_NAME_CASE(AArch64ISD::THREAD_POINTER)
     AARCH64_NODE_NAME_CASE(AArch64ISD::TLSDESC_CALLSEQ)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ADC)
     AARCH64_NODE_NAME_CASE(AArch64ISD::SBC)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ADDS)
     AARCH64_NODE_NAME_CASE(AArch64ISD::SUBS)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ADCS)
     AARCH64_NODE_NAME_CASE(AArch64ISD::SBCS)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ANDS)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CCMP)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CCMN)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FCCMP)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FCMP)
     AARCH64_NODE_NAME_CASE(AArch64ISD::DUP)
     AARCH64_NODE_NAME_CASE(AArch64ISD::DUPLANE8)
     AARCH64_NODE_NAME_CASE(AArch64ISD::DUPLANE16)
     AARCH64_NODE_NAME_CASE(AArch64ISD::DUPLANE32)
     AARCH64_NODE_NAME_CASE(AArch64ISD::DUPLANE64)
     AARCH64_NODE_NAME_CASE(AArch64ISD::MOVI)
     AARCH64_NODE_NAME_CASE(AArch64ISD::MOVIshift)
     AARCH64_NODE_NAME_CASE(AArch64ISD::MOVIedit)
     AARCH64_NODE_NAME_CASE(AArch64ISD::MOVImsl)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FMOV)
     AARCH64_NODE_NAME_CASE(AArch64ISD::MVNIshift)
     AARCH64_NODE_NAME_CASE(AArch64ISD::MVNImsl)
     AARCH64_NODE_NAME_CASE(AArch64ISD::BICi)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ORRi)
     AARCH64_NODE_NAME_CASE(AArch64ISD::BSL)
     AARCH64_NODE_NAME_CASE(AArch64ISD::NEG)
     AARCH64_NODE_NAME_CASE(AArch64ISD::EXTR)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ZIP1)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ZIP2)
     AARCH64_NODE_NAME_CASE(AArch64ISD::UZP1)
     AARCH64_NODE_NAME_CASE(AArch64ISD::UZP2)
     AARCH64_NODE_NAME_CASE(AArch64ISD::TRN1)
     AARCH64_NODE_NAME_CASE(AArch64ISD::TRN2)
     AARCH64_NODE_NAME_CASE(AArch64ISD::REV16)
     AARCH64_NODE_NAME_CASE(AArch64ISD::REV32)
     AARCH64_NODE_NAME_CASE(AArch64ISD::REV64)
     AARCH64_NODE_NAME_CASE(AArch64ISD::EXT)
     AARCH64_NODE_NAME_CASE(AArch64ISD::VSHL)
     AARCH64_NODE_NAME_CASE(AArch64ISD::VLSHR)
     AARCH64_NODE_NAME_CASE(AArch64ISD::VASHR)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CMEQ)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CMGE)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CMGT)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CMHI)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CMHS)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FCMEQ)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FCMGE)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FCMGT)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CMEQz)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CMGEz)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CMGTz)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CMLEz)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CMLTz)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FCMEQz)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FCMGEz)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FCMGTz)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FCMLEz)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FCMLTz)
     AARCH64_NODE_NAME_CASE(AArch64ISD::SADDV)
     AARCH64_NODE_NAME_CASE(AArch64ISD::UADDV)
     AARCH64_NODE_NAME_CASE(AArch64ISD::SMINV)
     AARCH64_NODE_NAME_CASE(AArch64ISD::UMINV)
     AARCH64_NODE_NAME_CASE(AArch64ISD::SMAXV)
     AARCH64_NODE_NAME_CASE(AArch64ISD::UMAXV)
     AARCH64_NODE_NAME_CASE(AArch64ISD::NOT)
     AARCH64_NODE_NAME_CASE(AArch64ISD::BIT)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CBZ)
     AARCH64_NODE_NAME_CASE(AArch64ISD::CBNZ)
     AARCH64_NODE_NAME_CASE(AArch64ISD::TBZ)
     AARCH64_NODE_NAME_CASE(AArch64ISD::TBNZ)
     AARCH64_NODE_NAME_CASE(AArch64ISD::TC_RETURN)
     AARCH64_NODE_NAME_CASE(AArch64ISD::PREFETCH)
     AARCH64_NODE_NAME_CASE(AArch64ISD::SITOF)
     AARCH64_NODE_NAME_CASE(AArch64ISD::UITOF)
     AARCH64_NODE_NAME_CASE(AArch64ISD::NVCAST)
     AARCH64_NODE_NAME_CASE(AArch64ISD::SQSHL_I)
     AARCH64_NODE_NAME_CASE(AArch64ISD::UQSHL_I)
     AARCH64_NODE_NAME_CASE(AArch64ISD::SRSHR_I)
     AARCH64_NODE_NAME_CASE(AArch64ISD::URSHR_I)
     AARCH64_NODE_NAME_CASE(AArch64ISD::SQSHLU_I)
     AARCH64_NODE_NAME_CASE(AArch64ISD::WrapperLarge)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD2post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD3post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD4post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ST2post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ST3post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ST4post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD1x2post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD1x3post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD1x4post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ST1x2post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ST1x3post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ST1x4post)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD1DUPpost)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD2DUPpost)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD3DUPpost)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD4DUPpost)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD1LANEpost)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD2LANEpost)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD3LANEpost)
     AARCH64_NODE_NAME_CASE(AArch64ISD::LD4LANEpost)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ST2LANEpost)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ST3LANEpost)
     AARCH64_NODE_NAME_CASE(AArch64ISD::ST4LANEpost)
     AARCH64_NODE_NAME_CASE(AArch64ISD::SMULL)
     AARCH64_NODE_NAME_CASE(AArch64ISD::UMULL)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FRECPE)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FRECPS)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FRSQRTE)
     AARCH64_NODE_NAME_CASE(AArch64ISD::FRSQRTS)
   }
 #undef AARCH64_NODE_NAME_CASE
 
   return nullptr;
 }
 
 /// Expand the F128CSEL pseudo-instruction \p MI, located in \p MBB, into
 /// explicit control flow joined by a PHI node:
 ///
 ///   OrigBB:
 ///       [... previous instrs leading to comparison ...]
 ///       b.<cond> TrueBB
 ///       b EndBB
 ///   TrueBB:
 ///       ; Fallthrough
 ///   EndBB:
 ///       Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
 ///
 /// \returns the newly created EndBB, which receives every instruction that
 /// followed \p MI in the original block. \p MI itself is erased.
 MachineBasicBlock *
 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
                                     MachineBasicBlock *MBB) const {
   // Read everything we need from the pseudo before it is erased.
   const unsigned ResultReg = MI.getOperand(0).getReg();
   const unsigned TrueValReg = MI.getOperand(1).getReg();
   const unsigned FalseValReg = MI.getOperand(2).getReg();
   const unsigned CondImm = MI.getOperand(3).getImm();
   const bool FlagsKilled = MI.getOperand(4).isKill();
   DebugLoc Loc = MI.getDebugLoc();
 
   MachineFunction *Fn = MBB->getParent();
   const TargetInstrInfo *InstrInfo = Subtarget->getInstrInfo();
   const BasicBlock *IRBlock = MBB->getBasicBlock();
 
   // Both new blocks are placed directly after the original block, TrueBB
   // first so that it can fall through into EndBB.
   MachineFunction::iterator InsertPt = std::next(MBB->getIterator());
   MachineBasicBlock *TrueBB = Fn->CreateMachineBasicBlock(IRBlock);
   MachineBasicBlock *EndBB = Fn->CreateMachineBasicBlock(IRBlock);
   Fn->insert(InsertPt, TrueBB);
   Fn->insert(InsertPt, EndBB);
 
   // Everything after the pseudo moves into EndBB, which also inherits the
   // original block's successors.
   MachineBasicBlock::iterator AfterPseudo =
       std::next(MachineBasicBlock::iterator(MI));
   EndBB->splice(EndBB->begin(), MBB, AfterPseudo, MBB->end());
   EndBB->transferSuccessorsAndUpdatePHIs(MBB);
 
   // Terminate the original block with the conditional/unconditional branch
   // pair and record the new CFG edges.
   BuildMI(MBB, Loc, InstrInfo->get(AArch64::Bcc))
       .addImm(CondImm)
       .addMBB(TrueBB);
   BuildMI(MBB, Loc, InstrInfo->get(AArch64::B)).addMBB(EndBB);
   MBB->addSuccessor(TrueBB);
   MBB->addSuccessor(EndBB);
   TrueBB->addSuccessor(EndBB);
 
   // If the pseudo did not kill NZCV, the flags are live into both new blocks.
   if (!FlagsKilled) {
     TrueBB->addLiveIn(AArch64::NZCV);
     EndBB->addLiveIn(AArch64::NZCV);
   }
 
   // Select the result depending on which predecessor we arrived from.
   BuildMI(*EndBB, EndBB->begin(), Loc, InstrInfo->get(AArch64::PHI), ResultReg)
       .addReg(TrueValReg)
       .addMBB(TrueBB)
       .addReg(FalseValReg)
       .addMBB(MBB);
 
   MI.eraseFromParent();
   return EndBB;
 }
 
 /// Dispatch an instruction that requested custom insertion to its expansion
 /// routine. F128CSEL, STACKMAP and PATCHPOINT are handled; any other opcode
 /// is dumped (in builds without NDEBUG) and treated as unreachable.
 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *BB) const {
   const unsigned Opc = MI.getOpcode();
 
   if (Opc == AArch64::F128CSEL)
     return EmitF128CSEL(MI, BB);
 
   if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT)
     return emitPatchPoint(MI, BB);
 
 #ifndef NDEBUG
   MI.dump();
 #endif
   llvm_unreachable("Unexpected instruction for custom inserter!");
 }
 
 //===----------------------------------------------------------------------===//
 // AArch64 Lowering private implementation.
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
 // Lowering Code
 //===----------------------------------------------------------------------===//
 
 /// changeIntCCToAArch64CC - Map an integer DAG condition code onto the
 /// AArch64 condition code testing the same relation. Condition codes without
 /// an entry below are treated as unreachable.
 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
   switch (CC) {
   // Equality.
   case ISD::SETEQ:
     return AArch64CC::EQ;
   case ISD::SETNE:
     return AArch64CC::NE;
 
   // Signed orderings.
   case ISD::SETLT:
     return AArch64CC::LT;
   case ISD::SETLE:
     return AArch64CC::LE;
   case ISD::SETGT:
     return AArch64CC::GT;
   case ISD::SETGE:
     return AArch64CC::GE;
 
   // Unsigned orderings.
   case ISD::SETULT:
     return AArch64CC::LO;
   case ISD::SETULE:
     return AArch64CC::LS;
   case ISD::SETUGT:
     return AArch64CC::HI;
   case ISD::SETUGE:
     return AArch64CC::HS;
 
   default:
     break;
   }
   llvm_unreachable("Unknown condition code!");
 }
 
 /// changeFPCCToAArch64CC - Translate a DAG floating-point condition code into
 /// AArch64 condition codes.
 ///
 /// \p CondCode always receives a condition. \p CondCode2 is AArch64CC::AL
 /// unless the comparison needs a second condition (SETONE and SETUEQ), in
 /// which case the two conditions are meant to be OR'ed together. Condition
 /// codes without an entry below are treated as unreachable.
 static void changeFPCCToAArch64CC(ISD::CondCode CC,
                                   AArch64CC::CondCode &CondCode,
                                   AArch64CC::CondCode &CondCode2) {
   // Most comparisons need a single condition only.
   CondCode2 = AArch64CC::AL;
 
   switch (CC) {
   // Comparisons that need two conditions.
   case ISD::SETONE:
     CondCode = AArch64CC::MI;
     CondCode2 = AArch64CC::GT;
     break;
   case ISD::SETUEQ:
     CondCode = AArch64CC::EQ;
     CondCode2 = AArch64CC::VS;
     break;
 
   // Ordered / unordered tests.
   case ISD::SETO:
     CondCode = AArch64CC::VC;
     break;
   case ISD::SETUO:
     CondCode = AArch64CC::VS;
     break;
 
   // Equality.
   case ISD::SETEQ:
   case ISD::SETOEQ:
     CondCode = AArch64CC::EQ;
     break;
   case ISD::SETNE:
   case ISD::SETUNE:
     CondCode = AArch64CC::NE;
     break;
 
   // Greater-than family.
   case ISD::SETGT:
   case ISD::SETOGT:
     CondCode = AArch64CC::GT;
     break;
   case ISD::SETGE:
   case ISD::SETOGE:
     CondCode = AArch64CC::GE;
     break;
   case ISD::SETUGT:
     CondCode = AArch64CC::HI;
     break;
   case ISD::SETUGE:
     CondCode = AArch64CC::PL;
     break;
 
   // Less-than family.
   case ISD::SETOLT:
     CondCode = AArch64CC::MI;
     break;
   case ISD::SETOLE:
     CondCode = AArch64CC::LS;
     break;
   case ISD::SETLT:
   case ISD::SETULT:
     CondCode = AArch64CC::LT;
     break;
   case ISD::SETLE:
   case ISD::SETULE:
     CondCode = AArch64CC::LE;
     break;
 
   default:
     llvm_unreachable("Unknown FP condition!");
   }
 }
 
 /// Translate a DAG floating-point condition code into AArch64 condition codes
 /// that are meant to be AND'ed together (changeFPCCToAArch64CC produces codes
 /// that are meant to be OR'ed instead).
 ///
 /// Only SETONE and SETUEQ yield a second condition in \p CondCode2; every
 /// other condition code is forwarded to changeFPCCToAArch64CC and leaves
 /// \p CondCode2 as AArch64CC::AL.
 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
                                      AArch64CC::CondCode &CondCode,
                                      AArch64CC::CondCode &CondCode2) {
   CondCode2 = AArch64CC::AL;
 
   if (CC == ISD::SETONE) {
     // (a one b)
     // == ((a olt b) || (a ogt b))
     // == ((a ord b) && (a une b))
     CondCode = AArch64CC::VC;
     CondCode2 = AArch64CC::NE;
     return;
   }
 
   if (CC == ISD::SETUEQ) {
     // (a ueq b)
     // == ((a uno b) || (a oeq b))
     // == ((a ule b) && (a uge b))
     CondCode = AArch64CC::PL;
     CondCode2 = AArch64CC::LE;
     return;
   }
 
   // All remaining conditions are expected to need a single code only.
   changeFPCCToAArch64CC(CC, CondCode, CondCode2);
   assert(CondCode2 == AArch64CC::AL);
 }
 
 /// changeVectorFPCCToAArch64CC - Translate a DAG floating-point condition
 /// code into AArch64 condition codes usable with the vector compare
 /// instructions. Fewer operations are available without a real NZCV register,
 /// so some conditions are expressed as a combination of two codes and/or as
 /// the inverse of another comparison.
 ///
 /// \param Invert  Set to true when the produced comparison result has to be
 ///                inverted to obtain \p CC, false otherwise.
 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
                                         AArch64CC::CondCode &CondCode,
                                         AArch64CC::CondCode &CondCode2,
                                         bool &Invert) {
   Invert = false;
 
   switch (CC) {
   case ISD::SETO:
   case ISD::SETUO:
     // Both share the same pair of conditions; the unordered test is simply
     // the inverted ordered test.
     Invert = (CC == ISD::SETUO);
     CondCode = AArch64CC::MI;
     CondCode2 = AArch64CC::GE;
     break;
 
   case ISD::SETUEQ:
   case ISD::SETULT:
   case ISD::SETULE:
   case ISD::SETUGT:
   case ISD::SETUGE:
     // All of the compare-mask comparisons are ordered, but we can switch
     // between the two by a double inversion. E.g. ULE == !OGT.
     Invert = true;
     changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
     break;
 
   default:
     // Mostly the scalar mappings work fine.
     changeFPCCToAArch64CC(CC, CondCode, CondCode2);
     break;
   }
 }
 
 /// Return true if \p C is a 12-bit value, or a 12-bit value shifted left by
 /// 12 bits (low 12 bits clear and nothing set at bit 24 or above).
 /// Matches AArch64DAGToDAGISel::SelectArithImmed().
 static bool isLegalArithImmed(uint64_t C) {
   const bool FitsUnshifted = (C >> 12) == 0;
   const bool FitsShiftedBy12 = (C & 0xFFFULL) == 0 && (C >> 24) == 0;
   const bool IsLegal = FitsUnshifted || FitsShiftedBy12;
   LLVM_DEBUG(dbgs() << "Is imm " << C
                     << " legal: " << (IsLegal ? "yes\n" : "no\n"));
   return IsLegal;
 }
 
 /// Emit the node that compares \p LHS with \p RHS for condition \p CC and
 /// return the value carrying the comparison result.
 ///
 /// Floating-point operands produce an AArch64ISD::FCMP; f16 operands are
 /// first extended to f32 when the subtarget lacks full FP16 support. Integer
 /// operands produce SUBS, ADDS or ANDS, and result number 1 of that node is
 /// returned.
 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                               const SDLoc &dl, SelectionDAG &DAG) {
   const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
   const bool HasFullFP16 = ST.hasFullFP16();
   EVT VT = LHS.getValueType();
 
   if (VT.isFloatingPoint()) {
     assert(VT != MVT::f128);
     const bool MustExtendHalf = VT == MVT::f16 && !HasFullFP16;
     if (MustExtendHalf) {
       VT = MVT::f32;
       LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
       RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
     }
     return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
   }
 
   // The CMP instruction is just an alias for SUBS, and representing it as
   // SUBS means that it's possible to get CSE with subtract operations.
   // A later phase can perform the optimization of setting the destination
   // register to WZR/XZR if it ends up being unused.
   unsigned Opcode = AArch64ISD::SUBS;
 
   const bool IsEqualityCC = CC == ISD::SETEQ || CC == ISD::SETNE;
   const bool RHSIsNegation =
       RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0));
 
   if (RHSIsNegation && IsEqualityCC) {
     // We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction
     // on the grounds that "op1 - (-op2) == op1 + op2". However, the C and V
     // flags can be set differently by this operation. It comes down to
     // whether "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they
     // are then everything is fine. If not then the optimization is wrong.
     // Thus general comparisons are only valid if op2 != 0.
     //
     // The only LLVM-native comparisons that don't mention C and V are SETEQ
     // and SETNE, so they're the only ones we can safely use CMN for in the
     // absence of information about op2.
     Opcode = AArch64ISD::ADDS;
     RHS = RHS.getOperand(1);
   } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
              !isUnsignedIntSetCC(CC)) {
     // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
     // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
     // of the signed comparisons.
     Opcode = AArch64ISD::ANDS;
     const SDValue AndNode = LHS;
     LHS = AndNode.getOperand(0);
     RHS = AndNode.getOperand(1);
   }
 
   SDValue FlagSetter =
       DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS);
   return FlagSetter.getValue(1);
 }
 
 /// \defgroup AArch64CCMP CMP;CCMP matching
 ///
 /// These functions deal with the formation of CMP;CCMP;... sequences.
 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
 /// a comparison. They set the NZCV flags to a predefined value if their
 /// predicate is false. This allows to express arbitrary conjunctions, for
 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
 /// expressed as:
 ///   cmp A
 ///   ccmp B, inv(CB), CA
 ///   check for CB flags
 ///
 /// This naturally lets us implement chains of AND operations with SETCC
 /// operands. And we can even implement some other situations by transforming
 /// them:
 ///   - We can implement (NEG SETCC) i.e. negating a single comparison by
 ///     negating the flags used in a CCMP/FCCMP operations.
 ///   - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
 ///     by negating the flags we test for afterwards. i.e.
 ///     NEG (CMP CCMP CCMP ...) can be implemented.
 ///   - Note that we can only ever negate all previously processed results.
 ///     What we can not implement by flipping the flags to test is a negation
 ///     of two sub-trees (because the negation affects all sub-trees emitted so
 ///     far, so the 2nd sub-tree we emit would also affect the first).
 /// With those tools we can implement some OR operations:
 ///   - (OR (SETCC A) (SETCC B)) can be implemented via:
 ///     NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
 ///   - After transforming OR to NEG/AND combinations we may be able to use NEG
 ///     elimination rules from earlier to implement the whole thing as a
 ///     CCMP/FCCMP chain.
 ///
 /// As complete example:
 ///     or (or (setCA (cmp A)) (setCB (cmp B)))
 ///        (and (setCC (cmp C)) (setCD (cmp D)))"
 /// can be reassociated to:
 ///     or (and (setCC (cmp C)) (setCD (cmp D)))
 ///        (or (setCA (cmp A)) (setCB (cmp B)))
 /// can be transformed to:
 ///     not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
 ///              (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
 /// which can be implemented as:
 ///   cmp C
 ///   ccmp D, inv(CD), CC
 ///   ccmp A, CA, inv(CD)
 ///   ccmp B, CB, inv(CA)
 ///   check for CB flags
 ///
 /// A counterexample is "or (and A B) (and C D)" which translates to
 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
 /// can only implement 1 of the inner (not) operations, but not both!
 /// @{
 
 /// Create a conditional comparison node; uses CCMP, CCMN or FCCMP as
 /// appropriate.
 ///
 /// \param LHS,RHS    Operands to compare. f16 operands are extended to f32
 ///                   when the subtarget lacks full FP16 support.
 /// \param CC         DAG condition code; only consulted to decide whether a
 ///                   comparison against (sub 0, x) may become a CCMN.
 /// \param CCOp       Flags value produced by the previous comparison.
 /// \param Predicate  Condition passed to the node as its predicate operand.
 /// \param OutCC      Condition that will be tested on the result; the NZCV
 ///                   immediate operand is chosen to satisfy its inverse.
 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
                                          ISD::CondCode CC, SDValue CCOp,
                                          AArch64CC::CondCode Predicate,
                                          AArch64CC::CondCode OutCC,
                                          const SDLoc &DL, SelectionDAG &DAG) {
   const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
   const bool HasFullFP16 = ST.hasFullFP16();
 
   // A plain integer conditional compare unless one of the cases below applies.
   unsigned Opcode = AArch64ISD::CCMP;
 
   if (LHS.getValueType().isFloatingPoint()) {
     assert(LHS.getValueType() != MVT::f128);
     if (LHS.getValueType() == MVT::f16 && !HasFullFP16) {
       LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
       RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
     }
     Opcode = AArch64ISD::FCCMP;
   } else if (RHS.getOpcode() == ISD::SUB &&
              isNullConstant(RHS.getOperand(0)) &&
              (CC == ISD::SETEQ || CC == ISD::SETNE)) {
     // See emitComparison() on why we can only do this for SETEQ and SETNE.
     Opcode = AArch64ISD::CCMN;
     RHS = RHS.getOperand(1);
   }
 
   // Flags immediate: a value that satisfies the inverse of OutCC.
   const AArch64CC::CondCode InvertedOutCC =
       AArch64CC::getInvertedCondCode(OutCC);
   const unsigned FlagsImm = AArch64CC::getNZCVToSatisfyCondCode(InvertedOutCC);
 
   SDValue PredicateOp = DAG.getConstant(Predicate, DL, MVT_CC);
   SDValue FlagsOp = DAG.getConstant(FlagsImm, DL, MVT::i32);
   return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, FlagsOp, PredicateOp, CCOp);
 }
 
 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
 /// expressed as a conjunction. See \ref AArch64CCMP.
 /// Every node of the tree must have exactly one use, SETCC leaves must not
 /// compare f128 values, and AND/OR nodes nested deeper than the recursion
 /// limit are rejected.
 /// \param CanNegate    Set to true if we can negate the whole sub-tree just by
 ///                     changing the conditions on the SETCC tests.
 ///                     (this means we can call emitConjunctionRec() with
 ///                      Negate==true on this sub-tree)
 /// \param MustBeFirst  Set to true if this subtree needs to be negated and we
 ///                     cannot do the negation naturally. We are required to
 ///                     emit the subtree first in this case.
 /// \param WillNegate   Is true if we are called when the result of this
 ///                     subexpression must be negated. This happens when the
 ///                     outer expression is an OR. We can use this fact to know
 ///                     that we have a double negation (or (or ...) ...) that
 ///                     can be implemented for free.
 /// \param Depth        Current recursion depth; callers outside this function
 ///                     rely on the default of 0.
 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
                                bool &MustBeFirst, bool WillNegate,
                                unsigned Depth = 0) {
   if (!Val.hasOneUse())
     return false;
 
   const unsigned Opcode = Val->getOpcode();
 
   // Leaf: a single comparison can always be negated by flipping its condition.
   if (Opcode == ISD::SETCC) {
     if (Val->getOperand(0).getValueType() == MVT::f128)
       return false;
     CanNegate = true;
     MustBeFirst = false;
     return true;
   }
 
   // Protect against exponential runtime and stack overflow.
   if (Depth > 6)
     return false;
 
   // Only AND and OR may appear as inner nodes.
   if (Opcode != ISD::AND && Opcode != ISD::OR)
     return false;
 
   const bool IsOR = Opcode == ISD::OR;
 
   bool CanNegateL;
   bool MustBeFirstL;
   if (!canEmitConjunction(Val->getOperand(0), CanNegateL, MustBeFirstL, IsOR,
                           Depth + 1))
     return false;
 
   bool CanNegateR;
   bool MustBeFirstR;
   if (!canEmitConjunction(Val->getOperand(1), CanNegateR, MustBeFirstR, IsOR,
                           Depth + 1))
     return false;
 
   // Only one of the two sub-trees can be emitted first.
   if (MustBeFirstL && MustBeFirstR)
     return false;
 
   if (!IsOR) {
     assert(Opcode == ISD::AND && "Must be OR or AND");
     // We cannot naturally negate an AND operation.
     CanNegate = false;
     MustBeFirst = MustBeFirstL || MustBeFirstR;
     return true;
   }
 
   // For an OR expression we need to be able to naturally negate at least
   // one side or we cannot do the transformation at all.
   if (!CanNegateL && !CanNegateR)
     return false;
 
   // If the result of the OR will be negated and we can naturally negate the
   // leaves, then this sub-tree as a whole negates naturally.
   CanNegate = WillNegate && CanNegateL && CanNegateR;
   // If we cannot naturally negate the whole sub-tree, then this must be
   // emitted first.
   MustBeFirst = !CanNegate;
   return true;
 }
 
 /// Emit a conjunction or disjunction tree as a CMP/FCMP followed by a chain
 /// of conditional compare (CCMP/CCMN/FCCMP) operations. See @ref AArch64CCMP.
 /// Transforms the given i1 producing node @p Val into a series of compare
 /// and conditional compare operations. @returns an NZCV flags producing node
 /// and sets @p OutCC to the flags that should be tested.
 /// The tree is expected to have been accepted by canEmitConjunction(); this
 /// is only checked through assertions.
 /// \p Negate is true if we want this sub-tree being negated just by changing
 /// SETCC conditions.
 /// \p CCOp is the flags value of the comparisons emitted so far; an empty
 /// SDValue means this sub-tree starts the chain.
 /// \p Predicate is the condition on \p CCOp under which the next comparison
 /// is performed; it is only used when \p CCOp is set.
 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
     AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
     AArch64CC::CondCode Predicate) {
   // We're at a tree leaf, produce a conditional comparison operation.
   unsigned Opcode = Val->getOpcode();
   if (Opcode == ISD::SETCC) {
     SDValue LHS = Val->getOperand(0);
     SDValue RHS = Val->getOperand(1);
     ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
     bool isInteger = LHS.getValueType().isInteger();
     // Negating a leaf only requires inverting its condition code.
     if (Negate)
       CC = getSetCCInverse(CC, isInteger);
     SDLoc DL(Val);
     // Determine OutCC and handle FP special case.
     if (isInteger) {
       OutCC = changeIntCCToAArch64CC(CC);
     } else {
       assert(LHS.getValueType().isFloatingPoint());
       AArch64CC::CondCode ExtraCC;
       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
       // Some floating point conditions can't be tested with a single condition
       // code. Construct an additional comparison in this case.
       if (ExtraCC != AArch64CC::AL) {
         SDValue ExtraCmp;
         if (!CCOp.getNode())
           ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
         else
           ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
                                                ExtraCC, DL, DAG);
         // The extra comparison becomes the head of the chain; the comparison
         // emitted below is predicated on ExtraCC.
         CCOp = ExtraCmp;
         Predicate = ExtraCC;
       }
     }
 
     // Produce a normal comparison if we are first in the chain
     if (!CCOp)
       return emitComparison(LHS, RHS, CC, DL, DAG);
     // Otherwise produce a ccmp.
     return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
                                      DAG);
   }
   assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
 
   bool IsOR = Opcode == ISD::OR;
 
   // Re-query the properties of both operands; the tree is expected to be
   // valid, so failures are only checked with assertions.
   SDValue LHS = Val->getOperand(0);
   bool CanNegateL;
   bool MustBeFirstL;
   bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
   assert(ValidL && "Valid conjunction/disjunction tree");
   (void)ValidL;
 
   SDValue RHS = Val->getOperand(1);
   bool CanNegateR;
   bool MustBeFirstR;
   bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
   assert(ValidR && "Valid conjunction/disjunction tree");
   (void)ValidR;
 
   // Swap sub-tree that must come first to the right side.
   // (The right side is the one emitted first below.)
   if (MustBeFirstL) {
     assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
     std::swap(LHS, RHS);
     std::swap(CanNegateL, CanNegateR);
     std::swap(MustBeFirstL, MustBeFirstR);
   }
 
   // NegateR / NegateL:  negate the respective sub-tree by changing its SETCC
   //                     conditions.
   // NegateAfterR:       invert the condition code obtained from the right
   //                     sub-tree after it has been emitted.
   // NegateAfterAll:     invert the final OutCC after both sub-trees have been
   //                     emitted.
   bool NegateR;
   bool NegateAfterR;
   bool NegateL;
   bool NegateAfterAll;
   if (Opcode == ISD::OR) {
     // Swap the sub-tree that we can negate naturally to the left.
     if (!CanNegateL) {
       assert(CanNegateR && "at least one side must be negatable");
       assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
       assert(!Negate);
       std::swap(LHS, RHS);
       NegateR = false;
       NegateAfterR = true;
     } else {
       // Negate the right sub-tree naturally if possible, otherwise invert its
       // resulting condition code afterwards.
       NegateR = CanNegateR;
       NegateAfterR = !CanNegateR;
     }
     NegateL = true;
     NegateAfterAll = !Negate;
   } else {
     assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
     assert(!Negate && "Valid conjunction/disjunction tree");
 
     NegateL = false;
     NegateR = false;
     NegateAfterR = false;
     NegateAfterAll = false;
   }
 
   // Emit sub-trees: the right one first, then the left one predicated on the
   // right one's (possibly inverted) condition.
   AArch64CC::CondCode RHSCC;
   SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
   if (NegateAfterR)
     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
   SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
   if (NegateAfterAll)
     OutCC = AArch64CC::getInvertedCondCode(OutCC);
   return CmpL;
 }
 
 /// Emit expression @p Val as a conjunction (a series of conditional compare
 /// operations). In some cases this is even possible with OR operations in the
 /// expression. See \ref AArch64CCMP.
 /// @returns the flags producing node and sets @p OutCC to the condition to
 /// test, or an empty SDValue if canEmitConjunction() rejects the expression.
 /// \see emitConjunctionRec().
 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
                                AArch64CC::CondCode &OutCC) {
   // The negation properties of the root are not needed here.
   bool UnusedCanNegate;
   bool UnusedMustBeFirst;
   const bool Feasible = canEmitConjunction(Val, UnusedCanNegate,
                                            UnusedMustBeFirst,
                                            /*WillNegate=*/false);
   if (Feasible)
     return emitConjunctionRec(DAG, Val, OutCC, /*Negate=*/false,
                               /*CCOp=*/SDValue(), AArch64CC::AL);
   return SDValue();
 }
 
 /// @}
 
 /// Emit the AArch64 comparison implementing "LHS CC RHS".
 ///
 /// If RHS is a constant that isLegalArithImmed() rejects, the constant is
 /// moved by one and CC is rewritten to the equivalent neighbouring predicate
 /// (e.g. "x < C" becomes "x <= C - 1") whenever the adjusted constant is
 /// accepted and the adjustment cannot wrap around in the operand type.
 ///
 /// \param LHS        left-hand operand of the comparison.
 /// \param RHS        right-hand operand of the comparison.
 /// \param CC         the ISD condition code to test.
 /// \param AArch64cc  [out] the AArch64 condition code to test on the returned
 ///                   node, materialised as an MVT_CC constant.
 /// \param DAG        the DAG in which new nodes are created.
 /// \param dl         debug location attached to the new nodes.
 /// \returns the node built by emitComparison() or emitConjunction().
 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              SDValue &AArch64cc, SelectionDAG &DAG,
                              const SDLoc &dl) {
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
     EVT VT = RHS.getValueType();
     uint64_t C = RHSC->getZExtValue();
     if (!isLegalArithImmed(C)) {
       // Constant does not fit, try adjusting it by one?
       switch (CC) {
       default:
         break;
       case ISD::SETLT:
       case ISD::SETGE:
         // The signed minimum has no predecessor, so it must not be
         // decremented. For i64 that value is 0x8000000000000000, not the
         // 32-bit pattern 0x80000000.
         if ((VT == MVT::i32 && C != 0x80000000 &&
              isLegalArithImmed((uint32_t)(C - 1))) ||
             (VT == MVT::i64 && C != 0x8000000000000000ULL &&
              isLegalArithImmed(C - 1ULL))) {
           CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
           RHS = DAG.getConstant(C, dl, VT);
         }
         break;
       case ISD::SETULT:
       case ISD::SETUGE:
         // Zero is the unsigned minimum; decrementing it would wrap.
         if ((VT == MVT::i32 && C != 0 &&
              isLegalArithImmed((uint32_t)(C - 1))) ||
             (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
           CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
           C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
           RHS = DAG.getConstant(C, dl, VT);
         }
         break;
       case ISD::SETLE:
       case ISD::SETGT:
         // The signed maximum has no successor.
         if ((VT == MVT::i32 && C != INT32_MAX &&
              isLegalArithImmed((uint32_t)(C + 1))) ||
             (VT == MVT::i64 && C != INT64_MAX &&
              isLegalArithImmed(C + 1ULL))) {
           CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
           RHS = DAG.getConstant(C, dl, VT);
         }
         break;
       case ISD::SETULE:
       case ISD::SETUGT:
         // The unsigned maximum has no successor.
         if ((VT == MVT::i32 && C != UINT32_MAX &&
              isLegalArithImmed((uint32_t)(C + 1))) ||
             (VT == MVT::i64 && C != UINT64_MAX &&
              isLegalArithImmed(C + 1ULL))) {
           CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
           C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
           RHS = DAG.getConstant(C, dl, VT);
         }
         break;
       }
     }
   }
   SDValue Cmp;
   AArch64CC::CondCode AArch64CC;
   if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
     const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
 
     // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
     // For the i8 operand, the largest immediate is 255, so this can be easily
     // encoded in the compare instruction. For the i16 operand, however, the
     // largest immediate cannot be encoded in the compare.
     // Therefore, use a sign extending load and cmn to avoid materializing the
     // -1 constant. For example,
     // movz w1, #65535
     // ldrh w0, [x0, #0]
     // cmp w0, w1
     // >
     // ldrsh w0, [x0, #0]
     // cmn w0, #1
     // Fundamentally, we're relying on the property that (zext LHS) == (zext
     // RHS) if and only if (sext LHS) == (sext RHS). The checks are in place to
     // ensure both the LHS and RHS are truly zero extended and to make sure the
     // transformation is profitable.
     if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
         cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
         cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
         LHS.getNode()->hasNUsesOfValue(1, 0)) {
       // Reinterpret the low 16 bits of the constant as a signed value.
       int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
       if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
         SDValue SExt =
             DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
                         DAG.getValueType(MVT::i16));
         Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
                                                    RHS.getValueType()),
                              CC, dl, DAG);
         AArch64CC = changeIntCCToAArch64CC(CC);
       }
     }
 
     // Comparing against 0 or 1: try to express LHS as a conjunction. The
     // resulting condition is inverted when testing "== 0" or "!= 1".
     if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
       if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
         if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
           AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
       }
     }
   }
 
   // None of the special cases applied: emit a plain comparison.
   if (!Cmp) {
     Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
     AArch64CC = changeIntCCToAArch64CC(CC);
   }
   AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
   return Cmp;
 }
 
 /// Build the AArch64 nodes for an overflow-checking arithmetic node
 /// (SADDO/UADDO/SSUBO/USUBO/SMULO/UMULO) whose value type is i32 or i64.
 ///
 /// \param CC   [out] the AArch64 condition code chosen for this opcode; it is
 ///             the condition to test on the second element of the result.
 /// \param Op   the overflow-checking node; operands 0 and 1 are its inputs.
 /// \param DAG  the DAG in which new nodes are created.
 /// \returns a pair (Value, Overflow): the arithmetic result, and result #1 of
 ///          the ADDS/SUBS node that CC is meant to be evaluated against.
 static std::pair<SDValue, SDValue>
 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
   assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
          "Unsupported value type");
   SDValue Value, Overflow;
   SDLoc DL(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   // Opc stays 0 for the multiply cases, which build Value/Overflow themselves;
   // the add/sub cases only pick an opcode and are emitted after the switch.
   unsigned Opc = 0;
   switch (Op.getOpcode()) {
   default:
     llvm_unreachable("Unknown overflow instruction!");
   case ISD::SADDO:
     Opc = AArch64ISD::ADDS;
     CC = AArch64CC::VS;
     break;
   case ISD::UADDO:
     Opc = AArch64ISD::ADDS;
     CC = AArch64CC::HS;
     break;
   case ISD::SSUBO:
     Opc = AArch64ISD::SUBS;
     CC = AArch64CC::VS;
     break;
   case ISD::USUBO:
     Opc = AArch64ISD::SUBS;
     CC = AArch64CC::LO;
     break;
   // Multiply needs a little bit extra work.
   case ISD::SMULO:
   case ISD::UMULO: {
     // Both multiply flavours end in a SUBS whose operands differ exactly when
     // the multiplication overflowed, hence NE.
     CC = AArch64CC::NE;
     bool IsSigned = Op.getOpcode() == ISD::SMULO;
     if (Op.getValueType() == MVT::i32) {
       unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
       // For a 32 bit multiply with overflow check we want the instruction
       // selector to generate a widening multiply (SMADDL/UMADDL). For that we
       // need to generate the following pattern:
       // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
       LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
       RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
       SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
       SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
                                 DAG.getConstant(0, DL, MVT::i64));
       // On AArch64 the upper 32 bits are always zero extended for a 32 bit
       // operation. We need to clear out the upper 32 bits, because we used a
       // widening multiply that wrote all 64 bits. In the end this should be a
       // noop.
       Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
       if (IsSigned) {
         // The signed overflow check requires more than just a simple check for
         // any bit set in the upper 32 bits of the result. These bits could be
         // just the sign bits of a negative number. To perform the overflow
         // check we have to arithmetic shift right the 32nd bit of the result by
         // 31 bits. Then we compare the result to the upper 32 bits.
         SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
                                         DAG.getConstant(32, DL, MVT::i64));
         UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
         SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
                                         DAG.getConstant(31, DL, MVT::i64));
         // It is important that LowerBits is last, otherwise the arithmetic
         // shift will not be folded into the compare (SUBS).
         SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
         Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
                        .getValue(1);
       } else {
         // The overflow check for unsigned multiply is easy. We only need to
         // check if any of the upper 32 bits are set. This can be done with a
         // CMP (shifted register). For that we need to generate the following
         // pattern:
         // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
         SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
                                         DAG.getConstant(32, DL, MVT::i64));
         SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
         Overflow =
             DAG.getNode(AArch64ISD::SUBS, DL, VTs,
                         DAG.getConstant(0, DL, MVT::i64),
                         UpperBits).getValue(1);
       }
       break;
     }
     assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
     // For the 64 bit multiply
     Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
     if (IsSigned) {
       // Compare the high half of the product (MULHS) against the sign bit of
       // the low half replicated across all 64 bits.
       SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
       SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
                                       DAG.getConstant(63, DL, MVT::i64));
       // It is important that LowerBits is last, otherwise the arithmetic
       // shift will not be folded into the compare (SUBS).
       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
       Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
                      .getValue(1);
     } else {
       // Compare zero against the high half of the product (MULHU).
       SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
       Overflow =
           DAG.getNode(AArch64ISD::SUBS, DL, VTs,
                       DAG.getConstant(0, DL, MVT::i64),
                       UpperBits).getValue(1);
     }
     break;
   }
   } // switch (...)
 
   // Add/sub cases: a single ADDS/SUBS yields both the value (result #0) and
   // the overflow operand (result #1).
   if (Opc) {
     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
 
     // Emit the AArch64 operation with overflow check.
     Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
     Overflow = Value.getValue(1);
   }
   return std::make_pair(Value, Overflow);
 }
 
 /// Lower \p Op to a call of the runtime library routine \p Call. Every
 /// operand of \p Op is forwarded as a call argument, MVT::f128 is requested as
 /// the return type, and the first element of makeLibCall()'s result is
 /// returned.
 SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
                                              RTLIB::Libcall Call) const {
   SmallVector<SDValue, 2> CallArgs(Op->op_begin(), Op->op_end());
   std::pair<SDValue, SDValue> CallResult =
       makeLibCall(DAG, Call, MVT::f128, CallArgs, /*isSigned=*/false,
                   SDLoc(Op));
   return CallResult.first;
 }
 
 // Returns true if Op refers to result #1 of one of the overflow-checking
 // arithmetic nodes (SADDO, UADDO, SSUBO, USUBO, SMULO, UMULO), i.e. the
 // overflow flag produced by an overflow intrinsic.
 static bool isOverflowIntrOpRes(SDValue Op) {
   // Result #0 is the arithmetic value; only result #1 is the flag.
   if (Op.getResNo() != 1)
     return false;
 
   switch (Op.getOpcode()) {
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO:
   case ISD::USUBO:
   case ISD::SMULO:
   case ISD::UMULO:
     return true;
   default:
     return false;
   }
 }
 
 /// Custom lowering for XOR. Two patterns are rewritten into a CSEL:
 ///
 ///  * (xor overflow_op_bool, 1)
 ///      --> (csel 1, 0, invert(cc), overflow_op_bool)
 ///  * (xor x, (select_cc a, b, cc, 0, -1))
 ///      --> (csel x, (xor x, -1), cc ...)
 ///
 /// Returns an empty SDValue when the overflow op has an illegal type, and
 /// \p Op unchanged when neither pattern matches.
 static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
   SDValue Sel = Op.getOperand(0);
   SDValue Other = Op.getOperand(1);
   SDLoc dl(Sel);
 
   // Negating the flag of an overflow-checking operation: fold the "not" into
   // the condition code. The CSEL later becomes a single cset with an inverted
   // condition rather than a cset + eor sequence.
   const bool NegatesOverflowFlag =
       isOneConstant(Other) && isOverflowIntrOpRes(Sel);
   if (NegatesOverflowFlag) {
     // Only lower legal XALUO ops.
     if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
       return SDValue();
 
     SDValue One = DAG.getConstant(1, dl, MVT::i32);
     SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
     AArch64CC::CondCode OverflowCC;
     SDValue Flags = getAArch64XALUOOp(OverflowCC, Sel.getValue(0), DAG).second;
     SDValue InvertedCC =
         DAG.getConstant(getInvertedCondCode(OverflowCC), dl, MVT::i32);
     return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), One, Zero,
                        InvertedCC, Flags);
   }
 
   // Arrange for Sel to be the SELECT_CC operand; if neither operand is one,
   // there is nothing to do.
   if (Sel.getOpcode() != ISD::SELECT_CC) {
     std::swap(Sel, Other);
     if (Sel.getOpcode() != ISD::SELECT_CC)
       return Op;
   }
 
   // The fold we are after is:
   //   (xor x, (select_cc a, b, cc, 0, -1))  -->  (csel x, (xor x, -1), cc ...)
   // which will get matched to a CSINV instruction.
   ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
   SDValue CmpLHS = Sel.getOperand(0);
   SDValue CmpRHS = Sel.getOperand(1);
 
   // FIXME: This could be generalized to non-integer comparisons.
   const EVT CmpVT = CmpLHS.getValueType();
   if (!(CmpVT == MVT::i32 || CmpVT == MVT::i64))
     return Op;
 
   // Both select arms must be constants for the pattern to apply.
   ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(Sel.getOperand(3));
   ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(Sel.getOperand(2));
   if (!FalseC || !TrueC)
     return Op;
 
   const bool IsZeroThenOnes = TrueC->isNullValue() && FalseC->isAllOnesValue();
   const bool IsOnesThenZero = TrueC->isAllOnesValue() && FalseC->isNullValue();
   if (!IsZeroThenOnes && !IsOnesThenZero)
     return Op;
 
   // A (-1, 0) select is the (0, -1) select under the inverse condition.
   if (IsOnesThenZero)
     CC = ISD::getSetCCInverse(CC, true);
 
   SDValue CCVal;
   SDValue Cmp = getAArch64Cmp(CmpLHS, CmpRHS, CC, CCVal, DAG, dl);
 
   const EVT OtherVT = Other.getValueType();
   SDValue NotOther = DAG.getNode(ISD::XOR, dl, OtherVT, Other,
                                  DAG.getConstant(-1ULL, dl, OtherVT));
 
   return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), Other, NotOther,
                      CCVal, Cmp);
 }
 
 /// Lower ADDC/SUBC/ADDE/SUBE to AArch64ISD::ADDS/SUBS/ADCS/SBCS. The new node
 /// produces two results: one of the original value type and one i32. ADDE and
 /// SUBE additionally forward their third operand.
 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
   const EVT ResultVT = Op.getValueType();
 
   // Let legalize expand this if it isn't a legal type yet.
   if (!DAG.getTargetLoweringInfo().isTypeLegal(ResultVT))
     return SDValue();
 
   SDVTList ResultTys = DAG.getVTList(ResultVT, MVT::i32);
 
   switch (Op.getOpcode()) {
   case ISD::ADDC:
     return DAG.getNode(AArch64ISD::ADDS, SDLoc(Op), ResultTys,
                        Op.getOperand(0), Op.getOperand(1));
   case ISD::SUBC:
     return DAG.getNode(AArch64ISD::SUBS, SDLoc(Op), ResultTys,
                        Op.getOperand(0), Op.getOperand(1));
   case ISD::ADDE:
     return DAG.getNode(AArch64ISD::ADCS, SDLoc(Op), ResultTys,
                        Op.getOperand(0), Op.getOperand(1), Op.getOperand(2));
   case ISD::SUBE:
     return DAG.getNode(AArch64ISD::SBCS, SDLoc(Op), ResultTys,
                        Op.getOperand(0), Op.getOperand(1), Op.getOperand(2));
   default:
     llvm_unreachable("Invalid code");
   }
 }
 
 /// Lower an overflow-checking arithmetic node to the pair produced by
 /// getAArch64XALUOOp(), turning the overflow condition into an i32 0/1 value
 /// with a CSEL, and return both results merged into one node.
 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
   const EVT ResultVT = Op.getValueType();
 
   // Let legalize expand this if it isn't a legal type yet.
   if (!DAG.getTargetLoweringInfo().isTypeLegal(ResultVT))
     return SDValue();
 
   SDLoc dl(Op);
 
   // Emit the operation that sets the overflow or carry flag; CC receives the
   // condition under which overflow occurred.
   AArch64CC::CondCode CC;
   std::pair<SDValue, SDValue> Lowered = getAArch64XALUOOp(CC, Op, DAG);
 
   // 1 and 0 stand for true and false.
   SDValue One = DAG.getConstant(1, dl, MVT::i32);
   SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
 
   // The select operands are passed as (0, 1) and the condition is inverted to
   // match, which allows selection to a single instruction:
   // CSINC Wd, WZR, WZR, invert(cond).
   SDValue InvertedCC = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
   SDValue OverflowBit = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, Zero, One,
                                     InvertedCC, Lowered.second);
 
   SDVTList ResultTys = DAG.getVTList(ResultVT, MVT::i32);
   return DAG.getNode(ISD::MERGE_VALUES, dl, ResultTys, Lowered.first,
                      OverflowBit);
 }
 
 // Lower a prefetch node to AArch64ISD::PREFETCH.
 //
 // Prefetch operands are:
 // 1: Address to prefetch
 // 2: bool isWrite
 // 3: int locality (0 = no locality ... 3 = extreme locality)
 // 4: bool isDataCache
 // Operand 0 is forwarded unchanged as the first operand of the new node.
 //
 // The i32 immediate handed to the new node is laid out as:
 //   bit  4   : isWrite
 //   bit  3   : set when isDataCache is zero
 //   bits 2:1 : cache level (3 - locality, or 0 when locality is 0)
 //   bit  0   : stream bit, set when locality is 0
 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
 
   auto ConstOperand = [&Op](unsigned Idx) -> unsigned {
     return cast<ConstantSDNode>(Op.getOperand(Idx))->getZExtValue();
   };
   const unsigned IsWrite = ConstOperand(2);
   const unsigned Locality = ConstOperand(3);
   const unsigned IsData = ConstOperand(4);
 
   // No locality at all means a streaming prefetch.
   const unsigned StreamBit = (Locality == 0) ? 1u : 0u;
 
   unsigned CacheLevel = 0;
   if (Locality != 0) {
     // The front-end should have filtered out the out-of-range values
     assert(Locality <= 3 && "Prefetch locality out-of-range");
     // The locality degree is the opposite of the cache speed, so flip the
     // number around. The encoding starts at 0 for level 1.
     CacheLevel = 3 - Locality;
   }
 
   const unsigned NotDataBit = (IsData == 0) ? 1u : 0u;
 
   // Build the mask value encoding the expected behavior.
   const unsigned PrfOp = (IsWrite << 4) |    // Load/Store bit
                          (NotDataBit << 3) | // IsDataCache bit
                          (CacheLevel << 1) | // Cache level bits
                          StreamBit;          // Stream bit
   return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
                      DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
 }
 
 /// Lower an FP_EXTEND whose result is f128 to the matching runtime library
 /// call. Any other result type is unexpected here.
 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
                                               SelectionDAG &DAG) const {
   const EVT DstVT = Op.getValueType();
   assert(DstVT == MVT::f128 && "Unexpected lowering");
 
   const EVT SrcVT = Op.getOperand(0).getValueType();
   return LowerF128Call(Op, DAG, RTLIB::getFPEXT(SrcVT, DstVT));
 }
 
 /// Lower FP_ROUND. Only a source of type f128 needs work: it becomes a runtime
 /// library call. Every other source type is returned unchanged.
 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDValue Src = Op.getOperand(0);
   const EVT SrcVT = Src.getValueType();
 
   // It's legal except when f128 is involved
   if (SrcVT != MVT::f128)
     return Op;
 
   const EVT DstVT = Op.getValueType();
   RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
 
   // FP_ROUND node has a second operand indicating whether it is known to be
   // precise. That doesn't take part in the LibCall so we can't directly use
   // LowerF128Call; pass the source value alone instead.
   std::pair<SDValue, SDValue> CallResult =
       makeLibCall(DAG, LC, DstVT, Src, /*isSigned*/ false, SDLoc(Op));
   return CallResult.first;
 }
 
 /// Lower a vector FP -> integer conversion:
 ///  * f16 element sources are first extended to f32 elements;
 ///  * a result narrower than the source is converted at the source width and
 ///    then truncated;
 ///  * a result wider than the source has its source FP_EXTENDed to the result
 ///    element width before converting;
 ///  * same-size conversions are returned unchanged.
 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
   // Any additional optimization in this function should be recorded
   // in the cost tables.
   SDValue Src = Op.getOperand(0);
   const EVT SrcVT = Src.getValueType();
   const EVT DstVT = Op.getValueType();
   const unsigned ConvOpc = Op.getOpcode();
   const unsigned NumElts = SrcVT.getVectorNumElements();
   SDLoc dl(Op);
 
   // f16 vectors are promoted to f32 before a conversion.
   if (SrcVT.getVectorElementType() == MVT::f16) {
     MVT PromotedVT = MVT::getVectorVT(MVT::f32, NumElts);
     return DAG.getNode(ConvOpc, dl, DstVT,
                        DAG.getNode(ISD::FP_EXTEND, dl, PromotedVT, Src));
   }
 
   const unsigned SrcBits = SrcVT.getSizeInBits();
   const unsigned DstBits = DstVT.getSizeInBits();
 
   // Narrowing: convert to integers of the source width, then truncate.
   if (DstBits < SrcBits) {
     SDValue Converted = DAG.getNode(
         ConvOpc, dl, SrcVT.changeVectorElementTypeToInteger(), Src);
     return DAG.getNode(ISD::TRUNCATE, dl, DstVT, Converted);
   }
 
   // Widening: extend the FP source to the result's element width first.
   if (DstBits > SrcBits) {
     MVT WideFPVT =
         MVT::getVectorVT(MVT::getFloatingPointVT(DstVT.getScalarSizeInBits()),
                          DstVT.getVectorNumElements());
     SDValue Extended = DAG.getNode(ISD::FP_EXTEND, dl, WideFPVT, Src);
     return DAG.getNode(ConvOpc, dl, DstVT, Extended);
   }
 
   // Type changing conversions are illegal.
   return Op;
 }
 
 /// Lower FP_TO_SINT / FP_TO_UINT. Vector sources are delegated to
 /// LowerVectorFP_TO_INT, f16 sources are promoted to f32 when full fp16 is
 /// unavailable, f128 sources become runtime library calls, and everything else
 /// is returned unchanged.
 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
                                               SelectionDAG &DAG) const {
   SDValue Src = Op.getOperand(0);
   const EVT SrcVT = Src.getValueType();
   const EVT DstVT = Op.getValueType();
 
   if (SrcVT.isVector())
     return LowerVectorFP_TO_INT(Op, DAG);
 
   // f16 conversions are promoted to f32 when full fp16 is not supported.
   if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
     SDLoc dl(Op);
     SDValue AsF32 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Src);
     return DAG.getNode(Op.getOpcode(), dl, DstVT, AsF32);
   }
 
   // It's legal except when f128 is involved
   if (SrcVT != MVT::f128)
     return Op;
 
   const bool IsSignedConv = Op.getOpcode() == ISD::FP_TO_SINT;
   RTLIB::Libcall LC = IsSignedConv ? RTLIB::getFPTOSINT(SrcVT, DstVT)
                                    : RTLIB::getFPTOUINT(SrcVT, DstVT);
 
   SmallVector<SDValue, 2> CallArgs(Op->op_begin(), Op->op_end());
   std::pair<SDValue, SDValue> CallResult =
       makeLibCall(DAG, LC, DstVT, CallArgs, false, SDLoc(Op));
   return CallResult.first;
 }
 
 /// Lower a vector integer -> FP conversion:
 ///  * a result narrower than the source is converted at the source element
 ///    width and then FP_ROUNDed;
 ///  * a result wider than the source has its integer source sign- or
 ///    zero-extended (matching the conversion's signedness) before converting;
 ///  * same-size conversions are returned unchanged.
 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
   // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
   // Any additional optimization in this function should be recorded
   // in the cost tables.
   const EVT DstVT = Op.getValueType();
   SDLoc dl(Op);
   SDValue Src = Op.getOperand(0);
   const EVT SrcVT = Src.getValueType();
   const unsigned ConvOpc = Op.getOpcode();
 
   const unsigned DstBits = DstVT.getSizeInBits();
   const unsigned SrcBits = SrcVT.getSizeInBits();
 
   // Narrowing: convert at full width, then round down to the result type.
   if (DstBits < SrcBits) {
     MVT WideFPVT =
         MVT::getVectorVT(MVT::getFloatingPointVT(SrcVT.getScalarSizeInBits()),
                          SrcVT.getVectorNumElements());
     SDValue Converted = DAG.getNode(ConvOpc, dl, WideFPVT, Src);
     return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Converted,
                        DAG.getIntPtrConstant(0, dl));
   }
 
   // Widening: extend the integer source to the result's element width.
   if (DstBits > SrcBits) {
     unsigned ExtendOpc = ISD::ZERO_EXTEND;
     if (ConvOpc == ISD::SINT_TO_FP)
       ExtendOpc = ISD::SIGN_EXTEND;
     const EVT WideIntVT = DstVT.changeVectorElementTypeToInteger();
     SDValue Extended = DAG.getNode(ExtendOpc, dl, WideIntVT, Src);
     return DAG.getNode(ConvOpc, dl, DstVT, Extended);
   }
 
   return Op;
 }
 
 /// Lower SINT_TO_FP / UINT_TO_FP. Vector results are delegated to
 /// LowerVectorINT_TO_FP, f16 results go through f32 when full fp16 is
 /// unavailable, i128 sources yield an empty SDValue, f128 results become
 /// runtime library calls, and everything else is returned unchanged.
 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
                                             SelectionDAG &DAG) const {
   const EVT DstVT = Op.getValueType();
 
   if (DstVT.isVector())
     return LowerVectorINT_TO_FP(Op, DAG);
 
   // f16 conversions are promoted to f32 when full fp16 is not supported.
   if (DstVT == MVT::f16 && !Subtarget->hasFullFP16()) {
     SDLoc dl(Op);
     return DAG.getNode(
         ISD::FP_ROUND, dl, MVT::f16,
         DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
         DAG.getIntPtrConstant(0, dl));
   }
 
   const EVT SrcVT = Op.getOperand(0).getValueType();
 
   // i128 conversions are libcalls.
   if (SrcVT == MVT::i128)
     return SDValue();
 
   // Other conversions are legal, unless it's to the completely software-based
   // fp128.
   if (DstVT != MVT::f128)
     return Op;
 
   const bool IsSignedConv = Op.getOpcode() == ISD::SINT_TO_FP;
   RTLIB::Libcall LC = IsSignedConv ? RTLIB::getSINTTOFP(SrcVT, DstVT)
                                    : RTLIB::getUINTTOFP(SrcVT, DstVT);
 
   return LowerF128Call(Op, DAG, LC);
 }
 
 /// Lower FSINCOS to a call of the SINCOS_STRET_F32 / SINCOS_STRET_F64 runtime
 /// routine. The call takes the single FP argument and is declared to return a
 /// struct of two values of the argument type.
 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
                                             SelectionDAG &DAG) const {
   // For iOS, we want to call an alternative entry point: __sincos_stret,
   // which returns the values in two S / D registers.
   SDLoc dl(Op);
   SDValue Input = Op.getOperand(0);
   const EVT InputVT = Input.getValueType();
   Type *InputTy = InputVT.getTypeForEVT(*DAG.getContext());
 
   // The one and only argument, with no sign- or zero-extension requested.
   ArgListEntry InputEntry;
   InputEntry.Node = Input;
   InputEntry.Ty = InputTy;
   InputEntry.IsSExt = false;
   InputEntry.IsZExt = false;
 
   ArgListTy CallArgs;
   CallArgs.push_back(InputEntry);
 
   // Anything that is not f64 uses the f32 variant.
   RTLIB::Libcall LC = RTLIB::SINCOS_STRET_F32;
   if (InputVT == MVT::f64)
     LC = RTLIB::SINCOS_STRET_F64;
 
   const char *RoutineName = getLibcallName(LC);
   SDValue Callee =
       DAG.getExternalSymbol(RoutineName, getPointerTy(DAG.getDataLayout()));
 
   StructType *PairTy = StructType::get(InputTy, InputTy);
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl)
       .setChain(DAG.getEntryNode())
       .setLibCallee(CallingConv::Fast, PairTy, Callee, std::move(CallArgs));
 
   return LowerCallTo(CLI).first;
 }
 
 /// Lower a BITCAST that produces f16 (from an i16 operand): any-extend the
 /// operand to i32, bitcast that to f32, then extract the hsub subregister as
 /// f16. Other result types yield an empty SDValue.
 static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
   if (Op.getValueType() != MVT::f16)
     return SDValue();
 
   SDValue Src = Op.getOperand(0);
   assert(Src.getValueType() == MVT::i16);
   SDLoc DL(Op);
 
   SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
   SDValue AsF32 = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Widened);
   SDValue SubRegIdx = DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32);
   return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16,
                                     AsF32, SubRegIdx),
                  0);
 }
 
 /// Return the vector type that \p OrigVT should be extended to so that it is
 /// 64 bits wide. Types that are already 64 bits or larger are returned as-is;
 /// only v2i8, v2i16 and v4i8 are handled among the smaller ones.
 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
   const bool AlreadyWideEnough = OrigVT.getSizeInBits() >= 64;
   if (AlreadyWideEnough)
     return OrigVT;
 
   assert(OrigVT.isSimple() && "Expecting a simple value type");
 
   switch (OrigVT.getSimpleVT().SimpleTy) {
   case MVT::v4i8:
     return MVT::v4i16;
   case MVT::v2i8:
   case MVT::v2i16:
     return MVT::v2i32;
   default:
     llvm_unreachable("Unexpected Vector Type");
   }
 }
 
 /// \p N had type \p OrigTy before being extended (with \p ExtOpcode) to the
 /// 128-bit type \p ExtTy. Return \p N unchanged if \p OrigTy is already at
 /// least 64 bits wide; otherwise re-extend it with \p ExtOpcode to the 64-bit
 /// type picked by getExtensionTo64Bits().
 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
                                                  const EVT &OrigTy,
                                                  const EVT &ExtTy,
                                                  unsigned ExtOpcode) {
   assert(ExtTy.is128BitVector() && "Unexpected extension size");
 
   const bool NeedsWidening = OrigTy.getSizeInBits() < 64;
   if (!NeedsWidening)
     return N;
 
   // Must extend size to at least 64 bits to be used as an operand for VMULL.
   const EVT WidenedVT = getExtensionTo64Bits(OrigTy);
   SDLoc DL(N);
   return DAG.getNode(ExtOpcode, DL, WidenedVT, N);
 }
 
 /// Return true if \p N is a BUILD_VECTOR whose operands are all integer
 /// constants that fit in half of the vector's element width, interpreted as
 /// signed or unsigned according to \p isSigned.
 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                    bool isSigned) {
   if (N->getOpcode() != ISD::BUILD_VECTOR)
     return false;
 
   const unsigned HalfEltBits = N->getValueType(0).getScalarSizeInBits() / 2;
 
   for (const SDValue &Elt : N->op_values()) {
     // Any non-constant element disqualifies the vector.
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt);
     if (!C)
       return false;
 
     const bool Fits = isSigned ? isIntN(HalfEltBits, C->getSExtValue())
                                : isUIntN(HalfEltBits, C->getZExtValue());
     if (!Fits)
       return false;
   }
 
   return true;
 }
 
 /// Produce the half-width operand for a widening multiply from \p N.
 ///
 /// For a SIGN_EXTEND / ZERO_EXTEND the extension is looked through (with a
 /// smaller re-extension added if the source is under 64 bits). Otherwise
 /// \p N must be a BUILD_VECTOR of constants, which is rebuilt with elements
 /// of half the original width.
 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
   const unsigned Opc = N->getOpcode();
   if (Opc == ISD::SIGN_EXTEND || Opc == ISD::ZERO_EXTEND) {
     SDValue Src = N->getOperand(0);
     return addRequiredExtensionForVectorMULL(Src, DAG, Src->getValueType(0),
                                              N->getValueType(0), Opc);
   }
 
   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
   const EVT VT = N->getValueType(0);
   SDLoc dl(N);
   const unsigned NarrowEltBits = VT.getScalarSizeInBits() / 2;
   const unsigned NumElts = VT.getVectorNumElements();
   MVT NarrowEltVT = MVT::getIntegerVT(NarrowEltBits);
 
   SmallVector<SDValue, 8> Elts;
   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
     const APInt &Val =
         cast<ConstantSDNode>(N->getOperand(Idx))->getAPIntValue();
     // Element types smaller than 32 bits are not legal, so use i32 elements.
     // The values are implicitly truncated so sext vs. zext doesn't matter.
     Elts.push_back(DAG.getConstant(Val.zextOrTrunc(32), dl, MVT::i32));
   }
   return DAG.getBuildVector(MVT::getVectorVT(NarrowEltVT, NumElts), dl, Elts);
 }
 
 /// True if \p N is a SIGN_EXTEND, or a constant BUILD_VECTOR accepted by
 /// isExtendedBUILD_VECTOR() in signed mode.
 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
   if (N->getOpcode() == ISD::SIGN_EXTEND)
     return true;
   return isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/true);
 }
 
 /// True if \p N is a ZERO_EXTEND, or a constant BUILD_VECTOR accepted by
 /// isExtendedBUILD_VECTOR() in unsigned mode.
 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
   if (N->getOpcode() == ISD::ZERO_EXTEND)
     return true;
   return isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/false);
 }
 
 /// True if \p N is an ADD or SUB whose two operands each have a single use
 /// and are both sign-extended (see isSignExtended()).
 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
   const unsigned Opcode = N->getOpcode();
   if (Opcode != ISD::ADD && Opcode != ISD::SUB)
     return false;
 
   SDNode *LHS = N->getOperand(0).getNode();
   SDNode *RHS = N->getOperand(1).getNode();
   if (!LHS->hasOneUse() || !RHS->hasOneUse())
     return false;
 
   return isSignExtended(LHS, DAG) && isSignExtended(RHS, DAG);
 }
 
 /// True if \p N is an ADD or SUB whose two operands each have a single use
 /// and are both zero-extended (see isZeroExtended()).
 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
   const unsigned Opcode = N->getOpcode();
   if (Opcode != ISD::ADD && Opcode != ISD::SUB)
     return false;
 
   SDNode *LHS = N->getOperand(0).getNode();
   SDNode *RHS = N->getOperand(1).getNode();
   if (!LHS->hasOneUse() || !RHS->hasOneUse())
     return false;
 
   return isZeroExtended(LHS, DAG) && isZeroExtended(RHS, DAG);
 }
 
 /// Lower FLT_ROUNDS_ by reading the FPCR through the aarch64_get_fpcr
 /// intrinsic and remapping its rounding-mode field.
 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                                 SelectionDAG &DAG) const {
   // The rounding mode is in bits 23:22 of the FPCR.
   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
   // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3
   // so that the shift + and get folded into a bitfield extract.
   SDLoc dl(Op);
 
   // Read the 64-bit register value and keep its low 32 bits.
   SDValue IntrinsicID =
       DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64);
   SDValue Fpcr64 =
       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64, IntrinsicID);
   SDValue Fpcr32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Fpcr64);
 
   // Add one to the two-bit field in place ...
   SDValue FieldIncrement = DAG.getConstant(1U << 22, dl, MVT::i32);
   SDValue Bumped = DAG.getNode(ISD::ADD, dl, MVT::i32, Fpcr32, FieldIncrement);
 
   // ... then move it down to bit 0 and mask off everything else.
   SDValue FieldShift = DAG.getConstant(22, dl, MVT::i32);
   SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i32, Bumped, FieldShift);
   SDValue FieldMask = DAG.getConstant(3, dl, MVT::i32);
   return DAG.getNode(ISD::AND, dl, MVT::i32, Shifted, FieldMask);
 }
 
 /// Custom lowering for a 128-bit integer vector MUL.
 ///
 /// When both operands are sign-extended (or both zero-extended) the multiply
 /// becomes AArch64ISD::SMULL (or UMULL) on the un-extended operands. When one
 /// operand is an extension and the other an ADD/SUB of two extensions, the
 /// multiply is distributed over the ADD/SUB into two S/UMULL nodes.
 ///
 /// \returns the new node; an empty SDValue for an unmatched v2i64 multiply;
 ///          or \p Op unchanged for any other unmatched multiply.
 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   // Multiplications are only custom-lowered for 128-bit vectors so that
   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
   EVT VT = Op.getValueType();
   assert(VT.is128BitVector() && VT.isInteger() &&
          "unexpected type for custom-lowering ISD::MUL");
   SDNode *N0 = Op.getOperand(0).getNode();
   SDNode *N1 = Op.getOperand(1).getNode();
   // NewOpc stays 0 until a widening-multiply pattern has been recognised.
   unsigned NewOpc = 0;
   // Set when N0 is an ADD/SUB of extended values that the multiply should be
   // distributed over.
   bool isMLA = false;
   bool isN0SExt = isSignExtended(N0, DAG);
   bool isN1SExt = isSignExtended(N1, DAG);
   if (isN0SExt && isN1SExt)
     NewOpc = AArch64ISD::SMULL;
   else {
     bool isN0ZExt = isZeroExtended(N0, DAG);
     bool isN1ZExt = isZeroExtended(N1, DAG);
     if (isN0ZExt && isN1ZExt)
       NewOpc = AArch64ISD::UMULL;
     else if (isN1SExt || isN1ZExt) {
       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
       if (isN1SExt && isAddSubSExt(N0, DAG)) {
         NewOpc = AArch64ISD::SMULL;
         isMLA = true;
       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
         NewOpc =  AArch64ISD::UMULL;
         isMLA = true;
       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
         // NOTE(review): this arm looks unreachable. Inside this branch N1 is
         // an extension or a BUILD_VECTOR, whereas isAddSubZExt() only accepts
         // an ADD or SUB node. Confirm whether the enclosing condition was
         // meant to test N0 as well.
         std::swap(N0, N1);
         NewOpc =  AArch64ISD::UMULL;
         isMLA = true;
       }
     }
 
     if (!NewOpc) {
       if (VT == MVT::v2i64)
         // Fall through to expand this.  It is not legal.
         return SDValue();
       else
         // Other vector multiplications are legal.
         return Op;
     }
   }
 
   // Legalize to a S/UMULL instruction
   SDLoc DL(Op);
   SDValue Op0;
   SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
   if (!isMLA) {
     Op0 = skipExtensionForVectorMULL(N0, DAG);
     assert(Op0.getValueType().is64BitVector() &&
            Op1.getValueType().is64BitVector() &&
            "unexpected types for extended operands to VMULL");
     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
   }
   // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
   // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
   // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
   SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
   SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
   EVT Op1VT = Op1.getValueType();
   // Recombine the two products with N0's own opcode (ADD or SUB); each addend
   // is bitcast to Op1's type before being multiplied.
   return DAG.getNode(N0->getOpcode(), DL, VT,
                      DAG.getNode(NewOpc, DL, VT,
                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                      DAG.getNode(NewOpc, DL, VT,
                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
 }
 
 // Lower vector multiply high (ISD::MULHS and ISD::MULHU) on a 128-bit integer
 // vector to two widening multiplies followed by a UZP2.
 static SDValue LowerMULH(SDValue Op, SelectionDAG &DAG) {
   // Multiplications are only custom-lowered for 128-bit vectors so that
   // {S,U}MULL{2} can be detected.  Otherwise v2i64 multiplications are not
   // legal.
   const EVT VT = Op.getValueType();
   assert(VT.is128BitVector() && VT.isInteger() &&
          "unexpected type for custom-lowering ISD::MULH{U,S}");
 
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
 
   SDLoc DL(Op);
 
   const EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
 
   // We turn (V0 mulhs/mulhu V1) into:
   //
   // (uzp2 (smull (extract_subvector (HalfVT V128:V0, (i64 0)),
   //              (extract_subvector (HalfVT V128:V1, (i64 0))))),
   //       (smull (extract_subvector (HalfVT V128:V0, (i64 HighIdx)),
   //              (extract_subvector (HalfVT V128:V1, (i64 HighIdx))))))
   //
   // where HalfVT is a subvector with half the number of elements, and HighIdx
   // is the index of the middle element (the start of the high part).
   //
   // The high-part extract and multiply will be matched against
   // {S,U}MULL{v16i8_v8i16,v8i16_v4i32,v4i32_v2i64}, which in turn will issue
   // a {s,u}mull2 instruction.
   //
   // In short: multiply the low subvectors with '{s,u}mull', the high
   // subvectors with '{s,u}mull2', and gather the high half of every product
   // into the resulting vector.
   const unsigned HighStart = VT.getVectorNumElements() / 2;
   SDValue LowIdx = DAG.getConstant(0, DL, MVT::i64);
   SDValue HighIdx = DAG.getConstant(HighStart, DL, MVT::i64);
 
   auto ExtractHalf = [&](SDValue Vec, SDValue Idx) {
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec, Idx);
   };
 
   SDValue LowLHS = ExtractHalf(LHS, LowIdx);
   SDValue LowRHS = ExtractHalf(RHS, LowIdx);
   SDValue HighLHS = ExtractHalf(LHS, HighIdx);
   SDValue HighRHS = ExtractHalf(RHS, HighIdx);
 
   // MULHS uses the signed widening multiply; anything else the unsigned one.
   unsigned WideMulOpc = AArch64ISD::UMULL;
   if (Op.getOpcode() == ISD::MULHS)
     WideMulOpc = AArch64ISD::SMULL;
 
   const EVT WideVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
   SDValue LowProduct = DAG.getNode(WideMulOpc, DL, WideVT, LowLHS, LowRHS);
   SDValue HighProduct = DAG.getNode(WideMulOpc, DL, WideVT, HighLHS, HighRHS);
 
   // View each product as a vector of the original element type before the
   // UZP2.
   LowProduct = DAG.getNode(ISD::BITCAST, DL, VT, LowProduct);
   HighProduct = DAG.getNode(ISD::BITCAST, DL, VT, HighProduct);
 
   return DAG.getNode(AArch64ISD::UZP2, DL, VT, LowProduct, HighProduct);
 }
 
 // Custom-lower the chain-less intrinsics that have a direct DAG equivalent.
 // Operand 0 of the node is the intrinsic id; the intrinsic's own arguments
 // start at operand 1.  Returns an empty SDValue for every other intrinsic.
 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                      SelectionDAG &DAG) const {
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   SDLoc dl(Op);
 
   // The NEON min/max intrinsics only differ in the generic opcode they map
   // to, so the switch selects that opcode and the node is built once below.
   unsigned MinMaxOpc;
   switch (IntNo) {
   case Intrinsic::thread_pointer: {
     EVT PointerVT = getPointerTy(DAG.getDataLayout());
     return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PointerVT);
   }
   case Intrinsic::aarch64_neon_abs:
     return DAG.getNode(ISD::ABS, dl, Op.getValueType(),
                        Op.getOperand(1));
   case Intrinsic::aarch64_neon_smax:
     MinMaxOpc = ISD::SMAX;
     break;
   case Intrinsic::aarch64_neon_umax:
     MinMaxOpc = ISD::UMAX;
     break;
   case Intrinsic::aarch64_neon_smin:
     MinMaxOpc = ISD::SMIN;
     break;
   case Intrinsic::aarch64_neon_umin:
     MinMaxOpc = ISD::UMIN;
     break;
   default:
     // Don't custom lower most intrinsics.
     return SDValue();
   }
 
   return DAG.getNode(MinMaxOpc, dl, Op.getValueType(),
                      Op.getOperand(1), Op.getOperand(2));
 }
 
 // Custom lowering of a truncating store of v4i16 to v4i8 (v4i8 having been
 // promoted to v4i16).
 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
                                         EVT VT, EVT MemVT,
                                         SelectionDAG &DAG) {
   assert(VT.isVector() && "VT should be a vector type");
   assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
 
   SDValue StoredVal = ST->getValue();
 
   // The promoted v4i16 is first extended to v8i16 (upper half undefined),
   // truncated to v8i8, and then the 32-bit lane holding the v4i8 subvector is
   // extracted.  The store is thereby optimized to:
   //
   //   xtn  v0.8b, v0.8h
   //   str  s0, [x0]
 
   // Undefined upper half used to pad the value to a 128-bit vector.
   SDValue UndefElt = DAG.getUNDEF(MVT::i16);
   SDValue UndefHalf = DAG.getBuildVector(MVT::v4i16, DL,
                                          {UndefElt, UndefElt, UndefElt,
                                           UndefElt});
 
   SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
                                 StoredVal, UndefHalf);
   SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Widened);
 
   // View the eight bytes as two words and keep word 0.
   SDValue AsWords = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Narrowed);
   SDValue Lane0 = DAG.getConstant(0, DL, MVT::i64);
   SDValue Word = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
                              AsWords, Lane0);
 
   // Store that word using the original chain, address and memory operand.
   return DAG.getStore(ST->getChain(), DL, Word,
                       ST->getBasePtr(), ST->getMemOperand());
 }
 
 // Custom lowering entry point for stores.  Only vector stores are expected
 // (asserted below).  Under-aligned stores the target cannot perform are
 // scalarized; truncating stores are handed to LowerTruncateVectorStore;
 // anything else returns an empty SDValue.
 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
                                           SelectionDAG &DAG) const {
   SDLoc Dl(Op);
   StoreSDNode *ST = cast<StoreSDNode>(Op);
   assert (ST && "Can only custom lower store nodes");
 
   EVT ValueVT = ST->getValue().getValueType();
   EVT MemVT = ST->getMemoryVT();
 
   assert (ValueVT.isVector() && "Can only custom lower vector store types");
 
   // A store aligned to less than its own size must be explicitly allowed by
   // the target; otherwise break it up into scalar stores.
   unsigned AddrSpace = ST->getAddressSpace();
   unsigned Alignment = ST->getAlignment();
   bool IsUnderAligned = Alignment < MemVT.getStoreSize();
   if (IsUnderAligned &&
       !allowsMisalignedMemoryAccesses(MemVT, AddrSpace, Alignment, nullptr))
     return scalarizeVectorStore(ST, DAG);
 
   return ST->isTruncatingStore()
              ? LowerTruncateVectorStore(Dl, ST, ValueVT, MemVT, DAG)
              : SDValue();
 }
 
 /// Central dispatch for custom lowering: forwards \p Op to the Lower* routine
 /// that handles its opcode and returns that routine's result.  An opcode
 /// without a case below hits llvm_unreachable.
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");
   LLVM_DEBUG(Op.dump());
 
   switch (Op.getOpcode()) {
   default:
     llvm_unreachable("unimplemented operand");
     return SDValue();
   case ISD::BITCAST:
     return LowerBITCAST(Op, DAG);
   case ISD::GlobalAddress:
     return LowerGlobalAddress(Op, DAG);
   case ISD::GlobalTLSAddress:
     return LowerGlobalTLSAddress(Op, DAG);
   case ISD::SETCC:
     return LowerSETCC(Op, DAG);
   case ISD::BR_CC:
     return LowerBR_CC(Op, DAG);
   case ISD::SELECT:
     return LowerSELECT(Op, DAG);
   case ISD::SELECT_CC:
     return LowerSELECT_CC(Op, DAG);
   case ISD::JumpTable:
     return LowerJumpTable(Op, DAG);
   case ISD::ConstantPool:
     return LowerConstantPool(Op, DAG);
   case ISD::BlockAddress:
     return LowerBlockAddress(Op, DAG);
   case ISD::VASTART:
     return LowerVASTART(Op, DAG);
   case ISD::VACOPY:
     return LowerVACOPY(Op, DAG);
   case ISD::VAARG:
     return LowerVAARG(Op, DAG);
   // Carry-using/producing add and subtract share one routine.
   case ISD::ADDC:
   case ISD::ADDE:
   case ISD::SUBC:
   case ISD::SUBE:
     return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   // All arithmetic-with-overflow nodes share one routine.
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO:
   case ISD::USUBO:
   case ISD::SMULO:
   case ISD::UMULO:
     return LowerXALUO(Op, DAG);
   // The four basic FP operations go through LowerF128Call, each with the
   // corresponding *_F128 runtime library call id.
   case ISD::FADD:
     return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
   case ISD::FSUB:
     return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
   case ISD::FMUL:
     return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
   case ISD::FDIV:
     return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
   case ISD::FP_ROUND:
     return LowerFP_ROUND(Op, DAG);
   case ISD::FP_EXTEND:
     return LowerFP_EXTEND(Op, DAG);
   case ISD::FRAMEADDR:
     return LowerFRAMEADDR(Op, DAG);
   case ISD::RETURNADDR:
     return LowerRETURNADDR(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:
     return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT:
     return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::BUILD_VECTOR:
     return LowerBUILD_VECTOR(Op, DAG);
   case ISD::VECTOR_SHUFFLE:
     return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return LowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::SRA:
   case ISD::SRL:
   case ISD::SHL:
     return LowerVectorSRA_SRL_SHL(Op, DAG);
   case ISD::SHL_PARTS:
     return LowerShiftLeftParts(Op, DAG);
   // Both right-shift-parts flavours share one routine.
   case ISD::SRL_PARTS:
   case ISD::SRA_PARTS:
     return LowerShiftRightParts(Op, DAG);
   case ISD::CTPOP:
     return LowerCTPOP(Op, DAG);
   case ISD::FCOPYSIGN:
     return LowerFCOPYSIGN(Op, DAG);
   case ISD::AND:
     return LowerVectorAND(Op, DAG);
   case ISD::OR:
     return LowerVectorOR(Op, DAG);
   case ISD::XOR:
     return LowerXOR(Op, DAG);
   case ISD::PREFETCH:
     return LowerPREFETCH(Op, DAG);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
     return LowerINT_TO_FP(Op, DAG);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
     return LowerFP_TO_INT(Op, DAG);
   case ISD::FSINCOS:
     return LowerFSINCOS(Op, DAG);
   case ISD::FLT_ROUNDS_:
     return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::MUL:
     return LowerMUL(Op, DAG);
   case ISD::MULHS:
   case ISD::MULHU:
     return LowerMULH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::STORE:
     return LowerSTORE(Op, DAG);
   // Every supported vector reduction shares one routine.
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_SMAX:
   case ISD::VECREDUCE_SMIN:
   case ISD::VECREDUCE_UMAX:
   case ISD::VECREDUCE_UMIN:
   case ISD::VECREDUCE_FMAX:
   case ISD::VECREDUCE_FMIN:
     return LowerVECREDUCE(Op, DAG);
   case ISD::ATOMIC_LOAD_SUB:
     return LowerATOMIC_LOAD_SUB(Op, DAG);
   case ISD::ATOMIC_LOAD_AND:
     return LowerATOMIC_LOAD_AND(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
   }
 }
 
 //===----------------------------------------------------------------------===//
 //                      Calling Convention Implementation
 //===----------------------------------------------------------------------===//
 
 #include "AArch64GenCallingConv.inc"
 
 /// Pick the argument-assignment function for calling convention \p CC.
 /// \p IsVarArg selects the variadic flavour where the convention has one.
 /// Aborts via report_fatal_error for a convention not listed here.
 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                      bool IsVarArg) const {
   switch (CC) {
   case CallingConv::WebKit_JS:
     return CC_AArch64_WebKit_JS;
   case CallingConv::GHC:
     return CC_AArch64_GHC;
   case CallingConv::Win64:
     if (IsVarArg)
       return CC_AArch64_Win64_VarArg;
     return CC_AArch64_AAPCS;
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::PreserveMost:
   case CallingConv::CXX_FAST_TLS:
   case CallingConv::Swift:
     // Variadic calls on a Windows target always use the Win64 vararg rules.
     if (IsVarArg && Subtarget->isTargetWindows())
       return CC_AArch64_Win64_VarArg;
     // Darwin has its own PCS, with a separate variadic variant.
     if (Subtarget->isTargetDarwin()) {
       if (IsVarArg)
         return CC_AArch64_DarwinPCS_VarArg;
       return CC_AArch64_DarwinPCS;
     }
     return CC_AArch64_AAPCS;
   default:
     report_fatal_error("Unsupported calling convention.");
   }
 }
 
 /// Pick the return-value assignment function for calling convention \p CC:
 /// WebKit_JS has its own, every other convention uses the AAPCS one.
 CCAssignFn *
 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
   if (CC == CallingConv::WebKit_JS)
     return RetCC_AArch64_WebKit_JS;
   return RetCC_AArch64_AAPCS;
 }
 
 /// Lower the incoming arguments of the current function.
 ///
 /// Every entry of \p Ins is first assigned a location with the assign function
 /// returned by CCAssignFnForCall.  Then one SDValue per argument is appended
 /// to \p InVals: a frame index for byval arguments, a CopyFromReg of a new
 /// live-in virtual register for register locations, or a load from a fixed
 /// stack object for memory locations.  For variadic functions the vararg
 /// bookkeeping is recorded in AArch64FunctionInfo, and in all cases the size
 /// of the stack argument area is recorded there as well.
 ///
 /// \returns the chain, which saveVarArgRegisters may have replaced (it takes
 /// the chain by reference).
 SDValue AArch64TargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                  *DAG.getContext());
 
   // At this point, Ins[].VT may already be promoted to i32. To correctly
   // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
   // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
   // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
   // we use a special version of AnalyzeFormalArguments to pass in ValVT and
   // LocVT.
   unsigned NumArgs = Ins.size();
   Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
   unsigned CurArgIdx = 0;
   for (unsigned i = 0; i != NumArgs; ++i) {
     MVT ValVT = Ins[i].VT;
     if (Ins[i].isOrigArg()) {
       // Move the IR argument iterator forward to the argument this entry
       // originates from.
       std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
       CurArgIdx = Ins[i].getOrigArgIndex();
 
       // Get type of the original argument.
       EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
                                   /*AllowUnknown*/ true);
       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
       // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
       // (The adjusted ValVT is passed below as both the value type and the
       // location type.)
       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
         ValVT = MVT::i8;
       else if (ActualMVT == MVT::i16)
         ValVT = MVT::i16;
     }
     // Formal arguments are always assigned with the non-variadic function.
     CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
     bool Res =
         AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
     assert(!Res && "Call operand has unhandled type");
     (void)Res;
   }
   assert(ArgLocs.size() == Ins.size());
   // NOTE(review): ArgValues is never used in this function.
   SmallVector<SDValue, 16> ArgValues;
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
 
     if (Ins[i].Flags.isByVal()) {
       // Byval is used for HFAs in the PCS, but the system should work in a
       // non-compliant manner for larger structs.
       EVT PtrVT = getPointerTy(DAG.getDataLayout());
       int Size = Ins[i].Flags.getByValSize();
       // Round the byval size up to a whole number of 8-byte units.
       unsigned NumRegs = (Size + 7) / 8;
 
       // FIXME: This works on big-endian for composite byvals, which are the common
       // case. It should also work for fundamental types too.
       unsigned FrameIdx =
         MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
       SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
       // The argument value is the address of the fixed object.
       InVals.push_back(FrameIdxN);
 
       continue;
     }
 
     if (VA.isRegLoc()) {
       // Arguments stored in registers.
       EVT RegVT = VA.getLocVT();
 
       SDValue ArgValue;
       const TargetRegisterClass *RC;
 
       // Select the register class that matches the location type.
       if (RegVT == MVT::i32)
         RC = &AArch64::GPR32RegClass;
       else if (RegVT == MVT::i64)
         RC = &AArch64::GPR64RegClass;
       else if (RegVT == MVT::f16)
         RC = &AArch64::FPR16RegClass;
       else if (RegVT == MVT::f32)
         RC = &AArch64::FPR32RegClass;
       else if (RegVT == MVT::f64 || RegVT.is64BitVector())
         RC = &AArch64::FPR64RegClass;
       else if (RegVT == MVT::f128 || RegVT.is128BitVector())
         RC = &AArch64::FPR128RegClass;
       else
         llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
 
       // Transform the arguments in physical registers into virtual ones.
       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
       ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
 
       // If this is an 8, 16 or 32-bit value, it is really passed promoted
       // to 64 bits.  Insert an assert[sz]ext to capture this, then
       // truncate to the right size.
       switch (VA.getLocInfo()) {
       default:
         llvm_unreachable("Unknown loc info!");
       case CCValAssign::Full:
         break;
       case CCValAssign::BCvt:
         ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
         break;
       case CCValAssign::AExt:
       case CCValAssign::SExt:
       case CCValAssign::ZExt:
         // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
         // nodes after our lowering.
         assert(RegVT == Ins[i].VT && "incorrect register location selected");
         break;
       }
 
       InVals.push_back(ArgValue);
 
     } else { // VA.isRegLoc()
       assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
       unsigned ArgOffset = VA.getLocMemOffset();
       unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
 
       // On big-endian targets an argument narrower than 8 bytes (and not
       // flagged as being in consecutive registers) is addressed at an
       // adjusted offset within its 8-byte slot.
       uint32_t BEAlign = 0;
       if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
           !Ins[i].Flags.isInConsecutiveRegs())
         BEAlign = 8 - ArgSize;
 
       int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
 
       // Create load nodes to retrieve arguments from the stack.
       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
       SDValue ArgValue;
 
       // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
       ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
       MVT MemVT = VA.getValVT();
 
       // Translate the location info into the kind of (extending) load to emit.
       switch (VA.getLocInfo()) {
       default:
         break;
       case CCValAssign::BCvt:
         MemVT = VA.getLocVT();
         break;
       case CCValAssign::SExt:
         ExtType = ISD::SEXTLOAD;
         break;
       case CCValAssign::ZExt:
         ExtType = ISD::ZEXTLOAD;
         break;
       case CCValAssign::AExt:
         ExtType = ISD::EXTLOAD;
         break;
       }
 
       ArgValue = DAG.getExtLoad(
           ExtType, DL, VA.getLocVT(), Chain, FIN,
           MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
           MemVT);
 
       InVals.push_back(ArgValue);
     }
   }
 
   // varargs
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   if (isVarArg) {
     if (!Subtarget->isTargetDarwin() || IsWin64) {
       // The AAPCS variadic function ABI is identical to the non-variadic
       // one. As a result there may be more arguments in registers and we should
       // save them for future reference.
       // Win64 variadic functions also pass arguments in registers, but all float
       // arguments are passed in integer registers.
       saveVarArgRegisters(CCInfo, DAG, DL, Chain);
     }
 
     // This will point to the next argument passed via stack.
     unsigned StackOffset = CCInfo.getNextStackOffset();
     // We currently pass all varargs at 8-byte alignment.
     StackOffset = ((StackOffset + 7) & ~7);
     FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
   }
 
   unsigned StackArgSize = CCInfo.getNextStackOffset();
   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
   if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
     // This is a non-standard ABI so by fiat I say we're allowed to make full
     // use of the stack area to be popped, which must be aligned to 16 bytes in
     // any case:
     StackArgSize = alignTo(StackArgSize, 16);
 
     // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
     // a multiple of 16.
     FuncInfo->setArgumentStackToRestore(StackArgSize);
 
     // This realignment carries over to the available bytes below. Our own
     // callers will guarantee the space is free by giving an aligned value to
     // CALLSEQ_START.
   }
   // Even if we're not expected to free up the space, it's useful to know how
   // much is there while considering tail calls (because we can reuse it).
   FuncInfo->setBytesInStackArgArea(StackArgSize);
 
   return Chain;
 }
 
 /// Store the argument registers that \p CCInfo has not allocated to a save
 /// area on the stack.
 ///
 /// X0-X7 are always handled; Q0-Q7 are handled only when the subtarget has
 /// FP/ARMv8 support and the function's calling convention is not Win64.  The
 /// frame index and byte size of each save area are recorded in
 /// AArch64FunctionInfo.  If any store was emitted, \p Chain is replaced by a
 /// TokenFactor of all the stores.
 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
                                                 SelectionDAG &DAG,
                                                 const SDLoc &DL,
                                                 SDValue &Chain) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   auto PtrVT = getPointerTy(DAG.getDataLayout());
   bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
 
   // Chains of every register store emitted below.
   SmallVector<SDValue, 8> MemOps;
 
   static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
                                           AArch64::X3, AArch64::X4, AArch64::X5,
                                           AArch64::X6, AArch64::X7 };
   static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
   unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
 
   // 8 bytes for each GPR from the first unallocated one onwards.
   unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
   int GPRIdx = 0;
   if (GPRSaveSize != 0) {
     if (IsWin64) {
       // Win64: the save area is a fixed object ending at offset 0.
       GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
       if (GPRSaveSize & 15)
         // The extra size here, if triggered, will always be 8.
         MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
     } else
       GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
 
     // Address of the slot for the next register; advanced by 8 per store.
     SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
 
     for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
       unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
       SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
       // Each store is chained on its own register copy (result 1 of Val).
       SDValue Store = DAG.getStore(
           Val.getValue(1), DL, Val, FIN,
           IsWin64
               ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
                                                   GPRIdx,
                                                   (i - FirstVariadicGPR) * 8)
               : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
       MemOps.push_back(Store);
       FIN =
           DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
     }
   }
   FuncInfo->setVarArgsGPRIndex(GPRIdx);
   FuncInfo->setVarArgsGPRSize(GPRSaveSize);
 
   if (Subtarget->hasFPARMv8() && !IsWin64) {
     static const MCPhysReg FPRArgRegs[] = {
         AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
         AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
     static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
     unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
 
     // 16 bytes for each Q register from the first unallocated one onwards.
     unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
     int FPRIdx = 0;
     if (FPRSaveSize != 0) {
       FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
 
       // Address of the slot for the next register; advanced by 16 per store.
       SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
 
       for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
         unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
         SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
 
         SDValue Store = DAG.getStore(
             Val.getValue(1), DL, Val, FIN,
             MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
         MemOps.push_back(Store);
         FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
                           DAG.getConstant(16, DL, PtrVT));
       }
     }
     FuncInfo->setVarArgsFPRIndex(FPRIdx);
     FuncInfo->setVarArgsFPRSize(FPRSaveSize);
   }
 
   // Merge all register stores into the chain handed back to the caller.
   if (!MemOps.empty()) {
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
   }
 }
 
 /// LowerCallResult - Turn the values returned by a call into copies out of
 /// the physical registers the return convention assigned to them, appending
 /// one value per result to \p InVals.  Returns the updated chain.
 SDValue AArch64TargetLowering::LowerCallResult(
     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
     SDValue ThisVal) const {
   // WebKit_JS has its own return convention; everything else uses AAPCS.
   CCAssignFn *RetCC = RetCC_AArch64_AAPCS;
   if (CallConv == CallingConv::WebKit_JS)
     RetCC = RetCC_AArch64_WebKit_JS;
 
   // Compute where each returned value lives.
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());
   CCInfo.AnalyzeCallResult(Ins, RetCC);
 
   // Read every result out of its assigned physical register.
   for (unsigned Idx = 0; Idx != RVLocs.size(); ++Idx) {
     CCValAssign Loc = RVLocs[Idx];
 
     // A returned 'this' is forwarded straight from the argument, to avoid
     // reg unit interference.
     if (isThisReturn && Idx == 0) {
       assert(!Loc.needsCustom() && Loc.getLocVT() == MVT::i64 &&
              "unexpected return calling convention register assignment");
       InVals.push_back(ThisVal);
       continue;
     }
 
     // The copy produces the value (0), a new chain (1) and a new glue (2).
     SDValue Copy =
         DAG.getCopyFromReg(Chain, DL, Loc.getLocReg(), Loc.getLocVT(), InFlag);
     Chain = Copy.getValue(1);
     InFlag = Copy.getValue(2);
 
     switch (Loc.getLocInfo()) {
     case CCValAssign::Full:
       break;
     case CCValAssign::BCvt:
       Copy = DAG.getNode(ISD::BITCAST, DL, Loc.getValVT(), Copy);
       break;
     default:
       llvm_unreachable("Unknown loc info!");
     }
 
     InVals.push_back(Copy);
   }
 
   return Chain;
 }
 
 /// Return true if tail-call optimization can be guaranteed for calling
 /// convention \p CC.  Only CallingConv::Fast qualifies.
 static bool canGuaranteeTCO(CallingConv::ID CC) {
   if (CC == CallingConv::Fast)
     return true;
   return false;
 }
 
 /// Return true if calls using convention \p CC may ever be tail-called:
 /// C, PreserveMost and Swift always may, and so may any convention for which
 /// TCO can be guaranteed.
 static bool mayTailCallThisCC(CallingConv::ID CC) {
   if (CC == CallingConv::C || CC == CallingConv::PreserveMost ||
       CC == CallingConv::Swift)
     return true;
   return canGuaranteeTCO(CC);
 }
 
 /// Decide whether the call described by the parameters may be emitted as a
 /// tail call from the current function.
 ///
 /// The checks run in order and the first failing one returns false: the
 /// callee convention must be tail-callable at all, the caller must not have
 /// byval arguments, the callee must not be an external-weak global (subject
 /// to the target-triple condition below), variadic calls must not need stack
 /// arguments, results must be passed compatibly, the callee must preserve
 /// what the caller must preserve, outgoing stack arguments must fit in the
 /// caller's own stack argument area, and arguments in callee-saved registers
 /// must match.  When GuaranteedTailCallOpt is set, the answer after the byval
 /// check is simply whether TCO can be guaranteed for the callee convention
 /// and that convention matches the caller's.
 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
     SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs,
     const SmallVectorImpl<SDValue> &OutVals,
     const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
   if (!mayTailCallThisCC(CalleeCC))
     return false;
 
   MachineFunction &MF = DAG.getMachineFunction();
   const Function &CallerF = MF.getFunction();
   CallingConv::ID CallerCC = CallerF.getCallingConv();
   bool CCMatch = CallerCC == CalleeCC;
 
   // Byval parameters hand the function a pointer directly into the stack area
   // we want to reuse during a tail call. Working around this *is* possible (see
   // X86) but less efficient and uglier in LowerCall.
   for (Function::const_arg_iterator i = CallerF.arg_begin(),
                                     e = CallerF.arg_end();
        i != e; ++i)
     if (i->hasByValAttr())
       return false;
 
   if (getTargetMachine().Options.GuaranteedTailCallOpt)
     return canGuaranteeTCO(CalleeCC) && CCMatch;
 
   // Externally-defined functions with weak linkage should not be
   // tail-called on AArch64 when the OS does not support dynamic
   // pre-emption of symbols, as the AAELF spec requires normal calls
   // to undefined weak functions to be replaced with a NOP or jump to the
   // next instruction. The behaviour of branch instructions in this
   // situation (as used for tail calls) is implementation-defined, so we
   // cannot rely on the linker replacing the tail call with a return.
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     const GlobalValue *GV = G->getGlobal();
     const Triple &TT = getTargetMachine().getTargetTriple();
     if (GV->hasExternalWeakLinkage() &&
         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
       return false;
   }
 
   // Now we search for cases where we can use a tail call without changing the
   // ABI. Sibcall is used in some places (particularly gcc) to refer to this
   // concept.
 
   // I want anyone implementing a new calling convention to think long and hard
   // about this assert.
   assert((!isVarArg || CalleeCC == CallingConv::C) &&
          "Unexpected variadic calling convention");
 
   LLVMContext &C = *DAG.getContext();
   if (isVarArg && !Outs.empty()) {
     // At least two cases here: if caller is fastcc then we can't have any
     // memory arguments (we'd be expected to clean up the stack afterwards). If
     // caller is C then we could potentially use its argument area.
 
     // FIXME: for now we take the most conservative of these in both cases:
     // disallow all variadic memory operands.
     SmallVector<CCValAssign, 16> ArgLocs;
     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
 
     // Analyze every operand with the variadic assign function and reject the
     // tail call if any of them ends up outside a register.
     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
     for (const CCValAssign &ArgLoc : ArgLocs)
       if (!ArgLoc.isRegLoc())
         return false;
   }
 
   // Check that the call results are passed in the same way.
   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                   CCAssignFnForCall(CalleeCC, isVarArg),
                                   CCAssignFnForCall(CallerCC, isVarArg)))
     return false;
   // The callee has to preserve all registers the caller needs to preserve.
   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
   // With identical conventions the preserved sets are the same, so the
   // comparison is only made when the conventions differ.
   if (!CCMatch) {
     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
       return false;
   }
 
   // Nothing more to check if the callee is taking no arguments
   if (Outs.empty())
     return true;
 
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
 
   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
 
   const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
 
   // If the stack arguments for this call do not fit into our own save area then
   // the call cannot be made tail.
   if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
     return false;
 
   const MachineRegisterInfo &MRI = MF.getRegInfo();
   if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
     return false;
 
   return true;
 }
 
 /// Build a TokenFactor that joins \p Chain with the chains of all loads
 /// hanging off the entry node that read a fixed (negative-index) frame object
 /// overlapping the bytes of frame object \p ClobberedFI.
 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
                                                    SelectionDAG &DAG,
                                                    MachineFrameInfo &MFI,
                                                    int ClobberedFI) const {
   // Inclusive byte range occupied by the object that will be overwritten.
   int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
   int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
 
   // Include the original chain at the beginning of the list. When this is
   // used by target LowerCall hooks, this helps legalize find the
   // CALLSEQ_BEGIN node.
   SmallVector<SDValue, 8> ArgChains;
   ArgChains.push_back(Chain);
 
   // Add a chain value for each stack argument load that overlaps the
   // clobbered byte range.
   SDNode *Entry = DAG.getEntryNode().getNode();
   for (SDNode::use_iterator U = Entry->use_begin(), UE = Entry->use_end();
        U != UE; ++U) {
     // Only loads whose base pointer is a frame index are of interest.
     LoadSDNode *L = dyn_cast<LoadSDNode>(*U);
     if (!L)
       continue;
     FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr());
     if (!FI)
       continue;
 
     // Skip non-negative frame indices.
     int LoadFI = FI->getIndex();
     if (LoadFI >= 0)
       continue;
 
     // Inclusive byte range read through this frame object.
     int64_t InFirstByte = MFI.getObjectOffset(LoadFI);
     int64_t InLastByte = InFirstByte + (MFI.getObjectSize(LoadFI) - 1);
 
     bool ClobberStartsInsideLoad =
         InFirstByte <= FirstByte && FirstByte <= InLastByte;
     bool LoadStartsInsideClobber =
         FirstByte <= InFirstByte && InFirstByte <= LastByte;
     if (ClobberStartsInsideLoad || LoadStartsInsideClobber)
       ArgChains.push_back(SDValue(L, 1));
   }
 
   // Build a tokenfactor for all the chains.
   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
 }
 
 /// Return true if the callee restores the stack for calls using \p CallCC:
 /// that is the case only for CallingConv::Fast with \p TailCallOpt enabled.
 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
                                                    bool TailCallOpt) const {
   if (!TailCallOpt)
     return false;
   return CallCC == CallingConv::Fast;
 }
 
 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
 /// and add input and output parameter nodes.
 ///
 /// \param CLI    Description of the call. Its DAG, debug location, outgoing
 ///               and incoming argument lists, chain, callee, calling
 ///               convention and vararg flag are read here. CLI.IsTailCall is
 ///               bound by reference and is overwritten with the result of the
 ///               tail-call eligibility check.
 /// \param InVals Forwarded to LowerCallResult for calls that are not lowered
 ///               as tail calls.
 /// \returns The AArch64ISD::TC_RETURN node when the call is lowered as a tail
 ///          call; otherwise the value returned by LowerCallResult.
 SDValue
 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
                                  SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG = CLI.DAG;
   SDLoc &DL = CLI.DL;
   SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
   SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
   SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
   SDValue Chain = CLI.Chain;
   SDValue Callee = CLI.Callee;
   // Reference, not a copy: the eligibility result below is written back into
   // CLI.
   bool &IsTailCall = CLI.IsTailCall;
   CallingConv::ID CallConv = CLI.CallConv;
   bool IsVarArg = CLI.IsVarArg;
 
   MachineFunction &MF = DAG.getMachineFunction();
   bool IsThisReturn = false;
 
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
   bool IsSibCall = false;
 
   if (IsTailCall) {
     // Check if it's really possible to do a tail call.
     IsTailCall = isEligibleForTailCallOptimization(
         Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
     // A call site marked musttail that turns out not to be eligible is a
     // fatal error rather than a fallback to an ordinary call.
     if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
       report_fatal_error("failed to perform tail call elimination on a call "
                          "site marked musttail");
 
     // A sibling call is one where we're under the usual C ABI and not planning
     // to change that but can still do a tail call:
     if (!TailCallOpt && IsTailCall)
       IsSibCall = true;
 
     if (IsTailCall)
       ++NumTailCalls;
   }
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
                  *DAG.getContext());
 
   if (IsVarArg) {
     // Handle fixed and variable vector arguments differently.
     // Variable vector arguments always go into memory.
     unsigned NumArgs = Outs.size();
 
     for (unsigned i = 0; i != NumArgs; ++i) {
       MVT ArgVT = Outs[i].VT;
       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
       // The assignment function is chosen per argument: non-fixed arguments
       // use the vararg variant, fixed ones the regular variant.
       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
                                                /*IsVarArg=*/ !Outs[i].IsFixed);
       bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
       assert(!Res && "Call operand has unhandled type");
       // Res is otherwise only read by the assert above.
       (void)Res;
     }
   } else {
     // At this point, Outs[].VT may already be promoted to i32. To correctly
     // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
     // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
     // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
     // we use a special version of AnalyzeCallOperands to pass in ValVT and
     // LocVT.
     unsigned NumArgs = Outs.size();
     for (unsigned i = 0; i != NumArgs; ++i) {
       MVT ValVT = Outs[i].VT;
       // Get type of the original argument.
       EVT ActualVT = getValueType(DAG.getDataLayout(),
                                   CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
                                   /*AllowUnknown*/ true);
       // Fall back to the (possibly promoted) outgoing type when the original
       // type is not a simple value type.
       MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
       ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
       // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
       // NOTE(review): the code below overrides ValVT and then passes it as
       // both type arguments of the assignment function.
       if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
         ValVT = MVT::i8;
       else if (ActualMVT == MVT::i16)
         ValVT = MVT::i16;
 
       CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
       bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
       assert(!Res && "Call operand has unhandled type");
       // Res is otherwise only read by the assert above.
       (void)Res;
     }
   }
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
 
   if (IsSibCall) {
     // Since we're not changing the ABI to make this a tail call, the memory
     // operands are already available in the caller's incoming argument space.
     NumBytes = 0;
   }
 
   // FPDiff is the byte offset of the call's argument area from the callee's.
   // Stores to callee stack arguments will be placed in FixedStackSlots offset
   // by this amount for a tail call. In a sibling call it must be 0 because the
   // caller will deallocate the entire stack and the callee still expects its
   // arguments to begin at SP+0. Completely unused for non-tail calls.
   int FPDiff = 0;
 
   if (IsTailCall && !IsSibCall) {
     unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
 
     // Since callee will pop argument stack as a tail call, we must keep the
     // popped size 16-byte aligned.
     NumBytes = alignTo(NumBytes, 16);
 
     // FPDiff will be negative if this tail call requires more space than we
     // would automatically have in our incoming argument space. Positive if we
     // can actually shrink the stack.
     FPDiff = NumReusableBytes - NumBytes;
 
     // The stack pointer must be 16-byte aligned at all times it's used for a
     // memory operation, which in practice means at *all* times and in
     // particular across call boundaries. Therefore our own arguments started at
     // a 16-byte aligned SP and the delta applied for the tail call should
     // satisfy the same constraint.
     assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
   }
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
   if (!IsSibCall)
     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);
 
   SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
                                         getPointerTy(DAG.getDataLayout()));
 
   // RegsToPass collects (register, value) pairs for register arguments;
   // MemOpChains collects the stores/memcpys that write stack arguments.
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
   // Walk the register/memloc assignments, inserting copies/loads.
   // i and realArgIdx start at 0 and are incremented together, so they always
   // hold the same value inside the loop.
   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
        ++i, ++realArgIdx) {
     CCValAssign &VA = ArgLocs[i];
     SDValue Arg = OutVals[realArgIdx];
     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
 
     // Promote the value if needed.
     switch (VA.getLocInfo()) {
     default:
       llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full:
       break;
     case CCValAssign::SExt:
       Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::ZExt:
       Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::AExt:
       if (Outs[realArgIdx].ArgVT == MVT::i1) {
         // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
         Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
         Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
       }
       Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::BCvt:
       Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
       break;
     case CCValAssign::FPExt:
       Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
       break;
     }
 
     if (VA.isRegLoc()) {
       // Detect a first argument that is flagged 'returned' (and is not
       // swiftself) with type i64; the call is then treated as a "this
       // return" when choosing the register mask and lowering the result.
       if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
           Outs[0].VT == MVT::i64) {
         assert(VA.getLocVT() == MVT::i64 &&
                "unexpected calling convention register assignment");
         assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
                "unexpected use of 'returned'");
         IsThisReturn = true;
       }
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
     } else {
       assert(VA.isMemLoc());
 
       SDValue DstAddr;
       MachinePointerInfo DstInfo;
 
       // FIXME: This works on big-endian for composite byvals, which are the
       // common case. It should also work for fundamental types too.
       uint32_t BEAlign = 0;
       // OpSize is first computed in bits and then rounded up to whole bytes.
       unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
                                         : VA.getValVT().getSizeInBits();
       OpSize = (OpSize + 7) / 8;
       // On big-endian targets, non-byval arguments narrower than 8 bytes that
       // are not part of a consecutive-register group are offset by
       // (8 - OpSize) bytes within their slot.
       if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
           !Flags.isInConsecutiveRegs()) {
         if (OpSize < 8)
           BEAlign = 8 - OpSize;
       }
       unsigned LocMemOffset = VA.getLocMemOffset();
       int32_t Offset = LocMemOffset + BEAlign;
       // NOTE(review): this PtrOff is not read below. The tail-call path
       // addresses the slot through a frame index and the non-tail path
       // declares its own PtrOff.
       SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
       PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
 
       if (IsTailCall) {
         // Tail calls store into a fixed stack object placed at the argument's
         // offset adjusted by FPDiff.
         Offset = Offset + FPDiff;
         int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
 
         DstAddr = DAG.getFrameIndex(FI, PtrVT);
         DstInfo =
             MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
 
         // Make sure any stack arguments overlapping with where we're storing
         // are loaded before this eventual operation. Otherwise they'll be
         // clobbered.
         Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
       } else {
         // Ordinary calls store relative to the stack pointer value read
         // after callseq_start.
         SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
 
         DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
         DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
                                                LocMemOffset);
       }
 
       // Outs[i] is the same element as Outs[realArgIdx] (see the loop header).
       if (Outs[i].Flags.isByVal()) {
         // byval arguments are copied into their slot with a memcpy of the
         // byval size and alignment.
         SDValue SizeNode =
             DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
         SDValue Cpy = DAG.getMemcpy(
             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
             /*isVol = */ false, /*AlwaysInline = */ false,
             /*isTailCall = */ false,
             DstInfo, MachinePointerInfo());
 
         MemOpChains.push_back(Cpy);
       } else {
         // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
         // promoted to a legal register type i32, we should truncate Arg back to
         // i1/i8/i16.
         if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
             VA.getValVT() == MVT::i16)
           Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
 
         SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
         MemOpChains.push_back(Store);
       }
     }
   }
 
   // Join all stack-argument writes into a single chain.
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
 
   // Build a sequence of copy-to-reg nodes chained together with token chain
   // and flag operands which copy the outgoing args into the appropriate regs.
   SDValue InFlag;
   for (auto &RegToPass : RegsToPass) {
     Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
                              RegToPass.second, InFlag);
     InFlag = Chain.getValue(1);
   }
 
   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   // node so that legalize doesn't hack it.
   if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     auto GV = G->getGlobal();
     if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
         AArch64II::MO_GOT) {
       // Callee address is loaded from the GOT.
       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
     } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
       assert(Subtarget->isTargetWindows() &&
              "Windows is the only supported COFF target");
       Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
     } else {
       // Plain direct reference with no target flags.
       const GlobalValue *GV = G->getGlobal();
       Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
     }
   } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     if (getTargetMachine().getCodeModel() == CodeModel::Large &&
         Subtarget->isTargetMachO()) {
       // Large code model on MachO: load the symbol's address from the GOT.
       const char *Sym = S->getSymbol();
       Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
       Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
     } else {
       const char *Sym = S->getSymbol();
       Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
     }
   }
 
   // We don't usually want to end the call-sequence here because we would tidy
   // the frame up *after* the call, however in the ABI-changing tail-call case
   // we've carefully laid out the parameters so that when sp is reset they'll be
   // in the correct location.
   if (IsTailCall && !IsSibCall) {
     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
                                DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
     InFlag = Chain.getValue(1);
   }
 
   // Operand list of the call node: chain, callee, [FPDiff for tail calls],
   // argument registers, register mask, [glue].
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Callee);
 
   if (IsTailCall) {
     // Each tail call may have to adjust the stack by a different amount, so
     // this information must travel along with the operation for eventual
     // consumption by emitEpilogue.
     Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
   }
 
   // Add argument registers to the end of the list so that they are known live
   // into the call.
   for (auto &RegToPass : RegsToPass)
     Ops.push_back(DAG.getRegister(RegToPass.first,
                                   RegToPass.second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
   const uint32_t *Mask;
   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
   if (IsThisReturn) {
     // For 'this' returns, use the X0-preserving mask if applicable
     Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
     if (!Mask) {
       // No such mask for this convention: drop the 'this' return treatment
       // and use the regular call-preserved mask.
       IsThisReturn = false;
       Mask = TRI->getCallPreservedMask(MF, CallConv);
     }
   } else
     Mask = TRI->getCallPreservedMask(MF, CallConv);
 
   assert(Mask && "Missing call preserved mask for calling convention");
   Ops.push_back(DAG.getRegisterMask(Mask));
 
   if (InFlag.getNode())
     Ops.push_back(InFlag);
 
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
   // If we're doing a tail call, use a TC_RETURN here rather than an
   // actual call instruction.
   if (IsTailCall) {
     MF.getFrameInfo().setHasTailCall();
     return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
   }
 
   // Returns a chain and a flag for retval copy to use.
   Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
   // Bytes popped by the callee: the 16-byte-aligned argument size when the
   // callee restores the stack, otherwise 0.
   uint64_t CalleePopBytes =
       DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
 
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
                              DAG.getIntPtrConstant(CalleePopBytes, DL, true),
                              InFlag, DL);
   // The glue from callseq_end is only picked up when there are results to
   // copy out.
   if (!Ins.empty())
     InFlag = Chain.getValue(1);
 
   // Handle result values, copying them out of physregs into vregs that we
   // return.
   // For a 'this' return the first outgoing value is handed to LowerCallResult
   // as well; otherwise an empty SDValue is passed.
   return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
                          InVals, IsThisReturn,
                          IsThisReturn ? OutVals[0] : SDValue());
 }
 
 /// Check whether the return values described by \p Outs can be lowered under
 /// \p CallConv, by running CCState::CheckReturn with the matching
 /// return-value assignment function.
 bool AArch64TargetLowering::CanLowerReturn(
     CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
     const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
   // WebKit_JS has its own return convention; everything else uses AAPCS.
   CCAssignFn *AssignRet = RetCC_AArch64_AAPCS;
   if (CallConv == CallingConv::WebKit_JS)
     AssignRet = RetCC_AArch64_WebKit_JS;
 
   SmallVector<CCValAssign, 16> Locs;
   CCState State(CallConv, isVarArg, MF, Locs, Context);
   return State.CheckReturn(Outs, AssignRet);
 }
 
 /// Lower a function return: copy every outgoing value into the register
 /// picked by the return calling convention and emit an AArch64ISD::RET_FLAG
 /// node.
 ///
 /// \param Chain    Incoming token chain.
 /// \param CallConv Calling convention of the returning function.
 /// \param isVarArg Whether the function is variadic.
 /// \param Outs     Descriptions of the returned values.
 /// \param OutVals  The returned values, indexed in parallel with \p Outs.
 /// \param DL       Debug location used for the created nodes.
 /// \param DAG      The selection DAG being built.
 /// \returns The RET_FLAG node.
 SDValue
 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                    bool isVarArg,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SDLoc &DL, SelectionDAG &DAG) const {
   // WebKit_JS has its own return convention; everything else uses AAPCS.
   CCAssignFn *AssignRet = RetCC_AArch64_AAPCS;
   if (CallConv == CallingConv::WebKit_JS)
     AssignRet = RetCC_AArch64_WebKit_JS;
 
   MachineFunction &MF = DAG.getMachineFunction();
   SmallVector<CCValAssign, 16> Locs;
   CCState State(CallConv, isVarArg, MF, Locs, *DAG.getContext());
   State.AnalyzeReturn(Outs, AssignRet);
 
   // Operand 0 is reserved for the chain and is patched once all copies have
   // been emitted.
   SmallVector<SDValue, 4> Operands;
   Operands.push_back(Chain);
 
   // Copy each result into its assigned register, gluing the copies together.
   SDValue Glue;
   for (unsigned Idx = 0, End = Locs.size(); Idx != End; ++Idx) {
     CCValAssign &Loc = Locs[Idx];
     assert(Loc.isRegLoc() && "Can only return in registers!");
     SDValue Val = OutVals[Idx];
 
     const auto Info = Loc.getLocInfo();
     if (Info == CCValAssign::BCvt) {
       Val = DAG.getNode(ISD::BITCAST, DL, Loc.getLocVT(), Val);
     } else if (Info == CCValAssign::Full) {
       if (Outs[Idx].ArgVT == MVT::i1) {
         // AAPCS requires the producer of an i1 to zero-extend it to i8. On
         // Darwin (which uses "zeroext i1") this is strictly redundant, but it
         // is optimised out before ISel.
         Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Val);
         Val = DAG.getNode(ISD::ZERO_EXTEND, DL, Loc.getLocVT(), Val);
       }
     } else {
       llvm_unreachable("Unknown loc info!");
     }
 
     Chain = DAG.getCopyToReg(Chain, DL, Loc.getLocReg(), Val, Glue);
     Glue = Chain.getValue(1);
     Operands.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));
   }
 
   // Registers reported by getCalleeSavedRegsViaCopy are added as extra
   // operands of the return; the list is zero-terminated.
   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
   if (const MCPhysReg *CSRs = TRI->getCalleeSavedRegsViaCopy(&MF)) {
     for (const MCPhysReg *Reg = CSRs; *Reg; ++Reg) {
       if (AArch64::GPR64RegClass.contains(*Reg))
         Operands.push_back(DAG.getRegister(*Reg, MVT::i64));
       else if (AArch64::FPR64RegClass.contains(*Reg))
         Operands.push_back(
             DAG.getRegister(*Reg, MVT::getFloatingPointVT(64)));
       else
         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
     }
   }
 
   // Install the final chain in the reserved slot.
   Operands[0] = Chain;
 
   // Append the glue from the last copy, if any copy was made.
   if (Glue.getNode())
     Operands.push_back(Glue);
 
   return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, Operands);
 }
 
 //===----------------------------------------------------------------------===//
 //  Other Lowering Code
 //===----------------------------------------------------------------------===//
 
 /// Create the TargetGlobalAddress counterpart of \p N with type \p Ty and
 /// target flags \p Flag, keeping the node's global, offset and location.
 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
                                              SelectionDAG &DAG,
                                              unsigned Flag) const {
   const SDLoc Loc(N);
   const auto *Global = N->getGlobal();
   const auto Offset = N->getOffset();
   return DAG.getTargetGlobalAddress(Global, Loc, Ty, Offset, Flag);
 }
 
 /// Create the TargetJumpTable counterpart of \p N with type \p Ty and target
 /// flags \p Flag, keeping the node's jump-table index.
 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
                                              SelectionDAG &DAG,
                                              unsigned Flag) const {
   const auto TableIndex = N->getIndex();
   return DAG.getTargetJumpTable(TableIndex, Ty, Flag);
 }
 
 /// Create the TargetConstantPool counterpart of \p N with type \p Ty and
 /// target flags \p Flag, keeping the node's constant, alignment and offset.
 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
                                              SelectionDAG &DAG,
                                              unsigned Flag) const {
   const auto *Value = N->getConstVal();
   const auto Alignment = N->getAlignment();
   const auto Offset = N->getOffset();
   return DAG.getTargetConstantPool(Value, Ty, Alignment, Offset, Flag);
 }
 
 /// Create the TargetBlockAddress counterpart of \p N with type \p Ty and
 /// target flags \p Flag. The offset passed on is always 0.
 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
                                              SelectionDAG &DAG,
                                              unsigned Flag) const {
   const auto *Address = N->getBlockAddress();
   return DAG.getTargetBlockAddress(Address, Ty, 0, Flag);
 }
 
 /// Build a GOT load of the symbol behind \p N: (loadGOT sym).
 ///
 /// The symbol is emitted as a target node carrying MO_GOT combined with
 /// \p Flags, and that node becomes the operand of an AArch64ISD::LOADgot.
 template <class NodeTy>
 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
                                       unsigned Flags) const {
   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
   SDLoc Loc(N);
   const EVT PtrTy = getPointerTy(DAG.getDataLayout());
   const unsigned GotFlags = AArch64II::MO_GOT | Flags;
   SDValue Sym = getTargetNode(N, PtrTy, DAG, GotFlags);
   // FIXME: Once remat is capable of dealing with instructions with register
   // operands, expand this into two nodes instead of using a wrapper node.
   return DAG.getNode(AArch64ISD::LOADgot, Loc, PtrTy, Sym);
 }
 
 /// Build the four-fragment address of the symbol behind \p N:
 /// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym)).
 ///
 /// Each fragment is a target node for the same symbol tagged with one of
 /// MO_G3..MO_G0 (the lower three additionally with MO_NC) combined with
 /// \p Flags; all four feed a single AArch64ISD::WrapperLarge node.
 template <class NodeTy>
 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
                                             unsigned Flags) const {
   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
   SDLoc Loc(N);
   const EVT PtrTy = getPointerTy(DAG.getDataLayout());
   // Target node for the symbol with the given fragment selector plus the
   // caller-supplied flags.
   auto Fragment = [&](unsigned Selector) {
     return getTargetNode(N, PtrTy, DAG, Selector | Flags);
   };
   return DAG.getNode(AArch64ISD::WrapperLarge, Loc, PtrTy,
                      Fragment(AArch64II::MO_G3),
                      Fragment(AArch64II::MO_G2 | AArch64II::MO_NC),
                      Fragment(AArch64II::MO_G1 | AArch64II::MO_NC),
                      Fragment(AArch64II::MO_G0 | AArch64II::MO_NC));
 }
 
 /// Build the page + page-offset address of the symbol behind \p N:
 /// (addlow (adrp %hi(sym)) %lo(sym)).
 ///
 /// The page part is a target node tagged MO_PAGE and wrapped in an ADRP node;
 /// the low part is tagged MO_PAGEOFF | MO_NC. Both also carry \p Flags.
 template <class NodeTy>
 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
                                        unsigned Flags) const {
   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
   SDLoc Loc(N);
   const EVT PtrTy = getPointerTy(DAG.getDataLayout());
   const unsigned PageFlags = AArch64II::MO_PAGE | Flags;
   const unsigned PageOffFlags =
       AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags;
 
   SDValue PageSym = getTargetNode(N, PtrTy, DAG, PageFlags);
   SDValue PageOffSym = getTargetNode(N, PtrTy, DAG, PageOffFlags);
   SDValue Page = DAG.getNode(AArch64ISD::ADRP, Loc, PtrTy, PageSym);
   return DAG.getNode(AArch64ISD::ADDlow, Loc, PtrTy, Page, PageOffSym);
 }
 
 /// Lower a GlobalAddress node into the address computation selected by the
 /// global's reference classification and the code model.
 ///
 /// References classified with MO_GOT go through getGOT. Otherwise the address
 /// is built with getAddrLarge under the large code model and with getAddr in
 /// every other case. For dllimport globals the computed address is then used
 /// as the pointer of an additional load.
 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
                                                   SelectionDAG &DAG) const {
   GlobalAddressSDNode *Node = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *Global = Node->getGlobal();
   const bool IsDLLImport = Global->hasDLLImportStorageClass();
   const AArch64II::TOF ExtraFlags =
       IsDLLImport ? AArch64II::MO_DLLIMPORT : AArch64II::MO_NO_FLAG;
   const unsigned char RefClass =
       Subtarget->ClassifyGlobalReference(Global, getTargetMachine());
 
   // Only unflagged references may carry an offset.
   assert((RefClass == AArch64II::MO_NO_FLAG || Node->getOffset() == 0) &&
          "unexpected offset in global node");
 
   // This also catches the large code model case for Darwin.
   if (RefClass & AArch64II::MO_GOT)
     return getGOT(Node, DAG, ExtraFlags);
 
   const bool IsLargeModel =
       getTargetMachine().getCodeModel() == CodeModel::Large;
   SDValue Address = IsLargeModel ? getAddrLarge(Node, DAG, ExtraFlags)
                                  : getAddr(Node, DAG, ExtraFlags);
   if (!IsDLLImport)
     return Address;
 
   // dllimport: load the final pointer through the address computed above.
   const EVT PtrTy = getPointerTy(DAG.getDataLayout());
   SDLoc Loc(Node);
   return DAG.getLoad(PtrTy, Loc, DAG.getEntryNode(), Address,
                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
 }
 
 /// Lower a thread-local global address on Darwin into the loads and the call
 /// that compute the variable's address, returning the final node.
 ///
 /// Darwin has a single TLS scheme, which therefore has to cope with the most
 /// general case:
 ///     + an "extern __thread" declaration,
 ///     + defined in a possibly unknown dynamic library.
 ///
 /// Every __thread variable owns a [3 x i64] descriptor that the runtime uses
 /// to find the variable. The compiler only cares about its first xword: a
 /// function pointer that must be called with the address of the whole
 /// descriptor in "x0".
 ///
 /// Because the descriptor may live in another unit, it generally has to be
 /// reached through an indirect load as well. The "ideal" code sequence is:
 ///     adrp x0, _var@TLVPPAGE
 ///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
 ///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
 ///                                      ; the function pointer
 ///     blr x1                           ; Uses descriptor address in x0
 ///     ; Address of _var is now in x0.
 ///
 /// When the linker *does* know the address of _var's descriptor, it can turn
 /// the first "ldr" into a suitable "add x0, x0, #imm" for a small gain.
 SDValue
 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
                                                    SelectionDAG &DAG) const {
   assert(Subtarget->isTargetDarwin() &&
          "This function expects a Darwin target");
 
   SDLoc Loc(Op);
   MachineFunction &MF = DAG.getMachineFunction();
   MVT PtrTy = getPointerTy(DAG.getDataLayout());
   const GlobalValue *Var = cast<GlobalAddressSDNode>(Op)->getGlobal();
 
   // Address of the variable's descriptor, obtained through a LOADgot of the
   // MO_TLS-flagged symbol.
   SDValue DescSym =
       DAG.getTargetGlobalAddress(Var, Loc, PtrTy, 0, AArch64II::MO_TLS);
   SDValue Descriptor = DAG.getNode(AArch64ISD::LOADgot, Loc, PtrTy, DescSym);
 
   // The descriptor starts with the function pointer we have to call to get
   // the variable's address.
   SDValue Resolver = DAG.getLoad(
       MVT::i64, Loc, DAG.getEntryNode(), Descriptor,
       MachinePointerInfo::getGOT(MF),
       /* Alignment = */ 8,
       MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
           MachineMemOperand::MODereferenceable);
 
   MF.getFrameInfo().setAdjustsStack(true);
 
   // TLS calls preserve all registers except those that absolutely must be
   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
   // silly).
   const uint32_t *PreservedMask =
       Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
 
   // Make the call. It is a cut-down form of a normal AArch64 call node: the
   // descriptor address goes in via x0 and the variable's address for this
   // thread comes back in x0.
   SDValue ArgCopy = DAG.getCopyToReg(Resolver.getValue(1), Loc, AArch64::X0,
                                      Descriptor, SDValue());
   SDValue Call =
       DAG.getNode(AArch64ISD::CALL, Loc, DAG.getVTList(MVT::Other, MVT::Glue),
                   ArgCopy, Resolver, DAG.getRegister(AArch64::X0, MVT::i64),
                   DAG.getRegisterMask(PreservedMask), ArgCopy.getValue(1));
   return DAG.getCopyFromReg(Call, Loc, AArch64::X0, PtrTy, Call.getValue(1));
 }
 
 /// Emit a "TLS-descriptor" call for \p SymAddr and return the value it leaves
 /// in X0.
 ///
 /// Thread-local accesses under the general-dynamic or local-dynamic model go
 /// through such a call. The variable has a descriptor, reachable with a
 /// PC-relative ADRP, whose first entry is a function pointer that performs the
 /// resolution.
 ///
 /// The sequence is:
 ///    adrp  x0, :tlsdesc:var
 ///    ldr   x1, [x0, #:tlsdesc_lo12:var]
 ///    add   x0, x0, #:tlsdesc_lo12:var
 ///    .tlsdesccall var
 ///    blr   x1
 ///    (TPIDR_EL0 offset now in x0)
 ///
 ///  This sequence has to be emitted unscheduled so that the linker can
 ///  optimize/relax it. A pseudo-instruction (TLSDESC_CALLSEQ) therefore stands
 ///  in for it and is only expanded very late in the compilation flow, which
 ///  guarantees the sequence comes out exactly as shown.
 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
                                                       const SDLoc &DL,
                                                       SelectionDAG &DAG) const {
   const EVT PtrTy = getPointerTy(DAG.getDataLayout());
 
   // The pseudo hangs off the entry node and produces a chain plus glue.
   SDValue Operands[] = {DAG.getEntryNode(), SymAddr};
   SDValue CallSeq =
       DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL,
                   DAG.getVTList(MVT::Other, MVT::Glue), Operands);
 
   // Read the result out of X0, glued to the call sequence.
   return DAG.getCopyFromReg(CallSeq, DL, AArch64::X0, PtrTy,
                             CallSeq.getValue(1));
 }
 
 /// Lower a thread-local global address on ELF targets according to the TLS
 /// model chosen for the global.
 ///
 /// Local-exec adds the variable's high and low offset parts straight onto the
 /// thread pointer. The other models first compute an offset (a LOADgot for
 /// initial-exec, a TLS-descriptor call for the dynamic models) and then add it
 /// to the thread pointer. Local-dynamic is treated as general-dynamic unless
 /// EnableAArch64ELFLocalDynamicTLSGeneration is set.
 SDValue
 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
   assert(Subtarget->isTargetELF() && "This function expects an ELF target");
   assert(Subtarget->useSmallAddressing() &&
          "ELF TLS only supported in small memory model");
   // Different choices can be made for the maximum size of the TLS area for a
   // module. For the small address model, the default TLS size is 16MiB and the
   // maximum TLS size is 4GiB.
   // FIXME: add -mtls-size command line option and make it control the 16MiB
   // vs. 4GiB code sequence generation.
   const GlobalAddressSDNode *Node = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *Var = Node->getGlobal();
 
   TLSModel::Model Model = getTargetMachine().getTLSModel(Var);
   if (Model == TLSModel::LocalDynamic &&
       !EnableAArch64ELFLocalDynamicTLSGeneration)
     Model = TLSModel::GeneralDynamic;
 
   SDLoc Loc(Op);
   EVT PtrTy = getPointerTy(DAG.getDataLayout());
   SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, Loc, PtrTy);
 
   // Flags for the high and low parts of a TLS offset.
   const unsigned HiFlags = AArch64II::MO_TLS | AArch64II::MO_HI12;
   const unsigned LoFlags =
       AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
 
   // ADDXri machine node of Base and Imm with a constant 0 as third operand.
   auto AddImm = [&](SDValue Base, SDValue Imm) {
     return SDValue(DAG.getMachineNode(AArch64::ADDXri, Loc, PtrTy, Base, Imm,
                                       DAG.getTargetConstant(0, Loc, MVT::i32)),
                    0);
   };
 
   // Local-exec needs no separate offset: both parts are added directly to the
   // thread pointer.
   if (Model == TLSModel::LocalExec) {
     SDValue Hi = DAG.getTargetGlobalAddress(Var, Loc, PtrTy, 0, HiFlags);
     SDValue Lo = DAG.getTargetGlobalAddress(Var, Loc, PtrTy, 0, LoFlags);
     SDValue WithHi = AddImm(ThreadBase, Hi);
     return AddImm(WithHi, Lo);
   }
 
   SDValue Offset;
   if (Model == TLSModel::InitialExec) {
     SDValue Sym =
         DAG.getTargetGlobalAddress(Var, Loc, PtrTy, 0, AArch64II::MO_TLS);
     Offset = DAG.getNode(AArch64ISD::LOADgot, Loc, PtrTy, Sym);
   } else if (Model == TLSModel::LocalDynamic) {
     // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
     // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
     // the beginning of the module's TLS region, followed by a DTPREL offset
     // calculation.
 
     // These accesses will need deduplicating if there's more than one.
     DAG.getMachineFunction()
         .getInfo<AArch64FunctionInfo>()
         ->incNumLocalDynamicTLSAccesses();
 
     // The call needs a relocation too for linker relaxation. It doesn't make
     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
     // the address.
     SDValue ModuleBase = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrTy,
                                                      AArch64II::MO_TLS);
 
     // Offset from TPIDR_EL0 to this module's thread-local area.
     Offset = LowerELFTLSDescCallSeq(ModuleBase, Loc, DAG);
 
     // Now use :dtprel_whatever: operations to calculate this variable's offset
     // in its thread-storage area.
     SDValue Hi = DAG.getTargetGlobalAddress(Var, Loc, MVT::i64, 0, HiFlags);
     SDValue Lo = DAG.getTargetGlobalAddress(Var, Loc, MVT::i64, 0, LoFlags);
 
     Offset = AddImm(Offset, Hi);
     Offset = AddImm(Offset, Lo);
   } else if (Model == TLSModel::GeneralDynamic) {
     // The call needs a relocation too for linker relaxation. It doesn't make
     // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
     // the address.
     SDValue Sym =
         DAG.getTargetGlobalAddress(Var, Loc, PtrTy, 0, AArch64II::MO_TLS);
 
     // The descriptor call yields the offset from tpidr_el0.
     Offset = LowerELFTLSDescCallSeq(Sym, Loc, DAG);
   } else
     llvm_unreachable("Unsupported ELF TLS access model");
 
   return DAG.getNode(ISD::ADD, Loc, PtrTy, ThreadBase, Offset);
 }
 
 /// Lower a thread-local global address on Windows.
 ///
 /// The emitted DAG loads the TLS array pointer from offset 0x58 of the TEB
 /// (taken from X18), loads the 32-bit `_tls_index`, uses that index scaled by
 /// 8 to load the thread's TLS data area pointer out of the array, and finally
 /// adds the variable's offset from the start of the .tls section.
 SDValue
 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
                                                     SelectionDAG &DAG) const {
   assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
 
   SDLoc Loc(Op);
   EVT PtrTy = getPointerTy(DAG.getDataLayout());
   SDValue Chain = DAG.getEntryNode();
 
   SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
 
   // Load the ThreadLocalStoragePointer from the TEB
   // A pointer to the TLS array is located at offset 0x58 from the TEB.
   SDValue ArrayFieldOff = DAG.getIntPtrConstant(0x58, Loc);
   SDValue ArrayField = DAG.getNode(ISD::ADD, Loc, PtrTy, TEB, ArrayFieldOff);
   SDValue TLSArray =
       DAG.getLoad(PtrTy, Loc, Chain, ArrayField, MachinePointerInfo());
   Chain = TLSArray.getValue(1);
 
   // Load the TLS index from the C runtime;
   // This does the same as getAddr(), but without having a GlobalAddressSDNode.
   // This also does the same as LOADgot, but using a generic i32 load,
   // while LOADgot only loads i64.
   SDValue IndexPageSym =
       DAG.getTargetExternalSymbol("_tls_index", PtrTy, AArch64II::MO_PAGE);
   SDValue IndexPageOffSym = DAG.getTargetExternalSymbol(
       "_tls_index", PtrTy, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
   SDValue IndexPage = DAG.getNode(AArch64ISD::ADRP, Loc, PtrTy, IndexPageSym);
   SDValue IndexAddr =
       DAG.getNode(AArch64ISD::ADDlow, Loc, PtrTy, IndexPage, IndexPageOffSym);
   SDValue Index =
       DAG.getLoad(MVT::i32, Loc, Chain, IndexAddr, MachinePointerInfo());
   Chain = Index.getValue(1);
 
   // The pointer to the thread's TLS data area is at the TLS Index scaled by 8
   // offset into the TLSArray.
   SDValue WideIndex = DAG.getNode(ISD::ZERO_EXTEND, Loc, PtrTy, Index);
   SDValue Three = DAG.getConstant(3, Loc, PtrTy);
   SDValue SlotOff = DAG.getNode(ISD::SHL, Loc, PtrTy, WideIndex, Three);
   SDValue SlotAddr = DAG.getNode(ISD::ADD, Loc, PtrTy, TLSArray, SlotOff);
   SDValue TLSBase =
       DAG.getLoad(PtrTy, Loc, Chain, SlotAddr, MachinePointerInfo());
   Chain = TLSBase.getValue(1);
 
   const GlobalValue *Var = cast<GlobalAddressSDNode>(Op)->getGlobal();
   SDValue VarHi = DAG.getTargetGlobalAddress(
       Var, Loc, PtrTy, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
   SDValue VarLo = DAG.getTargetGlobalAddress(
       Var, Loc, PtrTy, 0,
       AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
 
   // Add the offset from the start of the .tls section (section base).
   SDValue WithHi =
       SDValue(DAG.getMachineNode(AArch64::ADDXri, Loc, PtrTy, TLSBase, VarHi,
                                  DAG.getTargetConstant(0, Loc, MVT::i32)),
               0);
   return DAG.getNode(AArch64ISD::ADDlow, Loc, PtrTy, WithHi, VarLo);
 }
 
 /// Dispatch thread-local global address lowering to the scheme that matches
 /// the target: emulated TLS when requested, otherwise the Darwin, ELF or
 /// Windows specific lowering.
 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                      SelectionDAG &DAG) const {
   const auto *GlobalAddr = cast<GlobalAddressSDNode>(Op);
 
   // Emulated TLS takes priority over any platform-native scheme.
   const bool Emulated = DAG.getTarget().useEmulatedTLS();
   if (Emulated)
     return LowerToTLSEmulatedModel(GlobalAddr, DAG);
 
   if (Subtarget->isTargetDarwin())
     return LowerDarwinGlobalTLSAddress(Op, DAG);
 
   if (Subtarget->isTargetELF())
     return LowerELFGlobalTLSAddress(Op, DAG);
 
   if (Subtarget->isTargetWindows())
     return LowerWindowsGlobalTLSAddress(Op, DAG);
 
   llvm_unreachable("Unexpected platform trying to use TLS");
 }
 
 /// Lower a BR_CC node (operands: chain, condition code, LHS, RHS, target).
 ///
 /// Tries, in order: overflow-intrinsic branches, compare-with-zero and
 /// single-bit-test branches for integers, a generic integer compare + BRCOND,
 /// and finally a floating-point compare followed by one or two BRCONDs.
 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   SDValue Chain = Op.getOperand(0);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
   SDValue LHS = Op.getOperand(2);
   SDValue RHS = Op.getOperand(3);
   SDValue Dest = Op.getOperand(4);
 
   // f128 goes first: softening turns it into a libcall whose result is
   // compared against zero, which the code below already knows how to handle.
   if (LHS.getValueType() == MVT::f128) {
     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
 
     // A null RHS means softening produced a single scalar; branch on that
     // scalar being non-zero.
     if (!RHS.getNode()) {
       RHS = DAG.getConstant(0, dl, LHS.getValueType());
       CC = ISD::SETNE;
     }
   }
 
   const bool IsEqOrNe = (CC == ISD::SETEQ || CC == ISD::SETNE);
 
   // Branch directly on the flags of {s|u}{add|sub|mul}.with.overflow.
   if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) && IsEqOrNe) {
     // Only lower legal XALUO ops.
     if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
       return SDValue();
 
     // Emit the arithmetic operation together with its overflow check.
     AArch64CC::CondCode OFCC;
     SDValue Value, Overflow;
     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
 
     // "overflow != 1" branches on the opposite condition.
     const AArch64CC::CondCode BranchCC =
         (CC == ISD::SETNE) ? getInvertedCondCode(OFCC) : OFCC;
     SDValue CCVal = DAG.getConstant(BranchCC, dl, MVT::i32);
 
     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                        Overflow);
   }
 
   if (LHS.getValueType().isInteger()) {
     assert((LHS.getValueType() == RHS.getValueType()) &&
            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
 
     const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
     const bool LHSIsAnd = (LHS.getOpcode() == ISD::AND);
 
     // Equality comparisons against zero can fold into a specialized branch.
     if (RHSC && RHSC->getZExtValue() == 0 && IsEqOrNe) {
       const bool BranchIfZero = (CC == ISD::SETEQ);
 
       // See if a TB(N)Z can fold in an AND with a single-bit mask as well.
       // TB(N)Z has a smaller branch displacement than CB(N)Z.  If the offset
       // is out of bounds, a late MI-layer pass rewrites branches.
       // 403.gcc is an example that hits this case.
       if (LHSIsAnd && isa<ConstantSDNode>(LHS.getOperand(1)) &&
           isPowerOf2_64(LHS.getConstantOperandVal(1))) {
         SDValue Test = LHS.getOperand(0);
         uint64_t Mask = LHS.getConstantOperandVal(1);
         return DAG.getNode(BranchIfZero ? AArch64ISD::TBZ : AArch64ISD::TBNZ,
                            dl, MVT::Other, Chain, Test,
                            DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
                            Dest);
       }
 
       return DAG.getNode(BranchIfZero ? AArch64ISD::CBZ : AArch64ISD::CBNZ, dl,
                          MVT::Other, Chain, LHS, Dest);
     }
 
     // "x < 0" and "x > -1" only depend on the sign bit, so test that bit.
     // Don't combine AND since emitComparison converts the AND to an ANDS
     // (a.k.a. TST) and the test in the test bit and branch instruction
     // becomes redundant.  This would also increase register pressure.
     if (RHSC && !LHSIsAnd) {
       const bool IsNegativeTest =
           (CC == ISD::SETLT && RHSC->getZExtValue() == 0);
       const bool IsNonNegativeTest =
           (CC == ISD::SETGT && RHSC->getSExtValue() == -1);
       if (IsNegativeTest || IsNonNegativeTest) {
         uint64_t Mask = LHS.getValueSizeInBits() - 1;
         return DAG.getNode(IsNegativeTest ? AArch64ISD::TBNZ : AArch64ISD::TBZ,
                            dl, MVT::Other, Chain, LHS,
                            DAG.getConstant(Mask, dl, MVT::i64), Dest);
       }
     }
 
     // Generic integer compare followed by a conditional branch.
     SDValue CCVal;
     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
     return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
                        Cmp);
   }
 
   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
          LHS.getValueType() == MVT::f64);
 
   // LLVM FP condition codes do not map one-to-one onto AArch64 condition
   // codes; some of them need two branches on the same comparison.
   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
   AArch64CC::CondCode CC1, CC2;
   changeFPCCToAArch64CC(CC, CC1, CC2);
   SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
   SDValue FirstBranch =
       DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
   if (CC2 == AArch64CC::AL)
     return FirstBranch;
 
   // The second branch is chained after the first one.
   SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
   return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, FirstBranch, Dest,
                      CC2Val, Cmp);
 }
 
 /// Lower FCOPYSIGN to an AArch64ISD::BIT node on an integer vector type.
 ///
 /// Scalars are placed into a vector register through the matching
 /// sub-register, vectors are bitcast; the per-element sign-bit mask selects
 /// which bits come from the sign operand.
 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
                                               SelectionDAG &DAG) const {
   const EVT VT = Op.getValueType();
   const SDLoc DL(Op);
 
   SDValue Magnitude = Op.getOperand(0);
   SDValue Sign = Op.getOperand(1);
 
   // Bring the sign operand to the result type first.
   const EVT SignVT = Sign.getValueType();
   if (SignVT.bitsLT(VT))
     Sign = DAG.getNode(ISD::FP_EXTEND, DL, VT, Sign);
   else if (SignVT.bitsGT(VT))
     Sign =
         DAG.getNode(ISD::FP_ROUND, DL, VT, Sign, DAG.getIntPtrConstant(0, DL));
 
   // Pick the integer vector type, the per-element sign mask and the
   // sub-register used to move scalars in and out of the vector register.
   const bool IsF64Family = (VT == MVT::f64 || VT == MVT::v2f64);
   EVT VecVT;
   uint64_t EltMask;
   int SubRegIdx;
   if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
     VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
     EltMask = 0x80000000ULL;
     SubRegIdx = AArch64::ssub;
   } else if (IsF64Family) {
     VecVT = MVT::v2i64;
     // We want a mask with the high bit set, but the AdvSIMD immediate moves
     // cannot materialize that in a single instruction for 64-bit elements.
     // Materialize zero here and negate it further down.
     EltMask = 0;
     SubRegIdx = AArch64::dsub;
   } else if (VT == MVT::f16 || VT == MVT::v4f16 || VT == MVT::v8f16) {
     VecVT = (VT == MVT::v4f16 ? MVT::v4i16 : MVT::v8i16);
     EltMask = 0x8000ULL;
     SubRegIdx = AArch64::hsub;
   } else {
     llvm_unreachable("Invalid type for copysign!");
   }
 
   // Get both operands into VecVT.
   SDValue VecVal1, VecVal2;
   if (VT.isVector()) {
     VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, Magnitude);
     VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, Sign);
   } else {
     VecVal1 = DAG.getTargetInsertSubreg(SubRegIdx, DL, VecVT,
                                         DAG.getUNDEF(VecVT), Magnitude);
     VecVal2 = DAG.getTargetInsertSubreg(SubRegIdx, DL, VecVT,
                                         DAG.getUNDEF(VecVT), Sign);
   }
 
   SDValue MaskVec = DAG.getConstant(EltMask, DL, VecVT);
 
   // For 64-bit elements the mask built above is the zero vector; negating it
   // as a floating-point vector yields the mask with only the high bit set.
   if (IsF64Family) {
     MaskVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, MaskVec);
     MaskVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, MaskVec);
     MaskVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, MaskVec);
   }
 
   SDValue Sel =
       DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, MaskVec);
 
   // Vectors are bitcast back; scalars are extracted through the same
   // sub-register they were inserted with.
   if (VT.isVector())
     return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
   return DAG.getTargetExtractSubreg(SubRegIdx, DL, VT, Sel);
 }
 
 /// Lower a scalar CTPOP through AdvSIMD, or return an empty SDValue when the
 /// function forbids implicit floating-point use or NEON is unavailable.
 SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
   const Function &F = DAG.getMachineFunction().getFunction();
   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
     return SDValue();
 
   if (!Subtarget->hasNEON())
     return SDValue();
 
   // There is no integer popcount instruction, but the following AdvSIMD
   // sequence is more efficient than the generic expansion as long as the
   // copies to/from the AdvSIMD registers are cheap:
   //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
   //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
   //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
   //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
   const SDLoc DL(Op);
   const EVT VT = Op.getValueType();
   SDValue Input = Op.getOperand(0);
 
   // Widen i32 inputs so that the value can be viewed as eight bytes.
   if (VT == MVT::i32)
     Input = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Input);
   SDValue Bytes = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Input);
 
   // Per-byte pop-counts, then sum them across the vector.
   SDValue ByteCounts = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Bytes);
   SDValue IntrinsicID =
       DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32);
   SDValue Total = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
                               IntrinsicID, ByteCounts);
 
   // The sum is produced as i32; widen it for i64 results.
   if (VT == MVT::i64)
     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Total);
   return Total;
 }
 
 /// Lower a SETCC node.  Vector results are forwarded to LowerVSETCC; scalar
 /// results are produced as CSEL nodes selecting between the constants 1 and 0.
 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   const EVT VT = Op.getValueType();
   if (VT.isVector())
     return LowerVSETCC(Op, DAG);
 
   SDLoc dl(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
 
   // We chose ZeroOrOneBooleanContents, so "true" is one and "false" is zero.
   SDValue One = DAG.getConstant(1, dl, VT);
   SDValue Zero = DAG.getConstant(0, dl, VT);
 
   // f128 is handled first: softening may leave a plain integer comparison
   // behind, which the integer path below then picks up.
   if (LHS.getValueType() == MVT::f128) {
     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
 
     // A null RHS means softening already produced the final scalar.
     if (!RHS.getNode()) {
       assert(LHS.getValueType() == Op.getValueType() &&
              "Unexpected setcc expansion!");
       return LHS;
     }
   }
 
   if (LHS.getValueType().isInteger()) {
     // Compare with the inverted condition and swap the select operands to
     // compensate.  This allows the setcc to be matched to a single CSINC.
     const ISD::CondCode InvCC = ISD::getSetCCInverse(CC, true);
     SDValue CCVal;
     SDValue Cmp = getAArch64Cmp(LHS, RHS, InvCC, CCVal, DAG, dl);
     return DAG.getNode(AArch64ISD::CSEL, dl, VT, Zero, One, CCVal, Cmp);
   }
 
   // Only FP values remain at this point.
   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
          LHS.getValueType() == MVT::f64);
 
   // Emit the comparison; the result is then picked with one or two CSELs.
   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
 
   AArch64CC::CondCode CC1, CC2;
   changeFPCCToAArch64CC(CC, CC1, CC2);
   if (CC2 != AArch64CC::AL) {
     // The mapping of LLVM FP CC's onto AArch64 CC's isn't totally clean and
     // this one needs two CSELs: emit the first and feed its output into the
     // second as the false operand, effectively OR'ing the two CC's together.
 
     // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
     SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
     SDValue FirstSel =
         DAG.getNode(AArch64ISD::CSEL, dl, VT, One, Zero, CC1Val, Cmp);
 
     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
     return DAG.getNode(AArch64ISD::CSEL, dl, VT, One, FirstSel, CC2Val, Cmp);
   }
 
   // A single condition suffices.  Use the inverted condition with swapped
   // select operands so that the setcc can be matched to a single CSINC.
   changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
   SDValue InvCCVal = DAG.getConstant(CC1, dl, MVT::i32);
   return DAG.getNode(AArch64ISD::CSEL, dl, VT, Zero, One, InvCCVal, Cmp);
 }
 
 /// Lower "LHS <CC> RHS ? TVal : FVal" to AArch64 conditional-select nodes.
 ///
 /// \param CC    condition code of the comparison.
 /// \param LHS   left-hand side of the comparison.
 /// \param RHS   right-hand side of the comparison.
 /// \param TVal  value selected when the comparison holds.
 /// \param FVal  value selected when the comparison does not hold.
 /// \param dl    debug location for the created nodes.
 /// \param DAG   the DAG in which the nodes are created.
 /// \returns a CSEL, CSINV, CSNEG or CSINC node for integer comparisons, or
 ///          one or two CSEL nodes for floating-point comparisons.
 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
                                               SDValue RHS, SDValue TVal,
                                               SDValue FVal, const SDLoc &dl,
                                               SelectionDAG &DAG) const {
   // Handle f128 first, because it will result in a comparison of some RTLIB
   // call result against zero.
   if (LHS.getValueType() == MVT::f128) {
     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
 
     // If softenSetCCOperands returned a scalar, we need to compare the result
     // against zero to select between true and false values.
     if (!RHS.getNode()) {
       RHS = DAG.getConstant(0, dl, LHS.getValueType());
       CC = ISD::SETNE;
     }
   }
 
   // Also handle f16, for which we need to do a f32 comparison.
   if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
     LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
     RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
   }
 
   // Next, handle integers.
   if (LHS.getValueType().isInteger()) {
     assert((LHS.getValueType() == RHS.getValueType()) &&
            (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
 
     unsigned Opcode = AArch64ISD::CSEL;
 
     // If both the TVal and the FVal are constants, see if we can swap them in
     // order to form a CSINV or CSINC out of them.
     ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
     ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
 
     if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
       std::swap(TVal, FVal);
       std::swap(CTVal, CFVal);
       CC = ISD::getSetCCInverse(CC, true);
     } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
       std::swap(TVal, FVal);
       std::swap(CTVal, CFVal);
       CC = ISD::getSetCCInverse(CC, true);
     } else if (TVal.getOpcode() == ISD::XOR) {
       // If TVal is a NOT we want to swap TVal and FVal so that we can match
       // with a CSINV rather than a CSEL.
       if (isAllOnesConstant(TVal.getOperand(1))) {
         std::swap(TVal, FVal);
         std::swap(CTVal, CFVal);
         CC = ISD::getSetCCInverse(CC, true);
       }
     } else if (TVal.getOpcode() == ISD::SUB) {
       // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
       // that we can match with a CSNEG rather than a CSEL.
       if (isNullConstant(TVal.getOperand(0))) {
         std::swap(TVal, FVal);
         std::swap(CTVal, CFVal);
         CC = ISD::getSetCCInverse(CC, true);
       }
     } else if (CTVal && CFVal) {
       const int64_t TrueVal = CTVal->getSExtValue();
       const int64_t FalseVal = CFVal->getSExtValue();
       // Unsigned copies for the negation/increment checks below.  Unsigned
       // arithmetic wraps around, whereas "-FalseVal", "FalseVal + 1" and
       // "TrueVal + 1" on int64_t would be undefined behavior when the
       // constant is INT64_MIN or INT64_MAX.
       const uint64_t TrueVal64 = static_cast<uint64_t>(TrueVal);
       const uint64_t FalseVal64 = static_cast<uint64_t>(FalseVal);
       bool Swap = false;
 
       // If both TVal and FVal are constants, see if FVal is the
       // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
       // instead of a CSEL in that case.
       if (TrueVal == ~FalseVal) {
         Opcode = AArch64ISD::CSINV;
       } else if (TrueVal64 == 0 - FalseVal64) {
         Opcode = AArch64ISD::CSNEG;
       } else if (TVal.getValueType() == MVT::i32) {
         // If our operands are only 32-bit wide, make sure we use 32-bit
         // arithmetic for the check whether we can use CSINC. This ensures that
         // the addition in the check will wrap around properly in case there is
         // an overflow (which would not be the case if we do the check with
         // 64-bit arithmetic).
         const uint32_t TrueVal32 = CTVal->getZExtValue();
         const uint32_t FalseVal32 = CFVal->getZExtValue();
 
         if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
           Opcode = AArch64ISD::CSINC;
 
           if (TrueVal32 > FalseVal32) {
             Swap = true;
           }
         }
       } else if ((TrueVal64 == FalseVal64 + 1) ||
                  (TrueVal64 + 1 == FalseVal64)) {
         // 64-bit check whether we can use CSINC.  Constant pairs that are
         // only adjacent through wrap-around never get here: they already
         // matched the all-ones/zero swap or the CSINV check above.
         Opcode = AArch64ISD::CSINC;
 
         if (TrueVal > FalseVal) {
           Swap = true;
         }
       }
 
       // Swap TVal and FVal if necessary.
       if (Swap) {
         std::swap(TVal, FVal);
         std::swap(CTVal, CFVal);
         CC = ISD::getSetCCInverse(CC, true);
       }
 
       if (Opcode != AArch64ISD::CSEL) {
         // Drop FVal since we can get its value by simply inverting/negating
         // TVal.
         FVal = TVal;
       }
     }
 
     // Avoid materializing a constant when possible by reusing a known value in
     // a register.  However, don't perform this optimization if the known value
     // is one, zero or negative one in the case of a CSEL.  We can always
     // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
     // FVal, respectively.
     ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
     if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
         !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
       // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
       // "a != C ? x : a" to avoid materializing C.
       if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
         TVal = LHS;
       else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
         FVal = LHS;
     } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
       assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
       // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
       // avoid materializing C.
       AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
       if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
         Opcode = AArch64ISD::CSINV;
         TVal = LHS;
         FVal = DAG.getConstant(0, dl, FVal.getValueType());
       }
     }
 
     SDValue CCVal;
     SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
     EVT VT = TVal.getValueType();
     return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
   }
 
   // Now we know we're dealing with FP values.
   assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
          LHS.getValueType() == MVT::f64);
   assert(LHS.getValueType() == RHS.getValueType());
   EVT VT = TVal.getValueType();
   SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
 
   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
   // clean.  Some of them require two CSELs to implement.
   AArch64CC::CondCode CC1, CC2;
   changeFPCCToAArch64CC(CC, CC1, CC2);
 
   if (DAG.getTarget().Options.UnsafeFPMath) {
     // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
     // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
     ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
     if (RHSVal && RHSVal->isZero()) {
       ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
       ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
 
       if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
           CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
         TVal = LHS;
       else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
                CFVal && CFVal->isZero() &&
                FVal.getValueType() == LHS.getValueType())
         FVal = LHS;
     }
   }
 
   // Emit first, and possibly only, CSEL.
   SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
   SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
 
   // If we need a second CSEL, emit it, using the output of the first as the
   // RHS.  We're effectively OR'ing the two CC's together.
   if (CC2 != AArch64CC::AL) {
     SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
     return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
   }
 
   // Otherwise, return the output of the first CSEL.
   return CS1;
 }
 
 /// Lower a SELECT_CC node by unpacking its operands (LHS, RHS, true value,
 /// false value, condition code) and forwarding to the operand-wise overload.
 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
                                               SelectionDAG &DAG) const {
   const ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
   const SDLoc DL(Op);
   return LowerSELECT_CC(CC, /*LHS=*/Op.getOperand(0), /*RHS=*/Op.getOperand(1),
                         /*TVal=*/Op.getOperand(2), /*FVal=*/Op.getOperand(3),
                         DL, DAG);
 }
 
 /// Lower a SELECT node (operands: condition, true value, false value).
 ///
 /// A condition that is the overflow result of an arithmetic-with-overflow
 /// node is selected on directly; everything else is rewritten as a SELECT_CC.
 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDLoc DL(Op);
   SDValue Cond = Op->getOperand(0);
   SDValue TVal = Op->getOperand(1);
   SDValue FVal = Op->getOperand(2);
 
   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
   // instruction.
   if (isOverflowIntrOpRes(Cond)) {
     // Only lower legal XALUO ops.
     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
       return SDValue();
 
     AArch64CC::CondCode OFCC;
     SDValue Value, Overflow;
     std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, Cond.getValue(0), DAG);
     SDValue OFCCVal = DAG.getConstant(OFCC, DL, MVT::i32);
 
     return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
                        OFCCVal, Overflow);
   }
 
   // Anything that is not a SETCC is treated as "Cond != 0".
   if (Cond.getOpcode() != ISD::SETCC) {
     SDValue Zero = DAG.getConstant(0, DL, Cond.getValueType());
     return LowerSELECT_CC(ISD::SETNE, Cond, Zero, TVal, FVal, DL, DAG);
   }
 
   // Lower a SETCC condition the same way as a SELECT_CC node by pulling the
   // comparison operands and the condition code out of it.
   const ISD::CondCode CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
   return LowerSELECT_CC(CC, Cond.getOperand(0), Cond.getOperand(1), TVal, FVal,
                         DL, DAG);
 }
 
 /// Produce the address of a jump table.  Entries are PC-relative offsets, so
 /// nothing beyond the table address itself has to be computed here.
 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
                                               SelectionDAG &DAG) const {
   JumpTableSDNode *Table = cast<JumpTableSDNode>(Op);
 
   // The large code model uses the large addressing sequence, except on
   // MachO targets.
   const bool UseLargeAddr =
       getTargetMachine().getCodeModel() == CodeModel::Large &&
       !Subtarget->isTargetMachO();
   if (UseLargeAddr)
     return getAddrLarge(Table, DAG);
 
   return getAddr(Table, DAG);
 }
 
 /// Produce the address of a constant pool entry, picking the addressing
 /// sequence from the code model and object format.
 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
                                                  SelectionDAG &DAG) const {
   ConstantPoolSDNode *Entry = cast<ConstantPoolSDNode>(Op);
 
   // Every code model other than large uses the regular sequence.
   if (getTargetMachine().getCodeModel() != CodeModel::Large)
     return getAddr(Entry, DAG);
 
   // Use the GOT for the large code model on iOS.
   if (Subtarget->isTargetMachO())
     return getGOT(Entry, DAG);
 
   return getAddrLarge(Entry, DAG);
 }
 
 /// Produce the address of a basic block.  The large addressing sequence is
 /// used for the large code model on non-MachO targets, the regular one
 /// otherwise.
 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
                                                  SelectionDAG &DAG) const {
   BlockAddressSDNode *Block = cast<BlockAddressSDNode>(Op);
 
   const bool UseLargeAddr =
       getTargetMachine().getCodeModel() == CodeModel::Large &&
       !Subtarget->isTargetMachO();
   if (UseLargeAddr)
     return getAddrLarge(Block, DAG);
 
   return getAddr(Block, DAG);
 }
 
 /// Lower VASTART for Darwin: store the address of the variadic stack area
 /// into the va_list object pointed to by operand 1.
 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
                                                    SelectionDAG &DAG) const {
   const SDLoc DL(Op);
   const AArch64FunctionInfo *FuncInfo =
       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
 
   const int StackFI = FuncInfo->getVarArgsStackIndex();
   SDValue StackAddr =
       DAG.getFrameIndex(StackFI, getPointerTy(DAG.getDataLayout()));
 
   SDValue Chain = Op.getOperand(0);
   SDValue VAListPtr = Op.getOperand(1);
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   return DAG.getStore(Chain, DL, StackAddr, VAListPtr, MachinePointerInfo(SV));
 }
 
 /// Lower VASTART for the Win64 calling convention: store the start of the
 /// variadic area into the va_list object pointed to by operand 1.  The GPR
 /// save area is used when it is non-empty, the stack area otherwise.
 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
                                                   SelectionDAG &DAG) const {
   const SDLoc DL(Op);
   const AArch64FunctionInfo *FuncInfo =
       DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
 
   const int StartFI = FuncInfo->getVarArgsGPRSize() > 0
                           ? FuncInfo->getVarArgsGPRIndex()
                           : FuncInfo->getVarArgsStackIndex();
   SDValue StartAddr =
       DAG.getFrameIndex(StartFI, getPointerTy(DAG.getDataLayout()));
 
   SDValue Chain = Op.getOperand(0);
   SDValue VAListPtr = Op.getOperand(1);
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   return DAG.getStore(Chain, DL, StartAddr, VAListPtr, MachinePointerInfo(SV));
 }
 
 /// Lower VASTART for AAPCS by initializing the fields of the va_list struct
 /// (layout per the AArch64 Procedure Call Standard, section B.3):
 ///   offset  0: void *__stack
 ///   offset  8: void *__gr_top
 ///   offset 16: void *__vr_top
 ///   offset 24: int   __gr_offs
 ///   offset 28: int   __vr_offs
 /// All stores hang off the incoming chain and are joined by a TokenFactor.
 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
                                                   SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   const auto PtrVT = getPointerTy(DAG.getDataLayout());
   SDLoc DL(Op);
 
   SDValue Chain = Op.getOperand(0);
   SDValue VAList = Op.getOperand(1);
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   SmallVector<SDValue, 4> MemOps;
 
   // Address of the va_list field located Offset bytes into the struct.
   auto FieldAddr = [&](uint64_t Offset) {
     return DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                        DAG.getConstant(Offset, DL, PtrVT));
   };
 
   // Queue a store of the end address (frame index + size) of a register save
   // area into the pointer field at FieldOffset.
   auto PushAreaTopStore = [&](int AreaFI, int AreaSize, int64_t FieldOffset) {
     SDValue FieldPtr = FieldAddr(FieldOffset);
     SDValue Top = DAG.getFrameIndex(AreaFI, PtrVT);
     Top = DAG.getNode(ISD::ADD, DL, PtrVT, Top,
                       DAG.getConstant(AreaSize, DL, PtrVT));
     MemOps.push_back(DAG.getStore(Chain, DL, Top, FieldPtr,
                                   MachinePointerInfo(SV, FieldOffset),
                                   /* Alignment = */ 8));
   };
 
   // void *__stack at offset 0
   SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
   MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
                                 MachinePointerInfo(SV), /* Alignment = */ 8));
 
   // void *__gr_top at offset 8, only written when GPRs were saved.
   const int GPRSize = FuncInfo->getVarArgsGPRSize();
   if (GPRSize > 0)
     PushAreaTopStore(FuncInfo->getVarArgsGPRIndex(), GPRSize, 8);
 
   // void *__vr_top at offset 16, only written when FPRs were saved.
   const int FPRSize = FuncInfo->getVarArgsFPRSize();
   if (FPRSize > 0)
     PushAreaTopStore(FuncInfo->getVarArgsFPRIndex(), FPRSize, 16);
 
   // int __gr_offs at offset 24: negated size of the GPR save area.
   SDValue GROffsAddr = FieldAddr(24);
   MemOps.push_back(DAG.getStore(
       Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
       MachinePointerInfo(SV, 24), /* Alignment = */ 4));
 
   // int __vr_offs at offset 28: negated size of the FPR save area.
   SDValue VROffsAddr = FieldAddr(28);
   MemOps.push_back(DAG.getStore(
       Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
       MachinePointerInfo(SV, 28), /* Alignment = */ 4));
 
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
 }
 
 /// Dispatch VASTART lowering: the Win64 calling convention is checked first,
 /// then Darwin, and everything else uses the AAPCS va_list layout.
 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
                                             SelectionDAG &DAG) const {
   const MachineFunction &MF = DAG.getMachineFunction();
   const auto CallConv = MF.getFunction().getCallingConv();
 
   if (Subtarget->isCallingConvWin64(CallConv))
     return LowerWin64_VASTART(Op, DAG);
 
   if (Subtarget->isTargetDarwin())
     return LowerDarwin_VASTART(Op, DAG);
 
   return LowerAAPCS_VASTART(Op, DAG);
 }
 
 /// Lower VACOPY to a memcpy of the whole va_list object from the source
 /// (operand 2) to the destination (operand 1).
 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDLoc DL(Op);
 
   // The AAPCS va_list holds three pointers and two ints (= 32 bytes); on
   // Darwin and Windows it is a single pointer (8 bytes).
   const bool IsSinglePointerVaList =
       Subtarget->isTargetDarwin() || Subtarget->isTargetWindows();
   unsigned VaListSize = IsSinglePointerVaList ? 8 : 32;
 
   const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   SDValue SizeVal = DAG.getConstant(VaListSize, DL, MVT::i32);
 
   return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1),
                        Op.getOperand(2), SizeVal,
                        8, false, false, false, MachinePointerInfo(DestSV),
                        MachinePointerInfo(SrcSV));
 }
 
 /// Lower VAARG for Darwin, where va_list is a single pointer.
 ///
 /// Loads the current argument pointer, aligns it when the requested alignment
 /// exceeds 8, stores the pointer advanced past the argument back to the
 /// va_list, and finally loads the argument itself.
 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   assert(Subtarget->isTargetDarwin() &&
          "automatic va_arg instruction only works on Darwin");
 
   SDLoc DL(Op);
   const EVT VT = Op.getValueType();
   const auto PtrVT = getPointerTy(DAG.getDataLayout());
   const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   SDValue Addr = Op.getOperand(1);
   unsigned Align = Op.getConstantOperandVal(3);
 
   // Fetch the current argument pointer out of the va_list.
   SDValue VAList =
       DAG.getLoad(PtrVT, DL, Op.getOperand(0), Addr, MachinePointerInfo(V));
   SDValue Chain = VAList.getValue(1);
 
   // Round the pointer up for over-aligned arguments.
   if (Align > 8) {
     assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
     SDValue Bumped = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                                  DAG.getConstant(Align - 1, DL, PtrVT));
     VAList = DAG.getNode(ISD::AND, DL, PtrVT, Bumped,
                          DAG.getConstant(-(int64_t)Align, DL, PtrVT));
   }
 
   Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
   uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
 
   // Scalar integer and FP values smaller than 64 bits are implicitly extended
   // up to 64 bits.  The stride through the vaargs list has to match that, and
   // FP values other than f64 additionally need an FP_ROUND after being loaded
   // as f64.
   const bool IsScalar = !VT.isVector();
   const bool NeedFPTrunc =
       VT.isFloatingPoint() && IsScalar && VT != MVT::f64;
   if ((VT.isInteger() && IsScalar) || NeedFPTrunc)
     ArgSize = 8;
 
   // Advance the pointer past this argument and write it back to the va_list.
   SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
                                DAG.getConstant(ArgSize, DL, PtrVT));
   SDValue APStore =
       DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
 
   // Load the actual argument; the load is chained after the write-back.
   if (!NeedFPTrunc)
     return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
 
   // The value was passed as an f64: load it as such and round it to VT.
   SDValue WideFP =
       DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
   SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
                                  DAG.getIntPtrConstant(1, DL));
   // Merge the rounded value with the chain output of the load.
   SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
   return DAG.getMergeValues(Ops, DL);
 }
 
 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
                                               SelectionDAG &DAG) const {
   // Record on the frame info that the frame address is taken.
   DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
 
   SDLoc DL(Op);
   const EVT VT = Op.getValueType();
   const unsigned Depth =
       cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 
   // Depth 0 is the value of FP itself; every further level loads through the
   // address produced by the previous level.
   SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
   for (unsigned Level = 0; Level != Depth; ++Level)
     Frame =
         DAG.getLoad(VT, DL, DAG.getEntryNode(), Frame, MachinePointerInfo());
   return Frame;
 }
 
 // FIXME? Maybe this could be a TableGen attribute on some registers and
 // this table could be generated automatically from RegInfo.
 //
 // Resolves a register name to its register number. Only "sp" and the
 // x18/w18 and x20/w20 registers are recognised, and the latter two pairs are
 // accepted only when the subtarget reports them as reserved. Any other
 // request is a fatal error.
 unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                                   SelectionDAG &DAG) const {
   const unsigned Candidate = StringSwitch<unsigned>(RegName)
                                  .Case("sp", AArch64::SP)
                                  .Case("x18", AArch64::X18)
                                  .Case("w18", AArch64::W18)
                                  .Case("x20", AArch64::X20)
                                  .Case("w20", AArch64::W20)
                                  .Default(0);
 
   const bool IsR18 = Candidate == AArch64::X18 || Candidate == AArch64::W18;
   const bool IsR20 = Candidate == AArch64::X20 || Candidate == AArch64::W20;
 
   // An unknown name is never usable; x18/x20 are usable only when reserved.
   bool Usable = Candidate != 0;
   if (IsR18)
     Usable = Subtarget->isX18Reserved();
   else if (IsR20)
     Usable = Subtarget->isX20Reserved();
 
   if (Usable)
     return Candidate;
   report_fatal_error(Twine("Invalid register name \""
                               + StringRef(RegName)  + "\"."));
 }
 
 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
                                                SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MF.getFrameInfo().setReturnAddressIsTaken(true);
 
   SDLoc DL(Op);
   const EVT VT = Op.getValueType();
   const unsigned Depth =
       cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 
   // Depth 0: the return address is in LR. Register it as a live-in and copy
   // it out.
   if (Depth == 0) {
     unsigned LiveInLR = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
     return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LiveInLR, VT);
   }
 
   // Deeper frames: take the frame address for that depth and load the value
   // stored 8 bytes above it.
   SDValue Frame = LowerFRAMEADDR(Op, DAG);
   SDValue SlotOffset =
       DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
   SDValue SlotAddr = DAG.getNode(ISD::ADD, DL, VT, Frame, SlotOffset);
   return DAG.getLoad(VT, DL, DAG.getEntryNode(), SlotAddr,
                      MachinePointerInfo());
 }
 
 /// LowerShiftRightParts - Lower SRA_PARTS or SRL_PARTS (the only opcodes the
 /// assertion below accepts): a right shift of a double-width value supplied
 /// as two halves. Operand 0 is the low half, operand 1 the high half and
 /// operand 2 the shift amount. The result is the merged pair { Lo, Hi }.
 /// SRA_PARTS shifts the high half arithmetically, SRL_PARTS logically.
 SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
                                                     SelectionDAG &DAG) const {
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   EVT VT = Op.getValueType();
   unsigned VTBits = VT.getSizeInBits();
   SDLoc dl(Op);
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt = Op.getOperand(2);
   // Shift opcode applied to the high half: SRA for SRA_PARTS, else SRL.
   unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
 
   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
 
   // Bits of the high half that move into the low half:
   // ShOpHi << (VTBits - ShAmt).
   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                  DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
   SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
 
   // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
   // is "undef". We wanted 0, so CSEL it directly.
   SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
                                ISD::SETEQ, dl, DAG);
   SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
   HiBitsForLo =
       DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
                   HiBitsForLo, CCVal, Cmp);
 
   // ShAmt - VTBits: non-negative exactly when the shift amount is at least
   // the width of one half (the "big shift" case).
   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                    DAG.getConstant(VTBits, dl, MVT::i64));
 
   // Low half for a shift smaller than VTBits: (ShOpLo >> ShAmt) | HiBitsForLo.
   SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
   SDValue LoForNormalShift =
       DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
 
   // From here on Cmp/CCVal describe "ExtraShAmt >= 0" and are shared by the
   // Lo and Hi selects below.
   Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
                        dl, DAG);
   CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
   // Low half for a big shift: the high half shifted by the excess amount.
   SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
   SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
                            LoForNormalShift, CCVal, Cmp);
 
   // AArch64 shifts larger than the register width are wrapped rather than
   // clamped, so we can't just emit "hi >> x".
   SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
   // High half for a big shift: the high half shifted by VTBits - 1 for SRA,
   // or constant zero for SRL.
   SDValue HiForBigShift =
       Opc == ISD::SRA
           ? DAG.getNode(Opc, dl, VT, ShOpHi,
                         DAG.getConstant(VTBits - 1, dl, MVT::i64))
           : DAG.getConstant(0, dl, VT);
   SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
                            HiForNormalShift, CCVal, Cmp);
 
   SDValue Ops[2] = { Lo, Hi };
   return DAG.getMergeValues(Ops, dl);
 }
 
 /// LowerShiftLeftParts - Lower SHL_PARTS: a left shift of a double-width
 /// value supplied as two halves. Operand 0 is the low half, operand 1 the
 /// high half and operand 2 the shift amount. The result is the merged pair
 /// { Lo, Hi }.
 SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
                                                    SelectionDAG &DAG) const {
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   EVT VT = Op.getValueType();
   unsigned VTBits = VT.getSizeInBits();
   SDLoc dl(Op);
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt = Op.getOperand(2);
 
   assert(Op.getOpcode() == ISD::SHL_PARTS);
   // Bits of the low half that move into the high half:
   // ShOpLo >> (VTBits - ShAmt).
   SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
                                  DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
   SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
 
   // Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
   // is "undef". We wanted 0, so CSEL it directly.
   SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
                                ISD::SETEQ, dl, DAG);
   SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
   LoBitsForHi =
       DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
                   LoBitsForHi, CCVal, Cmp);
 
   // ShAmt - VTBits: non-negative exactly when the shift amount is at least
   // the width of one half (the "big shift" case).
   SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
                                    DAG.getConstant(VTBits, dl, MVT::i64));
   // High half for a shift smaller than VTBits: LoBitsForHi | (ShOpHi << ShAmt).
   SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
   SDValue HiForNormalShift =
       DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
 
   // High half for a big shift: the low half shifted by the excess amount.
   SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
 
   // From here on Cmp/CCVal describe "ExtraShAmt >= 0" and are shared by the
   // Hi and Lo selects below.
   Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
                        dl, DAG);
   CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
   SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
                            HiForNormalShift, CCVal, Cmp);
 
   // AArch64 shifts of larger than register sizes are wrapped rather than
   // clamped, so we can't just emit "lo << a" if a is too big.
   SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
   SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
   SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
                            LoForNormalShift, CCVal, Cmp);
 
   SDValue Ops[2] = { Lo, Hi };
   return DAG.getMergeValues(Ops, dl);
 }
 
 bool AArch64TargetLowering::isOffsetFoldingLegal(
     const GlobalAddressSDNode *GA) const {
   // Always decline here: offset folding is performed by the DAG combine
   // instead, where the uses are visible and a suitable offset can be picked.
   constexpr bool FoldOffsetsHere = false;
   return FoldOffsetsHere;
 }
 
 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
   // FIXME: We should be able to handle f128 as well with a clever lowering.
   if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32 ||
                           (VT == MVT::f16 && Subtarget->hasFullFP16()))) {
     LLVM_DEBUG(
         dbgs() << "Legal fp imm: materialize 0 using the zero register\n");
     return true;
   }
 
   StringRef FPType;
   bool IsLegal = false;
   SmallString<128> ImmStrVal;
   Imm.toString(ImmStrVal);
 
   if (VT == MVT::f64) {
     FPType = "f64";
     IsLegal = AArch64_AM::getFP64Imm(Imm) != -1;
   } else if (VT == MVT::f32) {
     FPType = "f32";
     IsLegal = AArch64_AM::getFP32Imm(Imm) != -1;
   } else if (VT == MVT::f16 && Subtarget->hasFullFP16()) {
     FPType = "f16";
     IsLegal = AArch64_AM::getFP16Imm(Imm) != -1;
   }
 
   if (IsLegal) {
     LLVM_DEBUG(dbgs() << "Legal " << FPType << " imm value: " << ImmStrVal
                       << "\n");
     return true;
   }
 
   if (!FPType.empty())
     LLVM_DEBUG(dbgs() << "Illegal " << FPType << " imm value: " << ImmStrVal
                       << "\n");
   else
     LLVM_DEBUG(dbgs() << "Illegal fp imm " << ImmStrVal
                       << ": unsupported fp type\n");
 
   return false;
 }
 
 //===----------------------------------------------------------------------===//
 //                          AArch64 Optimization Hooks
 //===----------------------------------------------------------------------===//
 
 /// Builds the estimate node \p Opcode for \p Operand when the subtarget has
 /// NEON and the operand is one of the handled f32/f64 scalar or vector
 /// types; returns an empty SDValue otherwise. If \p ExtraSteps arrives as
 /// "Unspecified" it is replaced with a default refinement count.
 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
                            SDValue Operand, SelectionDAG &DAG,
                            int &ExtraSteps) {
   const EVT VT = Operand.getValueType();
   const bool IsDoubleTy =
       VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64;
   const bool IsSingleTy = VT == MVT::f32 || VT == MVT::v1f32 ||
                           VT == MVT::v2f32 || VT == MVT::v4f32;
   if (!ST->hasNEON() || !(IsDoubleTy || IsSingleTy))
     return SDValue();
 
   // For the reciprocal estimates, convergence is quadratic, so the number of
   // correct digits doubles with each iteration. In ARMv8 the accuracy of the
   // initial estimate is 2^-8, so float (23 mantissa bits) needs 2 extra
   // steps and double (52 mantissa bits) needs 3.
   if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified) {
     if (VT.getScalarType() == MVT::f64)
       ExtraSteps = 3;
     else
       ExtraSteps = 2;
   }
 
   return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
 }
 
 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
                                                SelectionDAG &DAG, int Enabled,
                                                int &ExtraSteps,
                                                bool &UseOneConst,
                                                bool Reciprocal) const {
   // Estimates are used when explicitly enabled, or when left unspecified and
   // the subtarget prefers the reciprocal square root sequence.
   const bool WantEstimate =
       Enabled == ReciprocalEstimate::Enabled ||
       (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt());
   if (!WantEstimate)
     return SDValue();
 
   SDValue Estimate =
       getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand, DAG, ExtraSteps);
   if (!Estimate.getNode())
     return SDValue();
 
   SDLoc DL(Operand);
   EVT VT = Operand.getValueType();
 
   SDNodeFlags Flags;
   Flags.setAllowReassociation(true);
 
   // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
   // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
   for (int Remaining = ExtraSteps; Remaining > 0; --Remaining) {
     SDValue Squared =
         DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate, Flags);
     SDValue Correction =
         DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Squared, Flags);
     Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Correction, Flags);
   }
 
   // For a plain square root, multiply the reciprocal estimate by the operand
   // and select the operand itself wherever it compares equal to 0.0.
   if (!Reciprocal) {
     EVT CCVT =
         getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
     SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
     SDValue IsZero = DAG.getSetCC(DL, CCVT, Operand, Zero, ISD::SETEQ);
 
     Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
     const unsigned SelectOpc = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
     Estimate = DAG.getNode(SelectOpc, DL, VT, IsZero, Operand, Estimate);
   }
 
   // All refinement has been emitted here, so report zero remaining steps.
   ExtraSteps = 0;
   return Estimate;
 }
 
 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
                                                 SelectionDAG &DAG, int Enabled,
                                                 int &ExtraSteps) const {
   // Reciprocal estimates are emitted only when explicitly enabled.
   if (Enabled != ReciprocalEstimate::Enabled)
     return SDValue();
 
   SDValue Estimate =
       getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, DAG, ExtraSteps);
   if (!Estimate.getNode())
     return SDValue();
 
   SDLoc DL(Operand);
   EVT VT = Operand.getValueType();
 
   SDNodeFlags Flags;
   Flags.setAllowReassociation(true);
 
   // Newton reciprocal iteration: E * (2 - X * E)
   // AArch64 reciprocal iteration instruction: (2 - M * N)
   for (int Remaining = ExtraSteps; Remaining > 0; --Remaining) {
     SDValue Correction =
         DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, Estimate, Flags);
     Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Correction, Flags);
   }
 
   // All refinement has been emitted here, so report zero remaining steps.
   ExtraSteps = 0;
   return Estimate;
 }
 
 //===----------------------------------------------------------------------===//
 //                          AArch64 Inline Assembly Support
 //===----------------------------------------------------------------------===//
 
 // Table of Constraints
 // TODO: This is the current set of constraints supported by ARM for the
 // compiler, not all of them may make sense.
 //
 // r - A general register
 // w - An FP/SIMD register of some size in the range v0-v31
 // x - An FP/SIMD register of some size in the range v0-v15
 // I - Constant that can be used with an ADD instruction
 // J - Constant that can be used with a SUB instruction
 // K - Constant that can be used with a 32-bit logical instruction
 // L - Constant that can be used with a 64-bit logical instruction
 // M - Constant that can be used as a 32-bit MOV immediate
 // N - Constant that can be used as a 64-bit MOV immediate
 // Q - A memory reference with base register and no offset
 // S - A symbolic address
 // Y - Floating point constant zero
 // Z - Integer constant zero
 //
 //   Note that general register operands will be output using their 64-bit x
 // register name, whatever the size of the variable, unless the asm operand
 // is prefixed by the %w modifier. Floating-point and SIMD register operands
 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
 // %q modifier.
 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
   // The "X" constraint has to be turned into something concrete, so it is
   // mapped to "r" or "w". That forces the operand into a register even
   // though "X" itself is far more permissive.
   //
   // This is correct (anything may be emitted for an unconstrained operand),
   // but it can disappoint users who expected a cheaper form.
   if (!Subtarget->hasFPARMv8())
     return "r";
 
   // Floating-point values, and vectors that are exactly 64 or 128 bits wide,
   // go to the FP/SIMD class; everything else is a general register.
   bool UseFPRegister = ConstraintVT.isFloatingPoint();
   if (!UseFPRegister && ConstraintVT.isVector()) {
     const auto Bits = ConstraintVT.getSizeInBits();
     UseFPRegister = Bits == 64 || Bits == 128;
   }
 
   return UseFPRegister ? "w" : "r";
 }
 
 /// getConstraintType - Classify a constraint string for this target.
 /// Single-letter constraints handled here are 'z' and 'S' (C_Other), 'x' and
 /// 'w' (C_RegisterClass) and 'Q' (C_Memory); everything else is delegated to
 /// the generic TargetLowering implementation.
 AArch64TargetLowering::ConstraintType
 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
   if (Constraint.size() != 1)
     return TargetLowering::getConstraintType(Constraint);
 
   const char Letter = Constraint[0];
 
   // 'z' and 'S' (a symbolic address).
   if (Letter == 'z' || Letter == 'S')
     return C_Other;
 
   if (Letter == 'x' || Letter == 'w')
     return C_RegisterClass;
 
   // An address with a single base register. Due to the way we currently
   // handle addresses it is the same as 'r'.
   if (Letter == 'Q')
     return C_Memory;
 
   return TargetLowering::getConstraintType(Constraint);
 }
 
 /// Compute how well the operand described by \p info matches the single
 /// constraint \p constraint. \p info must already carry the operand type and
 /// the currently selected alternative constraint.
 TargetLowering::ConstraintWeight
 AArch64TargetLowering::getSingleConstraintMatchWeight(
     AsmOperandInfo &info, const char *constraint) const {
   // Without an operand value no real match is possible; accept it at the
   // lowest weight.
   Value *Operand = info.CallOperandVal;
   if (!Operand)
     return CW_Default;
 
   Type *OperandTy = Operand->getType();
   switch (*constraint) {
   case 'w':
   case 'x':
     // FP/SIMD register constraints only match FP or vector operands.
     return (OperandTy->isFloatingPointTy() || OperandTy->isVectorTy())
                ? CW_Register
                : CW_Invalid;
   case 'z':
     return CW_Constant;
   default:
     return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
   }
 }
 
 /// Map an inline-asm register constraint to a (register, register class)
 /// pair.
 ///
 /// Single-letter constraints pick a class from the size of \p VT:
 ///   'r' - GPR64common for 64-bit values, GPR32common otherwise;
 ///   'w' - FPR16/FPR32/FPR64/FPR128 for 16/32/64/128-bit values;
 ///   'x' - FPR128_lo for 128-bit values.
 /// "{cc}" (any case) names NZCV. Everything else goes to the generic
 /// implementation; if that finds no class, "{vN}" with N in [0, 31] is
 /// resolved to the N-th register of FPR64 (64-bit \p VT) or FPR128.
 std::pair<unsigned, const TargetRegisterClass *>
 AArch64TargetLowering::getRegForInlineAsmConstraint(
     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     case 'r':
       if (VT.getSizeInBits() == 64)
         return std::make_pair(0U, &AArch64::GPR64commonRegClass);
       return std::make_pair(0U, &AArch64::GPR32commonRegClass);
     case 'w':
       if (VT.getSizeInBits() == 16)
         return std::make_pair(0U, &AArch64::FPR16RegClass);
       if (VT.getSizeInBits() == 32)
         return std::make_pair(0U, &AArch64::FPR32RegClass);
       if (VT.getSizeInBits() == 64)
         return std::make_pair(0U, &AArch64::FPR64RegClass);
       if (VT.getSizeInBits() == 128)
         return std::make_pair(0U, &AArch64::FPR128RegClass);
       break;
     // The instructions that this constraint is designed for can
     // only take 128-bit registers so just use that regclass.
     case 'x':
       if (VT.getSizeInBits() == 128)
         return std::make_pair(0U, &AArch64::FPR128_loRegClass);
       break;
     }
   }
   if (StringRef("{cc}").equals_lower(Constraint))
     return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
 
   // Use the default implementation in TargetLowering to convert the register
   // constraint into a member of a register class.
   std::pair<unsigned, const TargetRegisterClass *> Res;
   Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 
   // Not found as a standard register?
   if (!Res.second) {
     unsigned Size = Constraint.size();
     // Constraint text comes from the input, so go through unsigned char
     // before calling tolower(): passing a negative plain char is undefined.
     if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
         tolower(static_cast<unsigned char>(Constraint[1])) == 'v' &&
         Constraint[Size - 1] == '}') {
       int RegNo;
       bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
       if (!Failed && RegNo >= 0 && RegNo <= 31) {
         // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
         // By default we'll emit v0-v31 for this unless there's a modifier where
         // we'll emit the correct register as well.
         if (VT != MVT::Other && VT.getSizeInBits() == 64) {
           Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
           Res.second = &AArch64::FPR64RegClass;
         } else {
           Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
           Res.second = &AArch64::FPR128RegClass;
         }
       }
     }
   }
 
   return Res;
 }
 
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector.  If it is invalid, don't add anything to Ops.
 ///
 /// Only single-letter constraints are handled here:
 ///   'z'                 - a null constant, lowered to XZR (i64) or WZR;
 ///   'S'                 - a global address, block address or external
 ///                         symbol, lowered to its target form;
 ///   'I','J','K','L','M','N' - integer constants validated against the
 ///                         corresponding immediate encoding and emitted as
 ///                         an i64 target constant.
 /// Any other letter is passed to the generic TargetLowering implementation.
 void AArch64TargetLowering::LowerAsmOperandForConstraint(
     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
     SelectionDAG &DAG) const {
   SDValue Result;
 
   // Currently only support length 1 constraints.
   if (Constraint.length() != 1)
     return;
 
   char ConstraintLetter = Constraint[0];
   switch (ConstraintLetter) {
   default:
     break;
 
   // This set of constraints deal with valid constants for various instructions.
   // Validate and return a target constant for them if we can.
   case 'z': {
     // 'z' maps to xzr or wzr so it needs an input of 0.
     if (!isNullConstant(Op))
       return;
 
     if (Op.getValueType() == MVT::i64)
       Result = DAG.getRegister(AArch64::XZR, MVT::i64);
     else
       Result = DAG.getRegister(AArch64::WZR, MVT::i32);
     break;
   }
   case 'S': {
     // An absolute symbolic address or label reference.
     if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
       Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
                                           GA->getValueType(0));
     } else if (const BlockAddressSDNode *BA =
                    dyn_cast<BlockAddressSDNode>(Op)) {
       Result =
           DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
     } else if (const ExternalSymbolSDNode *ES =
                    dyn_cast<ExternalSymbolSDNode>(Op)) {
       Result =
           DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0));
     } else
       return;
     break;
   }
 
   case 'I':
   case 'J':
   case 'K':
   case 'L':
   case 'M':
   case 'N':
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
     if (!C)
       return;
 
     // Grab the value and do some validation.
     uint64_t CVal = C->getZExtValue();
     switch (ConstraintLetter) {
     // The I constraint applies only to simple ADD or SUB immediate operands:
     // i.e. 0 to 4095 with optional shift by 12
     // The J constraint applies only to ADD or SUB immediates that would be
     // valid when negated, i.e. if [an add pattern] were to be output as a SUB
     // instruction [or vice versa], in other words -1 to -4095 with optional
     // left shift by 12.
     case 'I':
       if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
         break;
       return;
     case 'J': {
       // Negate in unsigned arithmetic: negating the signed value directly
       // overflows (undefined behaviour) for INT64_MIN, whereas the modular
       // result is identical for every other input.
       uint64_t NVal = 0 - static_cast<uint64_t>(C->getSExtValue());
       if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
         CVal = C->getSExtValue();
         break;
       }
       return;
     }
     // The K and L constraints apply *only* to logical immediates, including
     // what used to be the MOVI alias for ORR (though the MOVI alias has now
     // been removed and MOV should be used). So these constraints have to
     // distinguish between bit patterns that are valid 32-bit or 64-bit
     // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
     // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
     // versa.
     case 'K':
       if (AArch64_AM::isLogicalImmediate(CVal, 32))
         break;
       return;
     case 'L':
       if (AArch64_AM::isLogicalImmediate(CVal, 64))
         break;
       return;
     // The M and N constraints are a superset of K and L respectively, for use
     // with the MOV (immediate) alias. As well as the logical immediates they
     // also match 32 or 64-bit immediates that can be loaded either using a
     // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
     // (M) or 64-bit 0x1234000000000000 (N) etc.
     // As a note some of this code is liberally stolen from the asm parser.
     case 'M': {
       if (!isUInt<32>(CVal))
         return;
       if (AArch64_AM::isLogicalImmediate(CVal, 32))
         break;
       if ((CVal & 0xFFFF) == CVal)
         break;
       if ((CVal & 0xFFFF0000ULL) == CVal)
         break;
       // The complement is taken at 32 bits and then zero-extended.
       uint64_t NCVal = ~(uint32_t)CVal;
       if ((NCVal & 0xFFFFULL) == NCVal)
         break;
       if ((NCVal & 0xFFFF0000ULL) == NCVal)
         break;
       return;
     }
     case 'N': {
       if (AArch64_AM::isLogicalImmediate(CVal, 64))
         break;
       if ((CVal & 0xFFFFULL) == CVal)
         break;
       if ((CVal & 0xFFFF0000ULL) == CVal)
         break;
       if ((CVal & 0xFFFF00000000ULL) == CVal)
         break;
       if ((CVal & 0xFFFF000000000000ULL) == CVal)
         break;
       uint64_t NCVal = ~CVal;
       if ((NCVal & 0xFFFFULL) == NCVal)
         break;
       if ((NCVal & 0xFFFF0000ULL) == NCVal)
         break;
       if ((NCVal & 0xFFFF00000000ULL) == NCVal)
         break;
       if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
         break;
       return;
     }
     default:
       return;
     }
 
     // All assembler immediates are 64-bit integers.
     Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
     break;
   }
 
   // A constraint handled above produced a value; otherwise defer to the
   // generic implementation.
   if (Result.getNode()) {
     Ops.push_back(Result);
     return;
   }
 
   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
 //===----------------------------------------------------------------------===//
 //                     AArch64 Advanced SIMD Support
 //===----------------------------------------------------------------------===//
 
 /// WidenVector - Take a value in the V64 register class and return the
 /// equivalent value in the V128 register class: a vector with twice as many
 /// elements whose low subvector (index 0) is the input and whose remaining
 /// lanes are undef.
 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
   const EVT NarrowVT = V64Reg.getValueType();
   const MVT WideVT =
       MVT::getVectorVT(NarrowVT.getVectorElementType().getSimpleVT(),
                        2 * NarrowVT.getVectorNumElements());
   SDLoc DL(V64Reg);
 
   return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, DAG.getUNDEF(WideVT),
                      V64Reg, DAG.getConstant(0, DL, MVT::i32));
 }
 
 /// getExtFactor - Return the position adjustment factor used when generating
 /// an "extract from vector registers" instruction: the size of one element
 /// of \p V's vector type, in bytes.
 static unsigned getExtFactor(SDValue &V) {
   const unsigned EltBits =
       V.getValueType().getVectorElementType().getSizeInBits();
   return EltBits / 8;
 }
 
 /// NarrowVector - Take a value in the V128 register class and return the
 /// equivalent value in the V64 register class: the dsub subregister, typed
 /// as a vector with half as many elements.
 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
   const EVT WideVT = V128Reg.getValueType();
   const MVT HalfVT =
       MVT::getVectorVT(WideVT.getVectorElementType().getSimpleVT(),
                        WideVT.getVectorNumElements() / 2);
   SDLoc DL(V128Reg);
 
   return DAG.getTargetExtractSubreg(AArch64::dsub, DL, HalfVT, V128Reg);
 }
 
 // Gather data to see if the operation can be modelled as a
 // shuffle in combination with VEXTs.
 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
                                                   SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
   LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
   unsigned NumElts = VT.getVectorNumElements();
 
   struct ShuffleSourceInfo {
     SDValue Vec;
     unsigned MinElt;
     unsigned MaxElt;
 
     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
     // be compatible with the shuffle we intend to construct. As a result
     // ShuffleVec will be some sliding window into the original Vec.
     SDValue ShuffleVec;
 
     // Code should guarantee that element i in Vec starts at element "WindowBase
     // + i * WindowScale in ShuffleVec".
     int WindowBase;
     int WindowScale;
 
     ShuffleSourceInfo(SDValue Vec)
       : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
           ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
 
     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
   };
 
   // First gather all vectors used as an immediate source for this BUILD_VECTOR
   // node.
   SmallVector<ShuffleSourceInfo, 2> Sources;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
     if (V.isUndef())
       continue;
     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
              !isa<ConstantSDNode>(V.getOperand(1))) {
       LLVM_DEBUG(
           dbgs() << "Reshuffle failed: "
                     "a shuffle can only come from building a vector from "
                     "various elements of other vectors, provided their "
                     "indices are constant\n");
       return SDValue();
     }
 
     // Add this element source to the list if it's not already there.
     SDValue SourceVec = V.getOperand(0);
     auto Source = find(Sources, SourceVec);
     if (Source == Sources.end())
       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
 
     // Update the minimum and maximum lane number seen.
     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
     Source->MinElt = std::min(Source->MinElt, EltNo);
     Source->MaxElt = std::max(Source->MaxElt, EltNo);
   }
 
   if (Sources.size() > 2) {
     LLVM_DEBUG(
         dbgs() << "Reshuffle failed: currently only do something sane when at "
                   "most two source vectors are involved\n");
     return SDValue();
   }
 
   // Find out the smallest element size among result and two sources, and use
   // it as element size to build the shuffle_vector.
   EVT SmallestEltTy = VT.getVectorElementType();
   for (auto &Source : Sources) {
     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
     if (SrcEltTy.bitsLT(SmallestEltTy)) {
       SmallestEltTy = SrcEltTy;
     }
   }
   unsigned ResMultiplier =
       VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
   NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
 
   // If the source vector is too wide or too narrow, we may nevertheless be able
   // to construct a compatible shuffle either by concatenating it with UNDEF or
   // extracting a suitable range of elements.
   for (auto &Src : Sources) {
     EVT SrcVT = Src.ShuffleVec.getValueType();
 
     if (SrcVT.getSizeInBits() == VT.getSizeInBits())
       continue;
 
     // This stage of the search produces a source with the same element type as
     // the original, but with a total width matching the BUILD_VECTOR output.
     EVT EltVT = SrcVT.getVectorElementType();
     unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
 
     if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
       assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
       // We can pad out the smaller vector for free, so if it's part of a
       // shuffle...
       Src.ShuffleVec =
           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
       continue;
     }
 
     assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
 
     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
       LLVM_DEBUG(
           dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
       return SDValue();
     }
 
     if (Src.MinElt >= NumSrcElts) {
       // The extraction can just take the second half
       Src.ShuffleVec =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                       DAG.getConstant(NumSrcElts, dl, MVT::i64));
       Src.WindowBase = -NumSrcElts;
     } else if (Src.MaxElt < NumSrcElts) {
       // The extraction can just take the first half
       Src.ShuffleVec =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                       DAG.getConstant(0, dl, MVT::i64));
     } else {
       // An actual VEXT is needed
       SDValue VEXTSrc1 =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                       DAG.getConstant(0, dl, MVT::i64));
       SDValue VEXTSrc2 =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                       DAG.getConstant(NumSrcElts, dl, MVT::i64));
       unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
 
       Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
                                    VEXTSrc2,
                                    DAG.getConstant(Imm, dl, MVT::i32));
       Src.WindowBase = -Src.MinElt;
     }
   }
 
   // Another possible incompatibility occurs from the vector element types. We
   // can fix this by bitcasting the source vectors to the same type we intend
   // for the shuffle.
   for (auto &Src : Sources) {
     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
     if (SrcEltTy == SmallestEltTy)
       continue;
     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
     Src.WindowBase *= Src.WindowScale;
   }
 
   // Final sanity check before we try to actually produce a shuffle.
   LLVM_DEBUG(for (auto Src
                   : Sources)
                  assert(Src.ShuffleVec.getValueType() == ShuffleVT););
 
   // The stars all align, our next step is to produce the mask for the shuffle.
   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
     SDValue Entry = Op.getOperand(i);
     if (Entry.isUndef())
       continue;
 
     auto Src = find(Sources, Entry.getOperand(0));
     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
 
     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
     // segment.
     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
     int BitsDefined =
         std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
     int LanesDefined = BitsDefined / BitsPerShuffleLane;
 
     // This source is expected to fill ResMultiplier lanes of the final shuffle,
     // starting at the appropriate offset.
     int *LaneMask = &Mask[i * ResMultiplier];
 
     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
     ExtractBase += NumElts * (Src - Sources.begin());
     for (int j = 0; j < LanesDefined; ++j)
       LaneMask[j] = ExtractBase + j;
   }
 
   // Final check before we try to produce nonsense...
   if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
     LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
     return SDValue();
   }
 
   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
   for (unsigned i = 0; i < Sources.size(); ++i)
     ShuffleOps[i] = Sources[i].ShuffleVec;
 
   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                          ShuffleOps[1], Mask);
   SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
 
   LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
              dbgs() << "Reshuffle, creating node: "; V.dump(););
 
   return V;
 }
 
 /// isSingletonEXTMask - Check whether an EXT instruction can implement the
 /// shuffle mask when both vector sources of the shuffle are the same.
 /// On success Imm holds the first mask index; every later defined index must
 /// follow it consecutively, wrapping back to lane zero after the last lane.
 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
   const unsigned LaneCount = VT.getVectorNumElements();
 
   // The first index anchors the whole pattern, so it must be defined.
   const int First = M[0];
   if (First < 0)
     return false;
 
   Imm = First;
 
   unsigned Want = Imm;
   for (unsigned Lane = 1; Lane < LaneCount; ++Lane) {
     // Step to the next expected index; after the last lane start over at 0.
     ++Want;
     if (Want == LaneCount)
       Want = 0;
 
     // UNDEF entries are compatible with anything.
     const int Got = M[Lane];
     if (Got >= 0 && static_cast<unsigned>(Got) != Want)
       return false;
   }
 
   return true;
 }
 
 /// isEXTMask - Check whether an EXT instruction can implement the shuffle mask
 /// when the vector sources of the shuffle are different.
 ///
 /// On success, Imm receives the index of the first extracted element and
 /// ReverseEXT is set to true when the two inputs have to be swapped. ReverseEXT
 /// is only ever set, never cleared, so the caller must initialise it to false.
 /// A mask without any defined element is rejected.
 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
                       unsigned &Imm) {
   // Look for the first non-undef element.
   const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
 
   // An all-undef mask has nothing to anchor the EXT index on; bail out rather
   // than dereferencing the end iterator below.
   if (FirstRealElt == M.end())
     return false;
 
   // Use an APInt that is just wide enough to index both inputs, so that the
   // expected element wraps around automatically when it overflows.
   unsigned NumElts = VT.getVectorNumElements();
   unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
   APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
   // The following shuffle indices must be the successive elements after the
   // first real element.
   const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
       [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
   if (FirstWrongElt != M.end())
     return false;
 
   // The index of an EXT is the first element if it is not UNDEF.
   // Watch out for the beginning UNDEFs. The EXT index should be the expected
   // value of the first element.  E.g.
   // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
   // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
   // ExpectedElt is the last mask index plus 1.
   Imm = ExpectedElt.getZExtValue();
 
   // There are two different cases that require reversing the input vectors.
   // For example, for vector <4 x i32> we have the following cases,
   // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
   // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
   // For both cases, we finally use mask <5, 6, 7, 0>, which requires
   // to reverse two input vectors.
   if (Imm < NumElts)
     ReverseEXT = true;
   else
     Imm -= NumElts;
 
   return true;
 }
 
 /// isREVMask - Return true if the shuffle mask reverses the order of the
 /// elements inside every BlockSize-bit block of the vector, i.e. if it
 /// corresponds to a REV instruction with that block size.
 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
   assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
          "Only possible block sizes for REV are: 16, 32, 64");
 
   const unsigned EltBits = VT.getScalarSizeInBits();
   if (EltBits == 64)
     return false;
 
   const unsigned LaneCount = VT.getVectorNumElements();
 
   // The first index normally names the last element of the first block. If it
   // is UNDEF, be optimistic and derive the block length from the sizes.
   const unsigned PerBlock = (M[0] < 0) ? BlockSize / EltBits : M[0] + 1;
 
   if (BlockSize <= EltBits || BlockSize != PerBlock * EltBits)
     return false;
 
   for (unsigned Lane = 0; Lane < LaneCount; ++Lane) {
     if (M[Lane] < 0)
       continue; // ignore UNDEF indices
 
     // Mirror the lane's position inside its own block.
     const unsigned InBlock = Lane % PerBlock;
     const unsigned Mirrored = (Lane - InBlock) + (PerBlock - 1 - InBlock);
     if ((unsigned)M[Lane] != Mirrored)
       return false;
   }
 
   return true;
 }
 
 /// isZIPMask - Check if a vector shuffle interleaves one half of the first
 /// input with the same half of the second input, e.g. <0, 4, 1, 5> or
 /// <2, 6, 3, 7> for four lanes. WhichResult is set to 0 when the mask starts
 /// at element 0 and to 1 otherwise. UNDEF indices match anything.
 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   unsigned NumElts = VT.getVectorNumElements();
   // Lanes are inspected in pairs. With an odd lane count the loop below would
   // read M[i + 1] past the end of the mask and, stepping by two, would never
   // reach its `i != NumElts` exit condition.
   if (NumElts % 2 != 0)
     return false;
   WhichResult = (M[0] == 0 ? 0 : 1);
   unsigned Idx = WhichResult * NumElts / 2;
   for (unsigned i = 0; i != NumElts; i += 2) {
     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
       return false;
     Idx += 1;
   }
 
   return true;
 }
 
 /// isUZPMask - Check if a vector shuffle picks every second element of the
 /// concatenated inputs, e.g. <0, 2, 4, 6> or <1, 3, 5, 7> for four lanes.
 /// WhichResult is set to 0 when the mask starts at element 0, else to 1.
 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   const unsigned LaneCount = VT.getVectorNumElements();
   WhichResult = (M[0] == 0) ? 0 : 1;
 
   // The expected index starts at WhichResult and advances by two per lane.
   unsigned Want = WhichResult;
   for (unsigned Lane = 0; Lane != LaneCount; ++Lane, Want += 2) {
     const int Got = M[Lane];
     // UNDEF indices are ignored.
     if (Got >= 0 && (unsigned)Got != Want)
       return false;
   }
 
   return true;
 }
 
 /// isTRNMask - Check if a vector shuffle pairs up the even (or the odd)
 /// elements of the two inputs, e.g. <0, 4, 2, 6> or <1, 5, 3, 7> for four
 /// lanes. WhichResult is set to 0 when the mask starts at element 0 and to 1
 /// otherwise. UNDEF indices match anything.
 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   unsigned NumElts = VT.getVectorNumElements();
   // Lanes are inspected in pairs; an odd lane count would make the last
   // iteration read M[i + 1] past the end of the mask.
   if (NumElts % 2 != 0)
     return false;
   WhichResult = (M[0] == 0 ? 0 : 1);
   for (unsigned i = 0; i < NumElts; i += 2) {
     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
       return false;
   }
   return true;
 }
 
 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
 /// WhichResult is set to 0 when the mask starts at element 0 and to 1
 /// otherwise. UNDEF indices match anything.
 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   unsigned NumElts = VT.getVectorNumElements();
   // Lanes are inspected in pairs. With an odd lane count the loop below would
   // read M[i + 1] past the end of the mask and, stepping by two, would never
   // reach its `i != NumElts` exit condition.
   if (NumElts % 2 != 0)
     return false;
   WhichResult = (M[0] == 0 ? 0 : 1);
   unsigned Idx = WhichResult * NumElts / 2;
   for (unsigned i = 0; i != NumElts; i += 2) {
     if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
         (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
       return false;
     Idx += 1;
   }
 
   return true;
 }
 
 /// isUZP_v_undef_Mask - Variant of isUZPMask for the canonical form of
 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef", where both halves
 /// of the mask repeat the same sequence.
 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   const unsigned HalfLanes = VT.getVectorNumElements() / 2;
   WhichResult = (M[0] == 0) ? 0 : 1;
 
   // Walk both halves in one pass; the expected index restarts at WhichResult
   // at the beginning of each half and grows by two per lane.
   for (unsigned Pos = 0, End = 2 * HalfLanes; Pos != End; ++Pos) {
     const unsigned Want = WhichResult + 2 * (Pos % HalfLanes);
     const int Got = M[Pos];
     // UNDEF indices are ignored.
     if (Got >= 0 && (unsigned)Got != Want)
       return false;
   }
 
   return true;
 }
 
 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
 /// WhichResult is set to 0 when the mask starts at element 0 and to 1
 /// otherwise. UNDEF indices match anything.
 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   unsigned NumElts = VT.getVectorNumElements();
   // Lanes are inspected in pairs; an odd lane count would make the last
   // iteration read M[i + 1] past the end of the mask.
   if (NumElts % 2 != 0)
     return false;
   WhichResult = (M[0] == 0 ? 0 : 1);
   for (unsigned i = 0; i < NumElts; i += 2) {
     if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
         (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
       return false;
   }
   return true;
 }
 
 /// isINSMask - Check whether the mask is an identity copy of one input except
 /// for exactly one lane. On success DstIsLeft tells whether the mostly-copied
 /// input is the first one, and Anomaly is the last lane that deviated from
 /// that input. Lanes equal to -1 count as matching both inputs.
 static bool isINSMask(ArrayRef<int> M, int NumInputElements,
                       bool &DstIsLeft, int &Anomaly) {
   if (static_cast<size_t>(NumInputElements) != M.size())
     return false;
 
   int LeftHits = 0, RightHits = 0;
   int LeftOddLane = -1, RightOddLane = -1;
 
   for (int Lane = 0; Lane < NumInputElements; ++Lane) {
     const int Idx = M[Lane];
     const bool IsUndef = (Idx == -1);
 
     // Identity with respect to the first input.
     if (IsUndef || Idx == Lane)
       ++LeftHits;
     else
       LeftOddLane = Lane;
 
     // Identity with respect to the second input.
     if (IsUndef || Idx == Lane + NumInputElements)
       ++RightHits;
     else
       RightOddLane = Lane;
   }
 
   // Exactly one lane may disagree; the first input takes precedence.
   const int Needed = NumInputElements - 1;
   if (LeftHits == Needed) {
     DstIsLeft = true;
     Anomaly = LeftOddLane;
     return true;
   }
   if (RightHits == Needed) {
     DstIsLeft = false;
     Anomaly = RightOddLane;
     return true;
   }
 
   return false;
 }
 
 /// isConcatMask - Check whether a 128-bit shuffle mask selects the low half of
 /// the first input followed by the low half of the second input. SplitLHS
 /// says whether the first input is as wide as the result, in which case the
 /// indices of the second input start one half-vector later.
 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
   if (VT.getSizeInBits() != 128)
     return false;
 
   const unsigned LaneCount = VT.getVectorNumElements();
   const int Half = LaneCount / 2;
   const int End = LaneCount;
   // Extra offset applied to the upper lanes when the first input is split.
   const int UpperBias = SplitLHS ? Half : 0;
 
   for (int I = 0; I != End; I++) {
     const int Want = (I < Half) ? I : I + UpperBias;
     if (Mask[I] != Want)
       return false;
   }
 
   return true;
 }
 
 /// tryFormConcatFromShuffle - If the shuffle just glues the low halves of its
 /// two inputs together, rewrite it as a CONCAT_VECTORS of those halves.
 /// Returns an empty SDValue when the pattern does not apply.
 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
   SDValue V0 = Op.getOperand(0);
   SDValue V1 = Op.getOperand(1);
   ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
 
   // Both inputs must share the element type of the result.
   EVT EltVT = VT.getVectorElementType();
   if (EltVT != V0.getValueType().getVectorElementType() ||
       EltVT != V1.getValueType().getVectorElementType())
     return SDValue();
 
   const bool SplitV0 = V0.getValueSizeInBits() == 128;
   const bool SplitV1 = V1.getValueSizeInBits() == 128;
 
   if (!isConcatMask(Mask, VT, SplitV0))
     return SDValue();
 
   // Type of one half of the result.
   EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                                 VT.getVectorNumElements() / 2);
 
   // A 128-bit input contributes only its low half.
   auto LowHalf = [&](SDValue Vec) {
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, Vec,
                        DAG.getConstant(0, DL, MVT::i64));
   };
   if (SplitV0)
     V0 = LowHalf(V0);
   if (SplitV1)
     V1 = LowHalf(V1);
 
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
 }
 
 /// GeneratePerfectShuffle - Expand one entry of the perfect-shuffle table into
 /// the DAG nodes that implement it. An entry encodes an operation number and
 /// two 13-bit operand IDs; each operand ID is itself an index into
 /// PerfectShuffleTable and is expanded recursively until an OP_COPY entry
 /// yields LHS or RHS.
 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                       SDValue RHS, SelectionDAG &DAG,
                                       const SDLoc &dl) {
   enum {
     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
     OP_VREV,
     OP_VDUP0,
     OP_VDUP1,
     OP_VDUP2,
     OP_VDUP3,
     OP_VEXT1,
     OP_VEXT2,
     OP_VEXT3,
     OP_VUZPL, // VUZP, left result
     OP_VUZPR, // VUZP, right result
     OP_VZIPL, // VZIP, left result
     OP_VZIPR, // VZIP, right result
     OP_VTRNL, // VTRN, left result
     OP_VTRNR  // VTRN, right result
   };
 
   // Decode the entry: operation in bits [29:26], operand IDs in bits [25:13]
   // and [12:0].
   const unsigned IDMask = (1 << 13) - 1;
   const unsigned OpNum = (PFEntry >> 26) & 0x0F;
   const unsigned LHSID = (PFEntry >> 13) & IDMask;
   const unsigned RHSID = (PFEntry >> 0) & IDMask;
 
   if (OpNum == OP_COPY) {
     // The ID is the mask written in base 9: <0,1,2,3> selects LHS, and the
     // only other legal copy, <4,5,6,7>, selects RHS.
     if (LHSID != (1 * 9 + 2) * 9 + 3) {
       assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
       return RHS;
     }
     return LHS;
   }
 
   // Build both operands before combining them.
   SDValue OpLHS =
       GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
   SDValue OpRHS =
       GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
   EVT VT = OpLHS.getValueType();
 
   if (OpNum == OP_VREV) {
     // VREV divides the vector in half and swaps within the half, so the REV
     // flavour depends on the element width:
     //   32-bit elements -> REV64, 16-bit -> REV32, 8-bit -> REV16.
     EVT EltTy = VT.getVectorElementType();
     unsigned RevOpc;
     if (EltTy == MVT::i32 || EltTy == MVT::f32) {
       RevOpc = AArch64ISD::REV64;
     } else if (EltTy == MVT::i16 || EltTy == MVT::f16) {
       RevOpc = AArch64ISD::REV32;
     } else {
       assert(VT.getVectorElementType() == MVT::i8);
       RevOpc = AArch64ISD::REV16;
     }
     return DAG.getNode(RevOpc, dl, VT, OpLHS);
   }
 
   if (OpNum >= OP_VDUP0 && OpNum <= OP_VDUP3) {
     // Broadcast lane (OpNum - OP_VDUP0) of the left operand.
     EVT EltTy = VT.getVectorElementType();
     unsigned Opcode;
     if (EltTy == MVT::i8)
       Opcode = AArch64ISD::DUPLANE8;
     else if (EltTy == MVT::i16 || EltTy == MVT::f16)
       Opcode = AArch64ISD::DUPLANE16;
     else if (EltTy == MVT::i32 || EltTy == MVT::f32)
       Opcode = AArch64ISD::DUPLANE32;
     else if (EltTy == MVT::i64 || EltTy == MVT::f64)
       Opcode = AArch64ISD::DUPLANE64;
     else
       llvm_unreachable("Invalid vector element type?");
 
     // A 64-bit source is widened before the lane is duplicated.
     if (VT.getSizeInBits() == 64)
       OpLHS = WidenVector(OpLHS, DAG);
     SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
     return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
   }
 
   if (OpNum >= OP_VEXT1 && OpNum <= OP_VEXT3) {
     // The element offset (1..3) is scaled by getExtFactor to form the EXT
     // immediate.
     unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
     return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
                        DAG.getConstant(Imm, dl, MVT::i32));
   }
 
   // Everything left is a two-operand permute that differs only in its opcode.
   unsigned PermuteOpc;
   switch (OpNum) {
   default:
     llvm_unreachable("Unknown shuffle opcode!");
   case OP_VUZPL:
     PermuteOpc = AArch64ISD::UZP1;
     break;
   case OP_VUZPR:
     PermuteOpc = AArch64ISD::UZP2;
     break;
   case OP_VZIPL:
     PermuteOpc = AArch64ISD::ZIP1;
     break;
   case OP_VZIPR:
     PermuteOpc = AArch64ISD::ZIP2;
     break;
   case OP_VTRNL:
     PermuteOpc = AArch64ISD::TRN1;
     break;
   case OP_VTRNR:
     PermuteOpc = AArch64ISD::TRN2;
     break;
   }
   return DAG.getNode(PermuteOpc, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
 }
 
 /// GenerateTBL - Lower a vector shuffle to a byte-wise table lookup using the
 /// aarch64_neon_tbl1 / aarch64_neon_tbl2 intrinsics.
 ///
 /// \param Op           the shuffle node; operands 0 and 1 are the two inputs.
 /// \param ShuffleMask  the element-level shuffle mask of \p Op.
 /// \param DAG          the DAG in which the new nodes are created.
 /// \returns the lookup result bitcast back to the value type of \p Op.
 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
                            SelectionDAG &DAG) {
   // Check to see if we can use the TBL instruction.
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
   SDLoc DL(Op);
 
   EVT EltVT = Op.getValueType().getVectorElementType();
   unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
 
   // Expand every element index of the mask into BytesPerElt consecutive byte
   // indices.
   // NOTE(review): for undef (negative) mask entries the unsigned Offset wraps
   // around; presumably this relies on such lanes being don't-care - confirm.
   SmallVector<SDValue, 8> TBLMask;
   for (int Val : ShuffleMask) {
     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
       unsigned Offset = Byte + Val * BytesPerElt;
       TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
     }
   }
 
   // The index vector has one byte per result byte: 8 lanes by default, 16 for
   // a 128-bit result.
   MVT IndexVT = MVT::v8i8;
   unsigned IndexLen = 8;
   if (Op.getValueSizeInBits() == 128) {
     IndexVT = MVT::v16i8;
     IndexLen = 16;
   }
 
   // View both inputs as byte vectors.
   SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
   SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
 
   SDValue Shuffle;
   if (V2.getNode()->isUndef()) {
     // Single-input shuffle: one table suffices. An 8-byte input is
     // concatenated with itself to form the v16i8 table operand.
     if (IndexLen == 8)
       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
     Shuffle = DAG.getNode(
         ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
         DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
         DAG.getBuildVector(IndexVT, DL,
                            makeArrayRef(TBLMask.data(), IndexLen)));
   } else {
     if (IndexLen == 8) {
       // Two 8-byte inputs are concatenated into one v16i8 table, so tbl1 is
       // still enough.
       V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
       Shuffle = DAG.getNode(
           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
           DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
           DAG.getBuildVector(IndexVT, DL,
                              makeArrayRef(TBLMask.data(), IndexLen)));
     } else {
       // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
       // cannot currently represent the register constraints on the input
       // table registers.
       //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
       //                   DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
       //                   IndexLen));
       // Two 16-byte inputs: use the two-table tbl2 intrinsic instead.
       Shuffle = DAG.getNode(
           ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
           DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
           V2Cst, DAG.getBuildVector(IndexVT, DL,
                                     makeArrayRef(TBLMask.data(), IndexLen)));
     }
   }
   return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
 }
 
 /// getDUPLANEOp - Map a vector element type to the DUPLANE opcode of the
 /// matching lane width (8, 16, 32 or 64 bits). Any other element type is
 /// treated as unreachable.
 static unsigned getDUPLANEOp(EVT EltType) {
   const bool Is8 = EltType == MVT::i8;
   const bool Is16 = EltType == MVT::i16 || EltType == MVT::f16;
   const bool Is32 = EltType == MVT::i32 || EltType == MVT::f32;
   const bool Is64 = EltType == MVT::i64 || EltType == MVT::f64;
 
   if (!(Is8 || Is16 || Is32 || Is64))
     llvm_unreachable("Invalid vector element type?");
 
   return Is8    ? AArch64ISD::DUPLANE8
          : Is16 ? AArch64ISD::DUPLANE16
          : Is32 ? AArch64ISD::DUPLANE32
                 : AArch64ISD::DUPLANE64;
 }
 
 /// Lower a VECTOR_SHUFFLE node to AArch64-specific DAG nodes.
 ///
 /// The mask is matched against a sequence of patterns, and the first one that
 /// matches wins: splat (DUP / DUPLANE), REV64/32/16, EXT, ZIP/UZP/TRN (two-input
 /// and single-input forms), concatenation, single-lane insert, and, for
 /// four-element vectors, the perfect-shuffle table. Anything else falls back
 /// to a TBL lookup.
 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                    SelectionDAG &DAG) const {
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
 
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
 
   // Convert shuffles that are directly supported on NEON to target-specific
   // DAG nodes, instead of keeping them as shuffles and matching them again
   // during code selection.  This is more efficient and avoids the possibility
   // of inconsistencies between legalization and selection.
   ArrayRef<int> ShuffleMask = SVN->getMask();
 
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
 
   if (SVN->isSplat()) {
     int Lane = SVN->getSplatIndex();
     // If this is undef splat, generate it via "just" vdup, if possible.
     if (Lane == -1)
       Lane = 0;
 
     // Splatting lane 0 of a SCALAR_TO_VECTOR is a DUP of the scalar itself.
     if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
       return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
                          V1.getOperand(0));
     // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
     // constant. If so, we can just reference the lane's definition directly.
     if (V1.getOpcode() == ISD::BUILD_VECTOR &&
         !isa<ConstantSDNode>(V1.getOperand(Lane)))
       return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
 
     // Otherwise, duplicate from the lane of the input vector.
     unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
 
     // SelectionDAGBuilder may have "helpfully" already extracted or concatenated
     // to make a vector of the same size as this SHUFFLE. We can ignore the
     // extract entirely, and canonicalise the concat using WidenVector.
     if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
       // Re-base the lane onto the vector the subvector was extracted from.
       Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
       V1 = V1.getOperand(0);
     } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
       // Idx selects the concat operand (0 or 1) that contains the lane.
       unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
       Lane -= Idx * VT.getVectorNumElements() / 2;
       V1 = WidenVector(V1.getOperand(Idx), DAG);
     } else if (VT.getSizeInBits() == 64)
       V1 = WidenVector(V1, DAG);
 
     return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
   }
 
   // Element reversal within 64-, 32- or 16-bit blocks.
   if (isREVMask(ShuffleMask, VT, 64))
     return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
   if (isREVMask(ShuffleMask, VT, 32))
     return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
   if (isREVMask(ShuffleMask, VT, 16))
     return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
 
   // EXT: a contiguous window over the two inputs (or over V1 twice when V2 is
   // undef). Imm is an element index and is scaled by getExtFactor.
   bool ReverseEXT = false;
   unsigned Imm;
   if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
     if (ReverseEXT)
       std::swap(V1, V2);
     Imm *= getExtFactor(V1);
     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
                        DAG.getConstant(Imm, dl, MVT::i32));
   } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
     Imm *= getExtFactor(V1);
     return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
                        DAG.getConstant(Imm, dl, MVT::i32));
   }
 
   // Two-input ZIP / UZP / TRN; WhichResult picks the "1" or "2" variant.
   unsigned WhichResult;
   if (isZIPMask(ShuffleMask, VT, WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
   }
   if (isUZPMask(ShuffleMask, VT, WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
   }
   if (isTRNMask(ShuffleMask, VT, WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
   }
 
   // The same three patterns in their single-input form: V1 is used for both
   // operands.
   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
   }
   if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
   }
   if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
     return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
   }
 
   if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
     return Concat;
 
   // Single-lane insert: the result equals one input except for lane Anomaly,
   // which is taken from the lane named by the mask.
   bool DstIsLeft;
   int Anomaly;
   int NumInputElements = V1.getValueType().getVectorNumElements();
   if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
     SDValue DstVec = DstIsLeft ? V1 : V2;
     SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
 
     SDValue SrcVec = V1;
     int SrcLane = ShuffleMask[Anomaly];
     if (SrcLane >= NumInputElements) {
       // The source lane lives in V2; make the index relative to it.
       SrcVec = V2;
       SrcLane -= VT.getVectorNumElements();
     }
     SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
 
     EVT ScalarVT = VT.getVectorElementType();
 
     // Integer elements narrower than 32 bits are extracted as i32.
     if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
       ScalarVT = MVT::i32;
 
     return DAG.getNode(
         ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
         DstLaneV);
   }
 
   // If the shuffle is not directly supported and it has 4 elements, use
   // the PerfectShuffle-generated table to synthesize it from other shuffles.
   unsigned NumElts = VT.getVectorNumElements();
   if (NumElts == 4) {
     // Each mask entry becomes one base-9 digit; 8 stands for UNDEF.
     unsigned PFIndexes[4];
     for (unsigned i = 0; i != 4; ++i) {
       if (ShuffleMask[i] < 0)
         PFIndexes[i] = 8;
       else
         PFIndexes[i] = ShuffleMask[i];
     }
 
     // Compute the index in the perfect shuffle table.
     unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
                             PFIndexes[2] * 9 + PFIndexes[3];
     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
     // The cost is stored in the bits above bit 29 of the entry.
     unsigned Cost = (PFEntry >> 30);
 
     // NOTE(review): if PFEntry is 32 bits wide, Cost is at most 3 and this
     // test always succeeds - confirm that is intended.
     if (Cost <= 4)
       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
   }
 
   // Last resort: a generic table lookup.
   return GenerateTBL(Op, ShuffleMask, DAG);
 }
 
 /// resolveBuildVector - If BVN is a constant splat, shift the splat pattern
 /// into CnstBits (and the matching SplatBits ^ SplatUndef pattern into
 /// UndefBits) once for every repetition that fits into the vector, and return
 /// true. Returns false, leaving both outputs untouched, when BVN is not a
 /// constant splat.
 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
                                APInt &UndefBits) {
   EVT VT = BVN->getValueType(0);
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
   if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs))
     return false;
 
   // Replicate the splat across the full width of the vector.
   for (unsigned Remaining = VT.getSizeInBits() / SplatBitSize; Remaining != 0;
        --Remaining) {
     CnstBits <<= SplatBitSize;
     UndefBits <<= SplatBitSize;
     CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
     UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
   }
 
   return true;
 }
 
 // Try to materialise Bits as a 64-bit splatted SIMD immediate (modified
 // immediate type 10). Returns the new node cast to Op's type, or an empty
 // SDValue when the constant does not fit this form.
 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
   // The high and low 64 bits must agree.
   if (Bits.getHiBits(64) != Bits.getLoBits(64))
     return SDValue();
 
   uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
   if (!AArch64_AM::isAdvSIMDModImmType10(Value))
     return SDValue();
   Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
 
   EVT VT = Op.getValueType();
   MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
 
   SDLoc dl(Op);
   SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
                             DAG.getConstant(Value, dl, MVT::i32));
   return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
 }
 
 // Try to materialise Bits as a 32-bit splatted SIMD immediate (modified
 // immediate types 1-4, i.e. shift amounts 0, 8, 16 and 24). When LHS is
 // given it is passed as an extra leading operand of the new node. Returns an
 // empty SDValue when the constant does not fit any of these forms.
 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                   const APInt &Bits,
                                   const SDValue *LHS = nullptr) {
   // The high and low 64 bits must agree.
   if (Bits.getHiBits(64) != Bits.getLoBits(64))
     return SDValue();
 
   uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
   uint64_t Shift;
 
   // Pick the first encoding that accepts the value.
   if (AArch64_AM::isAdvSIMDModImmType1(Value)) {
     Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
     Shift = 0;
   } else if (AArch64_AM::isAdvSIMDModImmType2(Value)) {
     Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
     Shift = 8;
   } else if (AArch64_AM::isAdvSIMDModImmType3(Value)) {
     Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
     Shift = 16;
   } else if (AArch64_AM::isAdvSIMDModImmType4(Value)) {
     Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
     Shift = 24;
   } else {
     return SDValue();
   }
 
   EVT VT = Op.getValueType();
   MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
 
   SDLoc dl(Op);
   SDValue Mov = LHS ? DAG.getNode(NewOp, dl, MovTy, *LHS,
                                   DAG.getConstant(Value, dl, MVT::i32),
                                   DAG.getConstant(Shift, dl, MVT::i32))
                     : DAG.getNode(NewOp, dl, MovTy,
                                   DAG.getConstant(Value, dl, MVT::i32),
                                   DAG.getConstant(Shift, dl, MVT::i32));
 
   return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
 }
 
 // Try to materialise Bits as a 16-bit splatted SIMD immediate (modified
 // immediate types 5 and 6, i.e. shift amounts 0 and 8). When LHS is given it
 // is passed as an extra leading operand of the new node. Returns an empty
 // SDValue when the constant does not fit either form.
 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                   const APInt &Bits,
                                   const SDValue *LHS = nullptr) {
   // The high and low 64 bits must agree.
   if (Bits.getHiBits(64) != Bits.getLoBits(64))
     return SDValue();
 
   uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
   uint64_t Shift;
 
   // Pick the first encoding that accepts the value.
   if (AArch64_AM::isAdvSIMDModImmType5(Value)) {
     Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
     Shift = 0;
   } else if (AArch64_AM::isAdvSIMDModImmType6(Value)) {
     Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
     Shift = 8;
   } else {
     return SDValue();
   }
 
   EVT VT = Op.getValueType();
   MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
 
   SDLoc dl(Op);
   SDValue Mov = LHS ? DAG.getNode(NewOp, dl, MovTy, *LHS,
                                   DAG.getConstant(Value, dl, MVT::i32),
                                   DAG.getConstant(Shift, dl, MVT::i32))
                     : DAG.getNode(NewOp, dl, MovTy,
                                   DAG.getConstant(Value, dl, MVT::i32),
                                   DAG.getConstant(Shift, dl, MVT::i32));
 
   return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
 }
 
 // Try to materialise Bits as a 32-bit splatted SIMD immediate with shifted
 // ones (modified immediate types 7 and 8, passed on with shift operand 264
 // and 272 respectively). Returns an empty SDValue when the constant does not
 // fit either form.
 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
                                     SelectionDAG &DAG, const APInt &Bits) {
   // The high and low 64 bits must agree.
   if (Bits.getHiBits(64) != Bits.getLoBits(64))
     return SDValue();
 
   uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
   uint64_t Shift;
 
   // Pick the first encoding that accepts the value.
   if (AArch64_AM::isAdvSIMDModImmType7(Value)) {
     Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
     Shift = 264;
   } else if (AArch64_AM::isAdvSIMDModImmType8(Value)) {
     Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
     Shift = 272;
   } else {
     return SDValue();
   }
 
   EVT VT = Op.getValueType();
   MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
 
   SDLoc dl(Op);
   SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
                             DAG.getConstant(Value, dl, MVT::i32),
                             DAG.getConstant(Shift, dl, MVT::i32));
   return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
 }
 
 // Try to materialize Bits as an 8-bit splatted SIMD immediate. On success
 // returns NewOp(imm) wrapped in an NVCAST to the type of Op; otherwise
 // returns an empty SDValue.
 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                  const APInt &Bits) {
   // The immediate must repeat identically in both 64-bit halves of Bits.
   if (Bits.getHiBits(64) != Bits.getLoBits(64))
     return SDValue();
 
   uint64_t Imm = Bits.zextOrTrunc(64).getZExtValue();
   if (!AArch64_AM::isAdvSIMDModImmType9(Imm))
     return SDValue();
   Imm = AArch64_AM::encodeAdvSIMDModImmType9(Imm);
 
   EVT ResVT = Op.getValueType();
   MVT MovTy = (ResVT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
   SDLoc DL(Op);
   SDValue Mov =
       DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Imm, DL, MVT::i32));
   return DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Mov);
 }
 
 // Try to materialize Bits as a floating-point splatted SIMD immediate. The
 // type-11 form is tried first for both vector widths; the type-12 form is
 // only considered for 128-bit vectors. On success returns NewOp(imm) wrapped
 // in an NVCAST to the type of Op; otherwise returns an empty SDValue.
 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
                                   const APInt &Bits) {
   // The immediate must repeat identically in both 64-bit halves of Bits.
   if (Bits.getHiBits(64) != Bits.getLoBits(64))
     return SDValue();
 
   uint64_t Imm = Bits.zextOrTrunc(64).getZExtValue();
   EVT ResVT = Op.getValueType();
   const bool Is128Bit = ResVT.getSizeInBits() == 128;
 
   MVT MovTy;
   if (AArch64_AM::isAdvSIMDModImmType11(Imm)) {
     Imm = AArch64_AM::encodeAdvSIMDModImmType11(Imm);
     MovTy = Is128Bit ? MVT::v4f32 : MVT::v2f32;
   } else if (Is128Bit && AArch64_AM::isAdvSIMDModImmType12(Imm)) {
     Imm = AArch64_AM::encodeAdvSIMDModImmType12(Imm);
     MovTy = MVT::v2f64;
   } else {
     return SDValue();
   }
 
   SDLoc DL(Op);
   SDValue Mov =
       DAG.getNode(NewOp, DL, MovTy, DAG.getConstant(Imm, DL, MVT::i32));
   return DAG.getNode(AArch64ISD::NVCAST, DL, ResVT, Mov);
 }
 
 // Lower a vector AND whose one operand is a constant BUILD_VECTOR to a BIC
 // with a vector immediate when the complemented constant is encodable.
 // Returns Op unchanged whenever no immediate form applies.
 SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
                                               SelectionDAG &DAG) const {
   // AND commutes, so accept the constant vector on either side; LHS always
   // ends up naming the other operand.
   SDValue LHS = Op.getOperand(0);
   auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
   if (!BVN) {
     LHS = Op.getOperand(1);
     BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
     if (!BVN)
       return Op;
   }
 
   EVT VT = Op.getValueType();
   APInt DefBits(VT.getSizeInBits(), 0);
   APInt UndefBits(VT.getSizeInBits(), 0);
   if (!resolveBuildVector(BVN, DefBits, UndefBits))
     return Op; // We can always fall back to a non-immediate AND.
 
   // Try the 32-bit immediate form first, then the 16-bit one.
   auto TryBIC = [&](const APInt &InvertedBits) -> SDValue {
     if (SDValue Res = tryAdvSIMDModImm32(AArch64ISD::BICi, Op, DAG,
                                          InvertedBits, &LHS))
       return Res;
     return tryAdvSIMDModImm16(AArch64ISD::BICi, Op, DAG, InvertedBits, &LHS);
   };
 
   // We only have a BIC vector immediate instruction, which is and-not, so the
   // masks are complemented before being matched.
   if (SDValue Res = TryBIC(~DefBits))
     return Res;
   if (SDValue Res = TryBIC(~UndefBits))
     return Res;
 
   // We can always fall back to a non-immediate AND.
   return Op;
 }
 
 // Quick check for a BUILD_VECTOR whose operands are all the very same
 // constant-integer node. On success the zero-extended value of that constant
 // is stored in ConstVal and true is returned; ConstVal is left untouched on
 // failure.
 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
                                      uint64_t &ConstVal) {
   auto *BV = dyn_cast<BuildVectorSDNode>(PotentialBVec);
   if (!BV)
     return false;
 
   auto *Splat = dyn_cast<ConstantSDNode>(BV->getOperand(0));
   if (!Splat)
     return false;
 
   // Every remaining lane has to be the identical node as lane 0.
   const unsigned LaneCount = BV->getValueType(0).getVectorNumElements();
   for (unsigned Lane = 1; Lane < LaneCount; ++Lane) {
     if (dyn_cast<ConstantSDNode>(BV->getOperand(Lane)) != Splat)
       return false;
   }
 
   ConstVal = Splat->getZExtValue();
   return true;
 }
 
 // Returns the intrinsic ID carried by an INTRINSIC_WO_CHAIN node, or
 // Intrinsic::not_intrinsic for any other opcode or an out-of-range ID.
 static unsigned getIntrinsicID(const SDNode *N) {
   if (N->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
     return Intrinsic::not_intrinsic;
 
   // Operand 0 of INTRINSIC_WO_CHAIN holds the intrinsic ID as a constant.
   unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
   if (IID >= Intrinsic::num_intrinsics)
     return Intrinsic::not_intrinsic;
   return IID;
 }
 
 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
 // BUILD_VECTOR with constant element C1, C2 is a constant, and C1 == ~C2.
 // Also, logical shift right -> sri, with the same structure.
 //
 // Returns the replacement INTRINSIC_WO_CHAIN node (aarch64_neon_vsli or
 // aarch64_neon_vsri) on success, or an empty SDValue when N does not match.
 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
 
   if (!VT.isVector())
     return SDValue();
 
   SDLoc DL(N);
 
   // Is the first op an AND?
   const SDValue And = N->getOperand(0);
   if (And.getOpcode() != ISD::AND)
     return SDValue();
 
   // Is the second op an shl or lshr?
   SDValue Shift = N->getOperand(1);
   // This will have been turned into: AArch64ISD::VSHL vector, #shift
   // or AArch64ISD::VLSHR vector, #shift
   unsigned ShiftOpc = Shift.getOpcode();
   if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
     return SDValue();
   bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
 
   // Is the shift amount constant?
   ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
   if (!C2node)
     return SDValue();
 
   // Is the and mask vector all constant?
   uint64_t C1;
   if (!isAllConstantBuildVector(And.getOperand(1), C1))
     return SDValue();
 
   // Is C1 == ~C2, taking into account how much one can shift elements of a
   // particular size?
   uint64_t C2 = C2node->getZExtValue();
   unsigned ElemSizeInBits = VT.getScalarSizeInBits();
   if (C2 > ElemSizeInBits)
     return SDValue();
   // Build the element mask in 64 bits. Shifting a plain `int` 1 by 32 or
   // more is undefined behaviour, and a 64-bit shift of a 64-bit value is
   // undefined as well, so the full-width case is handled explicitly.
   const uint64_t ElemMask = ElemSizeInBits >= 64
                                 ? ~uint64_t(0)
                                 : (uint64_t(1) << ElemSizeInBits) - 1;
   if ((C1 & ElemMask) != (~C2 & ElemMask))
     return SDValue();
 
   SDValue X = And.getOperand(0);
   SDValue Y = Shift.getOperand(0);
 
   // A left shift maps to the shift-left-insert intrinsic, a logical right
   // shift to the shift-right-insert one.
   unsigned Intrin =
       IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
   SDValue ResultSLI =
       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                   DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
                   Shift.getOperand(1));
 
   LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
   LLVM_DEBUG(N->dump(&DAG));
   LLVM_DEBUG(dbgs() << "into: \n");
   LLVM_DEBUG(ResultSLI->dump(&DAG));
 
   ++NumShiftInserts;
   return ResultSLI;
 }
 
 // Lower a vector OR. When EnableAArch64SlrGeneration is set, first try to
 // turn it into a shift-insert via tryLowerToSLI; otherwise, if one operand is
 // a constant BUILD_VECTOR, try an ORR with a vector immediate. Returns Op
 // unchanged whenever no special form applies.
 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
                                              SelectionDAG &DAG) const {
   // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
   if (EnableAArch64SlrGeneration)
     if (SDValue SLI = tryLowerToSLI(Op.getNode(), DAG))
       return SLI;
 
   // OR commutes, so accept the constant vector on either side; LHS always
   // ends up naming the other operand.
   SDValue LHS = Op.getOperand(0);
   auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
   if (!BVN) {
     LHS = Op.getOperand(1);
     BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
     if (!BVN)
       return Op;
   }
 
   EVT VT = Op.getValueType();
   APInt DefBits(VT.getSizeInBits(), 0);
   APInt UndefBits(VT.getSizeInBits(), 0);
   if (!resolveBuildVector(BVN, DefBits, UndefBits))
     return Op; // We can always fall back to a non-immediate OR.
 
   // Try the 32-bit immediate form first, then the 16-bit one.
   auto TryORR = [&](const APInt &ImmBits) -> SDValue {
     if (SDValue Res =
             tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG, ImmBits, &LHS))
       return Res;
     return tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG, ImmBits, &LHS);
   };
 
   if (SDValue Res = TryORR(DefBits))
     return Res;
   if (SDValue Res = TryORR(UndefBits))
     return Res;
 
   // We can always fall back to a non-immediate OR.
   return Op;
 }
 
 // Normalize the operands of a BUILD_VECTOR: every constant-integer lane is
 // truncated to the element width and re-emitted as an i32 constant. Vectors
 // with floating-point elements or elements wider than 16 bits are returned
 // untouched.
 static SDValue NormalizeBuildVector(SDValue Op,
                                     SelectionDAG &DAG) {
   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
 
   EVT VT = Op.getValueType();
   EVT EltTy = VT.getVectorElementType();
   if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
     return Op;
 
   SDLoc dl(Op);
   SmallVector<SDValue, 16> NewLanes;
   for (SDValue Elt : Op->ops()) {
     auto *CstElt = dyn_cast<ConstantSDNode>(Elt);
     if (!CstElt) {
       // Non-constant lanes are passed through as they are.
       NewLanes.push_back(Elt);
       continue;
     }
     // Building an APInt of the element width drops the high bits.
     APInt Truncated(EltTy.getSizeInBits(), CstElt->getZExtValue());
     NewLanes.push_back(
         DAG.getConstant(Truncated.getZExtValue(), dl, MVT::i32));
   }
   return DAG.getBuildVector(VT, dl, NewLanes);
 }
 
 // Try to materialize a constant BUILD_VECTOR with a single vector-immediate
 // node. The defined bits are tried first (plain, then complemented for the
 // MVNI forms), followed by the undef-derived bits in the same two steps.
 // Returns an empty SDValue if no immediate form matches.
 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   APInt DefBits(VT.getSizeInBits(), 0);
   APInt UndefBits(VT.getSizeInBits(), 0);
 
   auto *BVN = cast<BuildVectorSDNode>(Op.getNode());
   if (!resolveBuildVector(BVN, DefBits, UndefBits))
     return SDValue();
 
   // Move-immediate attempts, in priority order.
   auto TryMove = [&](const APInt &Bits) -> SDValue {
     if (SDValue R = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, Bits))
       return R;
     if (SDValue R = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, Bits))
       return R;
     if (SDValue R = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, Bits))
       return R;
     if (SDValue R = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, Bits))
       return R;
     if (SDValue R = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, Bits))
       return R;
     return tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, Bits);
   };
 
   // Move-inverted-immediate attempts; the caller passes complemented bits.
   auto TryMoveNot = [&](const APInt &InvBits) -> SDValue {
     if (SDValue R =
             tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, InvBits))
       return R;
     if (SDValue R =
             tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, InvBits))
       return R;
     return tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, InvBits);
   };
 
   if (SDValue R = TryMove(DefBits))
     return R;
   if (SDValue R = TryMoveNot(~DefBits))
     return R;
   if (SDValue R = TryMove(UndefBits))
     return R;
   if (SDValue R = TryMoveNot(~UndefBits))
     return R;
 
   return SDValue();
 }
 
 // Custom lowering for BUILD_VECTOR. The strategies below are tried in order:
 //   - integer splats of all-zeros / all-ones are returned unchanged;
 //   - fully constant vectors go through ConstantBuildVector;
 //   - all-undef vectors become UNDEF, low-element-only vectors become
 //     SCALAR_TO_VECTOR;
 //   - even/odd EXTRACT_VECTOR_ELT patterns from one vector become UZP1/UZP2;
 //   - splats become DUP / DUPLANE (FP constant splats are retried as integer
 //     vectors);
 //   - a single repeated constant is splatted and the non-constant lanes are
 //     inserted afterwards;
 //   - ReconstructShuffle is tried for vectors with at least four elements;
 //   - finally a chain of INSERT_VECTOR_ELT nodes is emitted.
 // An empty SDValue is returned when the default expansion should be used.
 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                  SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
 
   // Try to build a simple constant vector.
   Op = NormalizeBuildVector(Op, DAG);
   if (VT.isInteger()) {
     // Certain vector constants, used to express things like logical NOT and
     // arithmetic NEG, are passed through unmodified.  This allows special
     // patterns for these operations to match, which will lower these constants
     // to whatever is proven necessary.
     BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
     if (BVN->isConstant())
       if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
         unsigned BitSize = VT.getVectorElementType().getSizeInBits();
         APInt Val(BitSize,
                   Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
         if (Val.isNullValue() || Val.isAllOnesValue())
           return Op;
       }
   }
 
   // Single-instruction immediate forms take priority over everything else.
   if (SDValue V = ConstantBuildVector(Op, DAG))
     return V;
 
   // Scan through the operands to find some interesting properties we can
   // exploit:
   //   1) If only one value is used, we can use a DUP, or
   //   2) if only the low element is not undef, we can just insert that, or
   //   3) if only one constant value is used (w/ some non-constant lanes),
   //      we can splat the constant value into the whole vector then fill
   //      in the non-constant lanes.
   //   4) FIXME: If different constant values are used, but we can intelligently
   //             select the values we'll be overwriting for the non-constant
   //             lanes such that we can directly materialize the vector
   //             some other way (MOVI, e.g.), we can be sneaky.
   //   5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
   SDLoc dl(Op);
   unsigned NumElts = VT.getVectorNumElements();
   bool isOnlyLowElement = true;
   bool usesOnlyOneValue = true;
   bool usesOnlyOneConstantValue = true;
   bool isConstant = true;
   bool AllLanesExtractElt = true;
   unsigned NumConstantLanes = 0;
   SDValue Value;
   SDValue ConstantValue;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
     // Note: this check runs before the undef skip, so an undef lane also
     // clears AllLanesExtractElt.
     if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
       AllLanesExtractElt = false;
     if (V.isUndef())
       continue;
     if (i > 0)
       isOnlyLowElement = false;
     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
       isConstant = false;
 
     // Track the first constant lane and whether every other constant lane
     // matches it.
     if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
       ++NumConstantLanes;
       if (!ConstantValue.getNode())
         ConstantValue = V;
       else if (ConstantValue != V)
         usesOnlyOneConstantValue = false;
     }
 
     // Track the first non-undef lane and whether every other non-undef lane
     // matches it.
     if (!Value.getNode())
       Value = V;
     else if (V != Value)
       usesOnlyOneValue = false;
   }
 
   // Value is still empty only if every lane was undef.
   if (!Value.getNode()) {
     LLVM_DEBUG(
         dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
     return DAG.getUNDEF(VT);
   }
 
   if (isOnlyLowElement) {
     LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
                          "SCALAR_TO_VECTOR node\n");
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
   }
 
   if (AllLanesExtractElt) {
     SDNode *Vector = nullptr;
     bool Even = false;
     bool Odd = false;
     // Check whether the extract elements match the Even pattern <0,2,4,...> or
     // the Odd pattern <1,3,5,...>.
     for (unsigned i = 0; i < NumElts; ++i) {
       SDValue V = Op.getOperand(i);
       const SDNode *N = V.getNode();
       if (!isa<ConstantSDNode>(N->getOperand(1)))
         break;
       SDValue N0 = N->getOperand(0);
 
       // All elements are extracted from the same vector.
       if (!Vector) {
         Vector = N0.getNode();
         // Check that the type of EXTRACT_VECTOR_ELT matches the type of
         // BUILD_VECTOR.
         if (VT.getVectorElementType() !=
             N0.getValueType().getVectorElementType())
           break;
       } else if (Vector != N0.getNode()) {
         Odd = false;
         Even = false;
         break;
       }
 
       // Extracted values are either at Even indices <0,2,4,...> or at Odd
       // indices <1,3,5,...>.
       uint64_t Val = N->getConstantOperandVal(1);
       if (Val == 2 * i) {
         Even = true;
         continue;
       }
       if (Val - 1 == 2 * i) {
         Odd = true;
         continue;
       }
 
       // Something does not match: abort.
       Odd = false;
       Even = false;
       break;
     }
     if (Even || Odd) {
       // Split the source vector into its low and high halves (each of type VT)
       // and feed both to the unzip node.
       SDValue LHS =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
                       DAG.getConstant(0, dl, MVT::i64));
       SDValue RHS =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
                       DAG.getConstant(NumElts, dl, MVT::i64));
 
       // A mix of even and odd matches (both flags set) yields no UZP node and
       // falls through to the strategies below.
       if (Even && !Odd)
         return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
                            RHS);
       if (Odd && !Even)
         return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
                            RHS);
     }
   }
 
   // Use DUP for non-constant splats. For f32 constant splats, reduce to
   // i32 and try again.
   if (usesOnlyOneValue) {
     if (!isConstant) {
       if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
           Value.getValueType() != VT) {
         LLVM_DEBUG(
             dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
         return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
       }
 
       // This is actually a DUPLANExx operation, which keeps everything vectory.
 
       SDValue Lane = Value.getOperand(1);
       Value = Value.getOperand(0);
       if (Value.getValueSizeInBits() == 64) {
         LLVM_DEBUG(
             dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
                       "widening it\n");
         Value = WidenVector(Value, DAG);
       }
 
       unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
       return DAG.getNode(Opcode, dl, VT, Value, Lane);
     }
 
     if (VT.getVectorElementType().isFloatingPoint()) {
       SmallVector<SDValue, 8> Ops;
       EVT EltTy = VT.getVectorElementType();
       assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
               "Unsupported floating-point vector type");
       LLVM_DEBUG(
           dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
                     "BITCASTS, and try again\n");
       // Rebuild the vector with same-width integer lanes and lower that
       // instead; the result is bitcast back to VT if the retry succeeded.
       MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
       for (unsigned i = 0; i < NumElts; ++i)
         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
       LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
                  Val.dump(););
       Val = LowerBUILD_VECTOR(Val, DAG);
       if (Val.getNode())
         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
     }
   }
 
   // If there was only one constant value used and for more than one lane,
   // start by splatting that value, then replace the non-constant lanes. This
   // is better than the default, which will perform a separate initialization
   // for each lane.
   if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
     // Firstly, try to materialize the splat constant.
     SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
             Val = ConstantBuildVector(Vec, DAG);
     if (!Val) {
       // Otherwise, materialize the constant and splat it.
       Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
       DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
     }
 
     // Now insert the non-constant lanes.
     for (unsigned i = 0; i < NumElts; ++i) {
       SDValue V = Op.getOperand(i);
       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
       if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V))
         // Note that type legalization likely mucked about with the VT of the
         // source operand, so we may have to convert it here before inserting.
         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
     }
     return Val;
   }
 
   // This will generate a load from the constant pool.
   if (isConstant) {
     LLVM_DEBUG(
         dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
                   "expansion\n");
     return SDValue();
   }
 
   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
   if (NumElts >= 4) {
     if (SDValue shuffle = ReconstructShuffle(Op, DAG))
       return shuffle;
   }
 
   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
   // know the default expansion would otherwise fall back on something even
   // worse. For a vector with one or two non-undef values, that's
   // scalar_to_vector for the elements followed by a shuffle (provided the
   // shuffle is valid for the target) and materialization element by element
   // on the stack followed by a load for everything else.
   if (!isConstant && !usesOnlyOneValue) {
     LLVM_DEBUG(
         dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
                   "of INSERT_VECTOR_ELT\n");
 
     SDValue Vec = DAG.getUNDEF(VT);
     SDValue Op0 = Op.getOperand(0);
     unsigned i = 0;
 
     // Use SCALAR_TO_VECTOR for lane zero to
     // a) Avoid a RMW dependency on the full vector register, and
     // b) Allow the register coalescer to fold away the copy if the
     //    value is already in an S or D register, and we're forced to emit an
     //    INSERT_SUBREG that we can't fold anywhere.
     //
     // We also allow types like i8 and i16 which are illegal scalar but legal
     // vector element types. After type-legalization the inserted value is
     // extended (i32) and it is safe to cast them to the vector type by ignoring
     // the upper bits of the lowest lane (e.g. v8i8, v4i16).
     if (!Op0.isUndef()) {
       LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
       Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
       ++i;
     }
     LLVM_DEBUG(if (i < NumElts) dbgs()
                    << "Creating nodes for the other vector elements:\n";);
     // Insert the remaining defined lanes one by one; undef lanes are skipped.
     for (; i < NumElts; ++i) {
       SDValue V = Op.getOperand(i);
       if (V.isUndef())
         continue;
       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
     }
     return Vec;
   }
 
   LLVM_DEBUG(
       dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
                 "better alternative\n");
   return SDValue();
 }
 
 // Custom lowering for INSERT_VECTOR_ELT. 128-bit vector types are returned
 // unchanged; the listed 64-bit types are widened, the insertion is done on the
 // wide vector and the result is narrowed again. Any other type, or a
 // non-constant / out-of-range lane index, yields an empty SDValue.
 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
                                                       SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
 
   // Check for non-constant or out of range lane.
   EVT VecVT = Op.getOperand(0).getValueType();
   auto *LaneC = dyn_cast<ConstantSDNode>(Op.getOperand(2));
   if (!LaneC)
     return SDValue();
   if (LaneC->getZExtValue() >= VecVT.getVectorNumElements())
     return SDValue();
 
   // Insertion/extraction are legal for V128 types.
   const bool IsV128 = VecVT == MVT::v16i8 || VecVT == MVT::v8i16 ||
                       VecVT == MVT::v4i32 || VecVT == MVT::v2i64 ||
                       VecVT == MVT::v4f32 || VecVT == MVT::v2f64 ||
                       VecVT == MVT::v8f16;
   if (IsV128)
     return Op;
 
   const bool IsV64 = VecVT == MVT::v8i8 || VecVT == MVT::v4i16 ||
                      VecVT == MVT::v2i32 || VecVT == MVT::v1i64 ||
                      VecVT == MVT::v2f32 || VecVT == MVT::v4f16;
   if (!IsV64)
     return SDValue();
 
   // For V64 types, we perform insertion by expanding the value
   // to a V128 type and perform the insertion on that.
   SDLoc DL(Op);
   SDValue Widened = WidenVector(Op.getOperand(0), DAG);
   SDValue Inserted =
       DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Widened.getValueType(), Widened,
                   Op.getOperand(1), Op.getOperand(2));
 
   // Re-narrow the resultant vector.
   return NarrowVector(Inserted, DAG);
 }
 
 // Custom lowering for EXTRACT_VECTOR_ELT. 128-bit vector types are returned
 // unchanged; the listed 64-bit types are widened and the extraction is redone
 // on the wide vector, with i8/i16 results produced as i32. Any other type, or
 // a non-constant / out-of-range lane index, yields an empty SDValue.
 SDValue
 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
 
   // Check for non-constant or out of range lane.
   EVT VecVT = Op.getOperand(0).getValueType();
   auto *LaneC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   if (!LaneC)
     return SDValue();
   if (LaneC->getZExtValue() >= VecVT.getVectorNumElements())
     return SDValue();
 
   // Insertion/extraction are legal for V128 types.
   const bool IsV128 = VecVT == MVT::v16i8 || VecVT == MVT::v8i16 ||
                       VecVT == MVT::v4i32 || VecVT == MVT::v2i64 ||
                       VecVT == MVT::v4f32 || VecVT == MVT::v2f64 ||
                       VecVT == MVT::v8f16;
   if (IsV128)
     return Op;
 
   const bool IsV64 = VecVT == MVT::v8i8 || VecVT == MVT::v4i16 ||
                      VecVT == MVT::v2i32 || VecVT == MVT::v1i64 ||
                      VecVT == MVT::v2f32 || VecVT == MVT::v4f16;
   if (!IsV64)
     return SDValue();
 
   // For V64 types, we perform extraction by expanding the value
   // to a V128 type and perform the extraction on that.
   SDLoc DL(Op);
   SDValue Widened = WidenVector(Op.getOperand(0), DAG);
 
   // Sub-32-bit integer elements are extracted as i32.
   EVT ResultTy = Widened.getValueType().getVectorElementType();
   if (ResultTy == MVT::i16 || ResultTy == MVT::i8)
     ResultTy = MVT::i32;
 
   // For extractions, we just return the result directly.
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResultTy, Widened,
                      Op.getOperand(1));
 }
 
 // Custom lowering for EXTRACT_SUBVECTOR. The node is kept as-is when it starts
 // at index 0 or when it is a 64-bit result that begins at bit offset 64 of the
 // source; every other case returns an empty SDValue.
 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                       SelectionDAG &DAG) const {
   EVT SrcVT = Op.getOperand(0).getValueType();
   SDLoc dl(Op);
   // Just in case...
   if (!SrcVT.isVector())
     return SDValue();
 
   auto *IdxC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   if (!IdxC)
     return SDValue();
 
   unsigned StartIdx = IdxC->getZExtValue();
   unsigned ResultBits = Op.getValueSizeInBits();
 
   // Index 0 will get lowered to an appropriate EXTRACT_SUBREG in ISel.
   // Extracting the upper 64-bits of a 128-bit vector is matched directly.
   const bool IsUpperHalf =
       ResultBits == 64 && StartIdx * SrcVT.getScalarSizeInBits() == 64;
   if (StartIdx == 0 || IsUpperHalf)
     return Op;
 
   return SDValue();
 }
 
 // Returns true if shuffle mask M on type VT is one this target handles: either
 // a 4-element shuffle whose perfect-shuffle-table cost is at most 4, or a mask
 // recognised by one of the splat/REV/EXT/TRN/UZP/ZIP/INS/concat matchers.
 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   const bool IsFourElt64Or128 =
       VT.getVectorNumElements() == 4 &&
       (VT.is128BitVector() || VT.is64BitVector());
   if (IsFourElt64Or128) {
     // Compute the index in the perfect shuffle table: a base-9 number whose
     // digits are the mask entries, with negative (undef) entries mapped to 8.
     unsigned PFTableIndex = 0;
     for (unsigned Lane = 0; Lane != 4; ++Lane) {
       unsigned Digit = M[Lane] < 0 ? 8u : static_cast<unsigned>(M[Lane]);
       PFTableIndex = PFTableIndex * 9 + Digit;
     }
 
     // The cost lives in the top two bits of the table entry.
     unsigned Cost = PerfectShuffleTable[PFTableIndex] >> 30;
     if (Cost <= 4)
       return true;
   }
 
   // The matchers report details through out-parameters that are not needed
   // here.
   bool IgnoredFlag;
   int IgnoredLane;
   unsigned IgnoredImm;
 
   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT))
     return true;
   if (isREVMask(M, VT, 64) || isREVMask(M, VT, 32) || isREVMask(M, VT, 16))
     return true;
   if (isEXTMask(M, VT, IgnoredFlag, IgnoredImm))
     return true;
   // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
   if (isTRNMask(M, VT, IgnoredImm) || isUZPMask(M, VT, IgnoredImm) ||
       isZIPMask(M, VT, IgnoredImm))
     return true;
   if (isTRN_v_undef_Mask(M, VT, IgnoredImm) ||
       isUZP_v_undef_Mask(M, VT, IgnoredImm) ||
       isZIP_v_undef_Mask(M, VT, IgnoredImm))
     return true;
   if (isINSMask(M, VT.getVectorNumElements(), IgnoredFlag, IgnoredLane))
     return true;
   return isConcatMask(M, VT, VT.getSizeInBits() == 128);
 }
 
 /// getVShiftImm - Check whether Op (looking through bitcasts) is a
 /// build_vector usable as the immediate operand of a vector shift, i.e. a
 /// constant splat no wider than ElementBits. On success the sign-extended
 /// splat value is stored in Cnt and true is returned.
 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
   // Ignore bit_converts.
   for (; Op.getOpcode() == ISD::BITCAST; Op = Op.getOperand(0))
     ;
 
   auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
   if (!BVN)
     return false;
 
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
   if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
                             ElementBits))
     return false;
   if (SplatBitSize > ElementBits)
     return false;
 
   Cnt = SplatBits.getSExtValue();
   return true;
 }
 
 /// isVShiftLImm - Check whether Op is a valid build_vector immediate for a
 /// vector shift left, storing the shift amount in Cnt. Accepted ranges:
 ///   0 <= Value < ElementBits for a left shift; or
 ///   0 <= Value <= ElementBits for a long left shift.
 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
   assert(VT.isVector() && "vector shift count is not a vector type");
   const int64_t EltBits = VT.getScalarSizeInBits();
   if (!getVShiftImm(Op, EltBits, Cnt))
     return false;
   if (Cnt < 0)
     return false;
   // A long shift may use the full element width, hence the extra slack.
   const int64_t Effective = isLong ? Cnt - 1 : Cnt;
   return Effective < EltBits;
 }
 
 /// isVShiftRImm - Check whether Op is a valid build_vector immediate for a
 /// vector shift right, storing the shift amount in Cnt. Accepted ranges:
 ///   1 <= Value <= ElementBits for a right shift; or
 ///   1 <= Value <= ElementBits / 2 for a narrowing right shift.
 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
   assert(VT.isVector() && "vector shift count is not a vector type");
   const int64_t EltBits = VT.getScalarSizeInBits();
   if (!getVShiftImm(Op, EltBits, Cnt))
     return false;
   if (Cnt < 1)
     return false;
   const int64_t MaxShift = isNarrow ? EltBits / 2 : EltBits;
   return Cnt <= MaxShift;
 }
 
 // Lower vector SHL/SRA/SRL. Splat-constant shift amounts below the element
 // width become immediate shift nodes (VSHL/VASHR/VLSHR); everything else is
 // expressed with the ushl/sshl intrinsics, negating the amount for right
 // shifts. Nodes whose shift amount is not a vector are returned unchanged.
 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                                                       SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
   int64_t Cnt;
 
   if (!Op.getOperand(1).getValueType().isVector())
     return Op;
   unsigned EltSize = VT.getScalarSizeInBits();
 
   const unsigned ShiftOpcode = Op.getOpcode();
 
   if (ShiftOpcode == ISD::SHL) {
     // Left shift immediate
     if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
       return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
                          DAG.getConstant(Cnt, DL, MVT::i32));
     // Left shift register
     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                        DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
                                        MVT::i32),
                        Op.getOperand(0), Op.getOperand(1));
   }
 
   if (ShiftOpcode == ISD::SRA || ShiftOpcode == ISD::SRL) {
     const bool IsArithmetic = ShiftOpcode == ISD::SRA;
 
     // Right shift immediate
     if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
       unsigned ImmOpc = IsArithmetic ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
       return DAG.getNode(ImmOpc, DL, VT, Op.getOperand(0),
                          DAG.getConstant(Cnt, DL, MVT::i32));
     }
 
     // Right shift register.  Note, there is not a shift right register
     // instruction, but the shift left register instruction takes a signed
     // value, where negative numbers specify a right shift.
     unsigned IntrinsicOpc = IsArithmetic ? Intrinsic::aarch64_neon_sshl
                                          : Intrinsic::aarch64_neon_ushl;
     // negate the shift amount
     SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                        DAG.getConstant(IntrinsicOpc, DL, MVT::i32),
                        Op.getOperand(0), NegShift);
   }
 
   llvm_unreachable("unexpected shift opcode");
 }
 
 /// Emit the AArch64 vector-compare node for "LHS <CC> RHS", producing a mask
 /// of type \p VT (which must be exactly as wide as the operand type).
 ///
 /// When RHS is a build_vector that resolves to all-zero bits, the
 /// compare-against-zero node forms are used where one exists.  Conditions
 /// without a direct node are formed by swapping the operands of the mirrored
 /// comparison, and NE is emitted as an inverted EQ.  An empty SDValue is
 /// returned for conditions that are not handled; for floating-point operands
 /// that includes LT unless \p NoNans is set.
 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                     AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                     const SDLoc &dl, SelectionDAG &DAG) {
   const EVT OperandVT = LHS.getValueType();
   assert(VT.getSizeInBits() == OperandVT.getSizeInBits() &&
          "function only supposed to emit natural comparisons");
 
   // Work out whether RHS is a build_vector that resolves to all-zero bits.
   APInt ConstBits(VT.getSizeInBits(), 0);
   APInt UndefBits(VT.getSizeInBits(), 0);
   bool RHSIsZero = false;
   if (auto *BV = dyn_cast<BuildVectorSDNode>(RHS.getNode()))
     RHSIsZero = resolveBuildVector(BV, ConstBits, UndefBits) && ConstBits == 0;
 
   // Node builders for the three operand shapes used below.
   auto AgainstZero = [&](unsigned Opc) {
     return DAG.getNode(Opc, dl, VT, LHS);
   };
   auto InOrder = [&](unsigned Opc) {
     return DAG.getNode(Opc, dl, VT, LHS, RHS);
   };
   auto Swapped = [&](unsigned Opc) {
     return DAG.getNode(Opc, dl, VT, RHS, LHS);
   };
   // Pick the compare-with-zero opcode when RHS is zero, otherwise the
   // two-operand opcode with the operands in source / swapped order.
   auto ZeroOrInOrder = [&](unsigned ZeroOpc, unsigned Opc) {
     return RHSIsZero ? AgainstZero(ZeroOpc) : InOrder(Opc);
   };
   auto ZeroOrSwapped = [&](unsigned ZeroOpc, unsigned Opc) {
     return RHSIsZero ? AgainstZero(ZeroOpc) : Swapped(Opc);
   };
 
   if (OperandVT.getVectorElementType().isFloatingPoint()) {
     switch (CC) {
     case AArch64CC::NE:
       return DAG.getNode(
           AArch64ISD::NOT, dl, VT,
           ZeroOrInOrder(AArch64ISD::FCMEQz, AArch64ISD::FCMEQ));
     case AArch64CC::EQ:
       return ZeroOrInOrder(AArch64ISD::FCMEQz, AArch64ISD::FCMEQ);
     case AArch64CC::GE:
       return ZeroOrInOrder(AArch64ISD::FCMGEz, AArch64ISD::FCMGE);
     case AArch64CC::GT:
       return ZeroOrInOrder(AArch64ISD::FCMGTz, AArch64ISD::FCMGT);
     case AArch64CC::LS:
       return ZeroOrSwapped(AArch64ISD::FCMLEz, AArch64ISD::FCMGE);
     case AArch64CC::LT:
       // LT can only share the MI implementation when NaNs are ignored.
       if (!NoNans)
         return SDValue();
       LLVM_FALLTHROUGH;
     case AArch64CC::MI:
       return ZeroOrSwapped(AArch64ISD::FCMLTz, AArch64ISD::FCMGT);
     default:
       return SDValue();
     }
   }
 
   // Integer element types.
   switch (CC) {
   case AArch64CC::NE:
     return DAG.getNode(AArch64ISD::NOT, dl, VT,
                        ZeroOrInOrder(AArch64ISD::CMEQz, AArch64ISD::CMEQ));
   case AArch64CC::EQ:
     return ZeroOrInOrder(AArch64ISD::CMEQz, AArch64ISD::CMEQ);
   case AArch64CC::GE:
     return ZeroOrInOrder(AArch64ISD::CMGEz, AArch64ISD::CMGE);
   case AArch64CC::GT:
     return ZeroOrInOrder(AArch64ISD::CMGTz, AArch64ISD::CMGT);
   case AArch64CC::LE:
     return ZeroOrSwapped(AArch64ISD::CMLEz, AArch64ISD::CMGE);
   case AArch64CC::LT:
     return ZeroOrSwapped(AArch64ISD::CMLTz, AArch64ISD::CMGT);
   case AArch64CC::LS:
     return Swapped(AArch64ISD::CMHS);
   case AArch64CC::LO:
     return Swapped(AArch64ISD::CMHI);
   case AArch64CC::HI:
     return InOrder(AArch64ISD::CMHI);
   case AArch64CC::HS:
     return InOrder(AArch64ISD::CMHS);
   default:
     return SDValue();
   }
 }
 
 /// Custom-lower a vector SETCC node into AArch64 vector compare nodes.
 ///
 /// Integer-element compares map onto a single EmitVectorComparison call.
 /// Floating-point compares may need two comparisons OR'ed together and/or a
 /// final inversion, as reported by changeVectorFPCCToAArch64CC.  An empty
 /// SDValue is returned when a required comparison node cannot be emitted, and
 /// for f16 vectors other than the 4-element case without full FP16 support.
 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
                                            SelectionDAG &DAG) const {
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   // The comparison itself is emitted with the integer-element counterpart of
   // the operand type; the result is converted to Op's type afterwards.
   EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
   SDLoc dl(Op);
 
   if (LHS.getValueType().getVectorElementType().isInteger()) {
     assert(LHS.getValueType() == RHS.getValueType());
     AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
     // NoNans is irrelevant for integer comparisons, so pass false.
     SDValue Cmp =
         EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
     return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
   }
 
   const bool FullFP16 =
     static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasFullFP16();
 
   // Make v4f16 (only) fcmp operations utilise vector instructions.
   // v8f16 support will be a little more complicated.
   if (LHS.getValueType().getVectorElementType() == MVT::f16) {
     if (!FullFP16 && LHS.getValueType().getVectorNumElements() == 4) {
       // Widen both operands to v4f32 and compare in that type instead.
       LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
       RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
       // Redirect all users of the original node to a SETCC on the widened
       // operands; lowering then continues below with the widened operands.
       SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
       DAG.ReplaceAllUsesWith(Op, NewSetcc);
       CmpVT = MVT::v4i32;
     } else
       // Every other f16 vector (including all of them when FullFP16 is
       // available) is not handled here.
       return SDValue();
   }
 
   assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
          LHS.getValueType().getVectorElementType() == MVT::f64);
 
   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
   // clean.  Some of them require two branches to implement.
   AArch64CC::CondCode CC1, CC2;
   bool ShouldInvert;
   changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
 
   bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
   SDValue Cmp =
       EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
   if (!Cmp.getNode())
     return SDValue();
 
   // CC2 == AL means a single comparison suffices; otherwise the result is the
   // OR of the two comparisons.
   if (CC2 != AArch64CC::AL) {
     SDValue Cmp2 =
         EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
     if (!Cmp2.getNode())
       return SDValue();
 
     Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
   }
 
   // Convert the CmpVT mask to the result type requested by the SETCC node.
   Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
 
   if (ShouldInvert)
     return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
 
   return Cmp;
 }
 
 /// Apply the across-lanes node \p Op to the vector operand of \p ScalarOp
 /// (yielding a value of the vector's own type), then extract element 0 of
 /// that result as a value of ScalarOp's type.
 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
                                   SelectionDAG &DAG) {
   SDValue Vec = ScalarOp.getOperand(0);
   SDValue Reduced = DAG.getNode(Op, DL, Vec.getSimpleValueType(), Vec);
   SDValue LaneZero = DAG.getConstant(0, DL, MVT::i64);
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(),
                      Reduced, LaneZero);
 }
 
 /// Lower a VECREDUCE_* node.  Integer add/min/max reductions become the
 /// matching AArch64ISD across-lanes node followed by an extract of element 0;
 /// FMAX/FMIN become calls to the fmaxnmv/fminnmv NEON intrinsics and require
 /// the no-NaNs flag on the node.
 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
                                               SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
   // Wrap the vector operand in the given chainless NEON intrinsic.
   auto EmitFPReduction = [&](unsigned IntrinsicID) {
     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
                        DAG.getConstant(IntrinsicID, dl, MVT::i32),
                        Op.getOperand(0));
   };
 
   unsigned ReduceOpc;
   switch (Op.getOpcode()) {
   case ISD::VECREDUCE_FMAX:
     assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
     return EmitFPReduction(Intrinsic::aarch64_neon_fmaxnmv);
   case ISD::VECREDUCE_FMIN:
     assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
     return EmitFPReduction(Intrinsic::aarch64_neon_fminnmv);
   case ISD::VECREDUCE_ADD:
     ReduceOpc = AArch64ISD::UADDV;
     break;
   case ISD::VECREDUCE_SMAX:
     ReduceOpc = AArch64ISD::SMAXV;
     break;
   case ISD::VECREDUCE_SMIN:
     ReduceOpc = AArch64ISD::SMINV;
     break;
   case ISD::VECREDUCE_UMAX:
     ReduceOpc = AArch64ISD::UMAXV;
     break;
   case ISD::VECREDUCE_UMIN:
     ReduceOpc = AArch64ISD::UMINV;
     break;
   default:
     llvm_unreachable("Unhandled reduction");
   }
 
   return getReductionSDNode(ReduceOpc, dl, Op, DAG);
 }
 
 /// Rewrite an atomic load-sub as an atomic load-add of the negated operand.
 /// Only done when the subtarget has LSE; otherwise an empty SDValue is
 /// returned.
 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
                                                     SelectionDAG &DAG) const {
   const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
   if (!ST.hasLSE())
     return SDValue();
 
   // LSE has an atomic load-add instruction, but not a load-sub, so compute
   // (0 - operand) and add that instead.
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
   AtomicSDNode *Atomic = cast<AtomicSDNode>(Op.getNode());
   SDValue Zero = DAG.getConstant(0, dl, VT);
   SDValue Negated = DAG.getNode(ISD::SUB, dl, VT, Zero, Op.getOperand(2));
   return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, Atomic->getMemoryVT(),
                        Op.getOperand(0), Op.getOperand(1), Negated,
                        Atomic->getMemOperand());
 }
 
 /// Rewrite an atomic load-and as an atomic load-clear of the inverted
 /// operand.  Only done when the subtarget has LSE; otherwise an empty SDValue
 /// is returned.
 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
                                                     SelectionDAG &DAG) const {
   const auto &ST = static_cast<const AArch64Subtarget &>(DAG.getSubtarget());
   if (!ST.hasLSE())
     return SDValue();
 
   // LSE has an atomic load-clear instruction, but not a load-and, so clear
   // the bits that are *not* set in the operand (operand XOR all-ones).
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
   AtomicSDNode *Atomic = cast<AtomicSDNode>(Op.getNode());
   SDValue AllOnes = DAG.getConstant(-1ULL, dl, VT);
   SDValue Inverted = DAG.getNode(ISD::XOR, dl, VT, AllOnes, Op.getOperand(2));
   return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, Atomic->getMemoryVT(),
                        Op.getOperand(0), Op.getOperand(1), Inverted,
                        Atomic->getMemOperand());
 }
 
 /// Emit the call to the "__chkstk" stack probe for a dynamic allocation.
 ///
 /// The requested size is shifted right by 4 and passed in X15.  On return
 /// \p Size is replaced by that shifted value shifted left by 4 again, and the
 /// chain result of the call node is returned.
 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
     SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   EVT PointerVT = getPointerTy(DAG.getDataLayout());
   SDValue ProbeFn = DAG.getTargetExternalSymbol("__chkstk", PointerVT, 0);
 
   const uint32_t *PreservedMask =
       Subtarget->getRegisterInfo()->getWindowsStackProbePreservedMask();
 
   // Scale the byte count down by 16 and hand it to the probe in X15.
   SDValue ScaledSize = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
                                    DAG.getConstant(4, dl, MVT::i64));
   SDValue CopyToX15 =
       DAG.getCopyToReg(Chain, dl, AArch64::X15, ScaledSize, SDValue());
   // The call is glued to the register copy through the copy's second result.
   SDValue Call =
       DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
                   CopyToX15, ProbeFn, DAG.getRegister(AArch64::X15, MVT::i64),
                   DAG.getRegisterMask(PreservedMask), CopyToX15.getValue(1));
   // To match the actual intent better, we should read the output from X15 here
   // again (instead of potentially spilling it to the stack), but rereading Size
   // from X15 here doesn't work at -O0, since it thinks that X15 is undefined
   // here.
 
   // Scale back up to bytes for the caller's stack-pointer adjustment.
   Size = DAG.getNode(ISD::SHL, dl, MVT::i64, ScaledSize,
                      DAG.getConstant(4, dl, MVT::i64));
   return Call;
 }
 
 /// Lower DYNAMIC_STACKALLOC for Windows targets.
 ///
 /// The new stack pointer is (SP - Size), masked with -Align when Align is
 /// non-zero, and is written back to SP.  Unless the function carries the
 /// "no-stack-arg-probe" attribute, the adjustment is preceded by a stack
 /// probe call and the whole sequence is bracketed by CALLSEQ_START /
 /// CALLSEQ_END.  Returns the merged {new SP, chain} pair.
 SDValue
 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                SelectionDAG &DAG) const {
   assert(Subtarget->isTargetWindows() &&
          "Only Windows alloca probing supported");
   SDLoc dl(Op);
 
   // Operands: chain, allocation size, alignment constant.
   SDValue Chain = Op.getOperand(0);
   SDValue Size = Op.getOperand(1);
   const unsigned Align =
       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
   const EVT VT = Op.getNode()->getValueType(0);
 
   // Read SP, subtract Size, optionally align, and write the result back to
   // SP.  Threads Chain through the register copies and returns the new SP.
   auto EmitSPAdjust = [&]() -> SDValue {
     SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
     Chain = SP.getValue(1);
     SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
     if (Align)
       SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                        DAG.getConstant(-(uint64_t)Align, dl, VT));
     Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
     return SP;
   };
 
   const bool SkipProbe =
       DAG.getMachineFunction().getFunction().hasFnAttribute(
           "no-stack-arg-probe");
 
   if (!SkipProbe) {
     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
     // The probe helper also rewrites Size.
     Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
   }
 
   SDValue NewSP = EmitSPAdjust();
 
   if (!SkipProbe)
     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
                                DAG.getIntPtrConstant(0, dl, true), SDValue(),
                                dl);
 
   SDValue Results[2] = {NewSP, Chain};
   return DAG.getMergeValues(Results, dl);
 }
 
 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
 /// specified in the intrinsic calls.
 ///
 /// Also describes the exclusive load/store intrinsics (ldxr/ldaxr/stxr/stlxr
 /// and their pair forms).  On a match, \p Info is filled in (opcode, memory
 /// type, pointer operand, offset, alignment, flags) and true is returned;
 /// for any other intrinsic \p Info is left untouched and false is returned.
 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                                const CallInst &I,
                                                MachineFunction &MF,
                                                unsigned Intrinsic) const {
   auto &DL = I.getModule()->getDataLayout();
   switch (Intrinsic) {
   case Intrinsic::aarch64_neon_ld2:
   case Intrinsic::aarch64_neon_ld3:
   case Intrinsic::aarch64_neon_ld4:
   case Intrinsic::aarch64_neon_ld1x2:
   case Intrinsic::aarch64_neon_ld1x3:
   case Intrinsic::aarch64_neon_ld1x4:
   case Intrinsic::aarch64_neon_ld2lane:
   case Intrinsic::aarch64_neon_ld3lane:
   case Intrinsic::aarch64_neon_ld4lane:
   case Intrinsic::aarch64_neon_ld2r:
   case Intrinsic::aarch64_neon_ld3r:
   case Intrinsic::aarch64_neon_ld4r: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     // Conservatively set memVT to the entire set of vectors loaded, expressed
     // as a vector of i64 covering the full size of the call's result type.
     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
     // The address is always the last argument.
     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
     Info.offset = 0;
     Info.align = 0;
     // volatile loads with NEON intrinsics not supported
     Info.flags = MachineMemOperand::MOLoad;
     return true;
   }
   case Intrinsic::aarch64_neon_st2:
   case Intrinsic::aarch64_neon_st3:
   case Intrinsic::aarch64_neon_st4:
   case Intrinsic::aarch64_neon_st1x2:
   case Intrinsic::aarch64_neon_st1x3:
   case Intrinsic::aarch64_neon_st1x4:
   case Intrinsic::aarch64_neon_st2lane:
   case Intrinsic::aarch64_neon_st3lane:
   case Intrinsic::aarch64_neon_st4lane: {
     Info.opc = ISD::INTRINSIC_VOID;
     // Conservatively set memVT to the entire set of vectors stored.  The
     // stored vectors are the leading arguments (the address comes last), so
     // the scan has to start at argument 0; starting at 1 would leave the
     // first vector out and understate the size of the access.  The scan stops
     // at the first non-vector argument (the lane index or the pointer).
     unsigned NumElts = 0;
     for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
       Type *ArgTy = I.getArgOperand(ArgI)->getType();
       if (!ArgTy->isVectorTy())
         break;
       NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
     }
     Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
     Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
     Info.offset = 0;
     Info.align = 0;
     // volatile stores with NEON intrinsics not supported
     Info.flags = MachineMemOperand::MOStore;
     return true;
   }
   case Intrinsic::aarch64_ldaxr:
   case Intrinsic::aarch64_ldxr: {
     // Exclusive load: the accessed type is the pointee of argument 0.
     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(PtrTy->getElementType());
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
     return true;
   }
   case Intrinsic::aarch64_stlxr:
   case Intrinsic::aarch64_stxr: {
     // Exclusive store: the address is argument 1.
     PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(PtrTy->getElementType());
     Info.ptrVal = I.getArgOperand(1);
     Info.offset = 0;
     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
     return true;
   }
   case Intrinsic::aarch64_ldaxp:
   case Intrinsic::aarch64_ldxp:
     // Exclusive pair load: described as one 16-byte-aligned i128 access.
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::i128;
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
     Info.align = 16;
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
     return true;
   case Intrinsic::aarch64_stlxp:
   case Intrinsic::aarch64_stxp:
     // Exclusive pair store: the address is argument 2.
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::i128;
     Info.ptrVal = I.getArgOperand(2);
     Info.offset = 0;
     Info.align = 16;
     Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
     return true;
   default:
     break;
   }
 
   return false;
 }
 
 /// Decide whether narrowing \p Load is worthwhile.
 ///
 /// Narrowing is always accepted for extending loads.  For a plain load it is
 /// rejected only when the address has the form (add base, (shl x, C)) with a
 /// single-use shift whose amount C equals log2 of the number of bytes
 /// currently loaded, because narrowing would break that match.
 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
                                                   ISD::LoadExtType ExtTy,
                                                   EVT NewVT) const {
   // If we're reducing the load width in order to avoid having to use an extra
   // instruction to do extension then it's probably a good idea.
   if (ExtTy != ISD::NON_EXTLOAD)
     return true;
   // Don't reduce load width if it would prevent us from combining a shift into
   // the offset.
   // The node must be a memory node; cast<> checks that in asserts builds
   // instead of a dyn_cast<> followed by a separate assert on the result.
   MemSDNode *Mem = cast<MemSDNode>(Load);
   const SDValue &Base = Mem->getBasePtr();
   if (Base.getOpcode() == ISD::ADD &&
       Base.getOperand(1).getOpcode() == ISD::SHL &&
       Base.getOperand(1).hasOneUse() &&
       Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
     // The shift can be combined if it matches the size of the value being
     // loaded (and so reducing the width would make it not match).
     uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
     uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
     if (ShiftAmount == Log2_32(LoadBytes))
       return false;
   }
   // We have no reason to disallow reducing the load width, so allow it.
   return true;
 }
 
 // Truncation between integer IR types is reported as free whenever the source
 // type is strictly wider than the destination (e.g. 64-bit GPR to 32-bit GPR).
 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
   return BothIntegers &&
          Ty1->getPrimitiveSizeInBits() > Ty2->getPrimitiveSizeInBits();
 }
 // EVT flavour: free only for scalar integer types where the source is strictly
 // wider than the destination.
 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   const bool BothScalarInts = !VT1.isVector() && !VT2.isVector() &&
                               VT1.isInteger() && VT2.isInteger();
   return BothScalarInts && VT1.getSizeInBits() > VT2.getSizeInBits();
 }
 
 /// Check if it is profitable to hoist instruction in then/else to if.
 ///
 /// Hoisting is reported as unprofitable only for a single-use fmul whose user
 /// is an fadd/fsub, when FMA is faster than fmul+fadd and legal or custom for
 /// the type, and fusion is permitted (fast FP-op fusion or unsafe FP math):
 /// in that case the pair can become an FMSUB/FMADD, which we prefer.
 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
   // Only a single-use fmul is a candidate for fusion.
   if (I->getOpcode() != Instruction::FMul || !I->hasOneUse())
     return true;
 
   Instruction *User = I->user_back();
   if (User) {
     const unsigned UserOpc = User->getOpcode();
     if (UserOpc != Instruction::FSub && UserOpc != Instruction::FAdd)
       return true;
   }
 
   const DataLayout &DL = I->getModule()->getDataLayout();
   EVT VT = getValueType(DL, User->getOperand(0)->getType());
   if (!isFMAFasterThanFMulAndFAdd(VT) ||
       !isOperationLegalOrCustom(ISD::FMA, VT))
     return true;
 
   const TargetOptions &Options = getTargetMachine().Options;
   const bool FusionAllowed =
       Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath;
   return !FusionAllowed;
 }
 
 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
 // 64-bit GPR, so only the i32 -> i64 zero-extension is reported as free.
 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
   if (Ty1->isIntegerTy() && Ty2->isIntegerTy())
     return Ty1->getPrimitiveSizeInBits() == 32 &&
            Ty2->getPrimitiveSizeInBits() == 64;
   return false;
 }
 // EVT flavour: free only for a scalar integer extension from 32 to 64 bits.
 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
   const bool BothScalarInts = !VT1.isVector() && !VT2.isVector() &&
                               VT1.isInteger() && VT2.isInteger();
   if (!BothScalarInts)
     return false;
   return VT1.getSizeInBits() == 32 && VT2.getSizeInBits() == 64;
 }
 
 // SDValue flavour: in addition to the type-based rule, zero-extending the
 // result of a LOAD node is free when both types are simple scalar integers
 // and the loaded type is at most 32 bits wide.
 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   const EVT SrcVT = Val.getValueType();
   if (isZExtFree(SrcVT, VT2))
     return true;
 
   if (Val.getOpcode() != ISD::LOAD)
     return false;
 
   auto IsSimpleScalarInt = [](EVT VT) {
     return VT.isSimple() && !VT.isVector() && VT.isInteger();
   };
 
   // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
   return IsSimpleScalarInt(SrcVT) && IsSimpleScalarInt(VT2) &&
          SrcVT.getSizeInBits() <= 32;
 }
 
 /// Return true if the extension \p Ext is free for every one of its uses.
 ///
 /// FP extensions and vector extensions are never free.  Otherwise each use
 /// must be a shift-left by a constant, a GEP index whose scaling corresponds
 /// to a shift amount between 1 and 4, or a trunc back to the extension's
 /// source type; any other user makes the extension non-free.
 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
   // FP extensions and vector types are not free.
   if (isa<FPExtInst>(Ext) || Ext->getType()->isVectorTy())
     return false;
 
   for (const Use &U : Ext->uses()) {
     // The extension is free if we can fold it with a left shift in an
     // addressing mode or an arithmetic operation: add, sub, and cmp.
     const Instruction *UserInst = cast<Instruction>(U.getUser());
     const unsigned UserOpc = UserInst->getOpcode();
 
     if (UserOpc == Instruction::Shl) {
       // Only a constant shift amount can be folded.
       if (!isa<ConstantInt>(UserInst->getOperand(1)))
         return false;
     } else if (UserOpc == Instruction::GetElementPtr) {
       // Find the type indexed by the operand that uses the extension.
       gep_type_iterator IndexedTyIt = gep_type_begin(UserInst);
       auto &DL = Ext->getModule()->getDataLayout();
       std::advance(IndexedTyIt, U.getOperandNo()-1);
       Type *IdxTy = IndexedTyIt.getIndexedType();
       // This extension will end up with a shift because of the scaling factor.
       // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
       // Get the shift amount based on the scaling factor:
       // log2(sizeof(IdxTy)) - log2(8).
       uint64_t ShiftAmt =
           countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
       // Is the constant foldable in the shift of the addressing mode?
       // I.e., shift amount is between 1 and 4 inclusive.
       const bool Foldable = ShiftAmt >= 1 && ShiftAmt <= 4;
       if (!Foldable)
         return false;
     } else if (UserOpc == Instruction::Trunc) {
       // trunc(sext ty1 to ty2) to ty1 is a noop; any other trunc is not free.
       if (UserInst->getType() != Ext->getOperand(0)->getType())
         return false;
     } else {
       return false;
     }
 
     // At this point we can use the bfm family, so this extension is free
     // for that use.
   }
   return true;
 }
 
 /// Report whether \p LoadedType can use a paired load: it must be a simple
 /// integer or floating-point type of 32 or 64 bits.  Once the type check
 /// passes, \p RequiredAligment is set to 0 (regardless of the size outcome).
 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
                                           unsigned &RequiredAligment) const {
   const bool Supported =
       LoadedType.isSimple() &&
       (LoadedType.isInteger() || LoadedType.isFloatingPoint());
   if (!Supported)
     return false;
 
   // Cyclone supports unaligned accesses.
   RequiredAligment = 0;
 
   switch (LoadedType.getSizeInBits()) {
   case 32:
   case 64:
     return true;
   default:
     return false;
   }
 }
 
 /// A helper function for determining the number of interleaved accesses we
 /// will generate when lowering accesses of the given type: the vector size in
 /// bits divided by 128, rounded up.
 unsigned
 AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                                  const DataLayout &DL) const {
   const auto SizeInBits = DL.getTypeSizeInBits(VecTy);
   return (SizeInBits + 127) / 128;
 }
 
 /// Return the target-specific memory-operand flags for \p I: the
 /// strided-access flag when compiling for Falkor and the instruction carries
 /// the FALKOR_STRIDED_ACCESS_MD metadata, otherwise no flags.
 MachineMemOperand::Flags
 AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
   const bool OnFalkor =
       Subtarget->getProcFamily() == AArch64Subtarget::Falkor;
   if (OnFalkor && I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
     return MOStridedAccess;
   return MachineMemOperand::MONone;
 }
 
 /// Return true if \p VecTy can be used for an interleaved access: it needs at
 /// least two elements, an element size of 8, 16, 32 or 64 bits, and a total
 /// size of either 64 bits or a multiple of 128 bits.
 bool AArch64TargetLowering::isLegalInterleavedAccessType(
     VectorType *VecTy, const DataLayout &DL) const {
 
   // Ensure the number of vector elements is greater than 1.
   if (VecTy->getNumElements() < 2)
     return false;
 
   // Ensure the element type is legal.
   const unsigned EltBits = DL.getTypeSizeInBits(VecTy->getElementType());
   switch (EltBits) {
   case 8:
   case 16:
   case 32:
   case 64:
     break;
   default:
     return false;
   }
 
   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
   // 128 will be split into multiple interleaved accesses.
   const unsigned TotalBits = DL.getTypeSizeInBits(VecTy);
   return TotalBits == 64 || TotalBits % 128 == 0;
 }
 
 /// Lower an interleaved load into a ldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
 ///
 ///      Into:
 ///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
 ///        %vec0 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 0
 ///        %vec1 = extractvalue { <4 x i32>, <4 x i32> } %ld2, 1
 ///
 /// \p Shuffles are the de-interleaving shufflevectors fed by \p LI and
 /// \p Indices gives, for each of them, which field of the ldN result it
 /// corresponds to.  Returns false (emitting nothing) when NEON is unavailable
 /// or the shuffle result type is not a legal interleaved access type;
 /// otherwise the new IR is inserted before \p LI, every shuffle's uses are
 /// replaced, and true is returned.  The original load and shuffles are not
 /// erased here.
 bool AArch64TargetLowering::lowerInterleavedLoad(
     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
   assert(!Shuffles.empty() && "Empty shufflevector input");
   assert(Shuffles.size() == Indices.size() &&
          "Unmatched number of shufflevectors and indices");
 
   const DataLayout &DL = LI->getModule()->getDataLayout();
 
   // The type of one de-interleaved field, taken from the first shuffle.
   VectorType *VecTy = Shuffles[0]->getType();
 
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
     return false;
 
   unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
 
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   Type *EltTy = VecTy->getVectorElementType();
   if (EltTy->isPointerTy())
     VecTy =
         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
 
   // All new instructions are inserted immediately before the original load.
   IRBuilder<> Builder(LI);
 
   // The base address of the load.
   Value *BaseAddr = LI->getPointerOperand();
 
   if (NumLoads > 1) {
     // If we're going to generate more than one load, reset the sub-vector type
     // to something legal.
     VecTy = VectorType::get(VecTy->getVectorElementType(),
                             VecTy->getVectorNumElements() / NumLoads);
 
     // We will compute the pointer operand of each load from the original base
     // address using GEPs. Cast the base address to a pointer to the scalar
     // element type.
     BaseAddr = Builder.CreateBitCast(
         BaseAddr, VecTy->getVectorElementType()->getPointerTo(
                       LI->getPointerAddressSpace()));
   }
 
   // Select ld2/ld3/ld4 by interleave factor, overloaded on the sub-vector type
   // and on a pointer to it in the load's address space.
   Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
   Type *Tys[2] = {VecTy, PtrTy};
   static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
                                             Intrinsic::aarch64_neon_ld3,
                                             Intrinsic::aarch64_neon_ld4};
   Function *LdNFunc =
       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
 
   // Holds sub-vectors extracted from the load intrinsic return values. The
   // sub-vectors are associated with the shufflevector instructions they will
   // replace.
   DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
 
   for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
 
     // If we're generating more than one load, compute the base address of
     // subsequent loads as an offset from the previous.  Each ldN consumes
     // (sub-vector length * Factor) scalar elements.
     if (LoadCount > 0)
       BaseAddr = Builder.CreateConstGEP1_32(
           BaseAddr, VecTy->getVectorNumElements() * Factor);
 
     CallInst *LdN = Builder.CreateCall(
         LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
 
     // Extract and store the sub-vectors returned by the load intrinsic.
     for (unsigned i = 0; i < Shuffles.size(); i++) {
       ShuffleVectorInst *SVI = Shuffles[i];
       unsigned Index = Indices[i];
 
       Value *SubVec = Builder.CreateExtractValue(LdN, Index);
 
       // Convert the integer vector to pointer vector if the element is pointer.
       if (EltTy->isPointerTy())
         SubVec = Builder.CreateIntToPtr(
             SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
                                     VecTy->getVectorNumElements()));
       SubVecs[SVI].push_back(SubVec);
     }
   }
 
   // Replace uses of the shufflevector instructions with the sub-vectors
   // returned by the load intrinsic. If a shufflevector instruction is
   // associated with more than one sub-vector, those sub-vectors will be
   // concatenated into a single wide vector.
   for (ShuffleVectorInst *SVI : Shuffles) {
     auto &SubVec = SubVecs[SVI];
     auto *WideVec =
         SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
     SVI->replaceAllUsesWith(WideVec);
   }
 
   return true;
 }
 
 /// Lower an interleaved store into a stN intrinsic.
 ///
 /// E.g. Lower an interleaved store (Factor = 3):
 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
 ///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
 ///
 ///      Into:
 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
 ///
 /// Note that the new shufflevectors will be removed and we'll only generate one
 /// st3 instruction in CodeGen.
 ///
 /// Example for a more general valid mask (Factor 3). Lower:
 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
 ///
 ///      Into:
 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
 ///
 /// Returns false (emitting nothing) when NEON is unavailable or the per-field
 /// sub-vector type is not a legal interleaved access type; otherwise the stN
 /// call(s) are inserted before \p SI and true is returned.  The original
 /// store and shuffle are not erased here.
 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                   ShuffleVectorInst *SVI,
                                                   unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
 
   VectorType *VecTy = SVI->getType();
   assert(VecTy->getVectorNumElements() % Factor == 0 &&
          "Invalid interleaved store");
 
   // Each of the Factor fields contributes LaneLen elements to the wide store.
   unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
   Type *EltTy = VecTy->getVectorElementType();
   VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
 
   const DataLayout &DL = SI->getModule()->getDataLayout();
 
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
     return false;
 
   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
 
   Value *Op0 = SVI->getOperand(0);
   Value *Op1 = SVI->getOperand(1);
   // All new instructions are inserted immediately before the original store.
   IRBuilder<> Builder(SI);
 
   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
   // vectors to integer vectors.
   if (EltTy->isPointerTy()) {
     Type *IntTy = DL.getIntPtrType(EltTy);
     unsigned NumOpElts = Op0->getType()->getVectorNumElements();
 
     // Convert to the corresponding integer vector.
     Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
 
     SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
   // The base address of the store.
   Value *BaseAddr = SI->getPointerOperand();
 
   if (NumStores > 1) {
     // If we're going to generate more than one store, reset the lane length
     // and sub-vector type to something legal.
     LaneLen /= NumStores;
     SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
 
     // We will compute the pointer operand of each store from the original base
     // address using GEPs. Cast the base address to a pointer to the scalar
     // element type.
     BaseAddr = Builder.CreateBitCast(
         BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
                       SI->getPointerAddressSpace()));
   }
 
   auto Mask = SVI->getShuffleMask();
 
   // Select st2/st3/st4 by interleave factor, overloaded on the sub-vector type
   // and on a pointer to it in the store's address space.
   Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
   Type *Tys[2] = {SubVecTy, PtrTy};
   static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
                                              Intrinsic::aarch64_neon_st3,
                                              Intrinsic::aarch64_neon_st4};
   Function *StNFunc =
       Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
 
   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
 
     SmallVector<Value *, 5> Ops;
 
     // Split the shufflevector operands into sub vectors for the new stN call.
     for (unsigned i = 0; i < Factor; i++) {
       // Mask position of the first element of field i within this store's
       // chunk; successive elements of the same field are Factor apart.
       unsigned IdxI = StoreCount * LaneLen * Factor + i;
       if (Mask[IdxI] >= 0) {
         Ops.push_back(Builder.CreateShuffleVector(
             Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
       } else {
         // The first element of this field is undef.  Look for the first
         // defined element j of the same field in this chunk and derive the
         // start of the sequential run from it.  The mask position of element
         // j is IdxI + j * Factor and the run started j elements earlier; this
         // stays inside the chunk for every StoreCount, not just the first.
         unsigned StartMask = 0;
         for (unsigned j = 1; j < LaneLen; j++) {
           unsigned IdxJ = IdxI + j * Factor;
           if (Mask[IdxJ] >= 0) {
             StartMask = Mask[IdxJ] - j;
             break;
           }
         }
         // Note: Filling undef gaps with random elements is ok, since
         // those elements were being written anyway (with undefs).
         // In the case of all undefs we're defaulting to using elems from 0
         // Note: StartMask cannot be negative, it's checked in
         // isReInterleaveMask
         Ops.push_back(Builder.CreateShuffleVector(
             Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
       }
     }
 
     // If we're generating more than one store, we compute the base address of
     // subsequent stores as an offset from the previous.
     if (StoreCount > 0)
       BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
 
     // The address is the final operand of the stN call.
     Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
     Builder.CreateCall(StNFunc, Ops);
   }
   return true;
 }
 
 /// Return true when both \p SrcAlign and \p DstAlign are acceptable for an
 /// access that wants \p AlignCheck alignment. An alignment of 0 always passes;
 /// any other value has to be a multiple of \p AlignCheck.
 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                        unsigned AlignCheck) {
   // The source side is examined first, then the destination side.
   if (SrcAlign != 0 && SrcAlign % AlignCheck != 0)
     return false;
   if (DstAlign != 0 && DstAlign % AlignCheck != 0)
     return false;
   return true;
 }
 
 /// Choose the value type used to expand a memory operation of \p Size bytes.
 /// Candidates are tried from widest to narrowest (f128, i64, i32); MVT::Other
 /// is returned when none of them qualifies.
 EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
                                                unsigned SrcAlign, bool IsMemset,
                                                bool ZeroMemset,
                                                bool MemcpyStrSrc,
                                                MachineFunction &MF) const {
   const Function &F = MF.getFunction();
   bool Fast;
 
   // A candidate type qualifies when both alignments pass memOpAlign for
   // AlignCheck, or when misaligned accesses of that type are allowed and
   // reported as fast. memOpAlign treats its first two arguments
   // symmetrically, so the (Src, Dst) order used here is harmless.
   auto IsUsable = [&](EVT VT, unsigned AlignCheck) {
     if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
       return true;
     return allowsMisalignedMemoryAccesses(VT, 0, 1, &Fast) && Fast;
   };
 
   // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
   // instruction to materialize the v2i64 zero and one store (with restrictive
   // addressing mode). Just do two i64 store of zero-registers.
   const bool MayUseF128 = Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
                           !F.hasFnAttribute(Attribute::NoImplicitFloat);
   if (MayUseF128 && IsUsable(MVT::f128, 16))
     return MVT::f128;
 
   if (Size >= 8 && IsUsable(MVT::i64, 8))
     return MVT::i64;
 
   if (Size >= 4 && IsUsable(MVT::i32, 4))
     return MVT::i32;
 
   return MVT::Other;
 }
 
 // An add immediate is legal when its magnitude fits in 12 bits, either as-is
 // or shifted left by 12.
 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
   // INT64_MIN has no representable magnitude; reject it before std::abs.
   if (Immed == std::numeric_limits<int64_t>::min()) {
     LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
                       << ": avoid UB for INT64_MIN\n");
     return false;
   }
 
   // Add and sub share the encoding, so only the magnitude matters.
   Immed = std::abs(Immed);
   const bool FitsUnshifted = (Immed >> 12) == 0;
   const bool FitsShiftedBy12 = (Immed & 0xfff) == 0 && (Immed >> 24) == 0;
   const bool IsLegal = FitsUnshifted || FitsShiftedBy12;
   LLVM_DEBUG(dbgs() << "Is " << Immed
                     << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
   return IsLegal;
 }
 
 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
 // immediates is the same as for an add or a sub. This simply forwards to
 // isLegalAddImmediate and returns its answer unchanged.
 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
   return isLegalAddImmediate(Immed);
 }
 
 /// isLegalAddressingMode - Decide whether the addressing mode described by
 /// \p AM can be used by a load/store of type \p Ty on this target.
 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                   const AddrMode &AM, Type *Ty,
                                                   unsigned AS, Instruction *I) const {
   // The five basic AArch64 addressing modes are:
   //  reg
   //  reg + 9-bit signed offset
   //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
   //  reg1 + reg2
   //  reg + SIZE_IN_BYTES * reg
 
   // A global is never acceptable as a base.
   if (AM.BaseGV)
     return false;
 
   // reg + reg + imm cannot be expressed.
   if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
     return false;
 
   // Access size in bytes. It stays 0 when the type is unsized or when its
   // size in bits is not a power of two.
   uint64_t NumBytes = 0;
   if (Ty->isSized()) {
     const uint64_t NumBits = DL.getTypeSizeInBits(Ty);
     NumBytes = isPowerOf2_64(NumBits) ? NumBits / 8 : 0;
   }
 
   // Scaled forms: reg1 + reg2 and reg1 + SIZE_IN_BYTES * reg2.
   if (AM.Scale)
     return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
 
   // Unscaled forms: reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12.
   const int64_t Offset = AM.BaseOffs;
 
   // 9-bit signed offset.
   if (isInt<9>(Offset))
     return true;
 
   // 12-bit unsigned offset: positive, at most 4095 units of NumBytes, and a
   // multiple of NumBytes (NumBytes is a power of 2 whenever it is non-zero).
   unsigned shift = Log2_64(NumBytes);
   return NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
          (Offset >> shift) << shift == Offset;
 }
 
 /// Unconditionally answers true: on this target, splitting a large offset of
 /// a struct or array access is always worth considering.
 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
   // Consider splitting large offset of struct or array.
   return true;
 }
 
 /// Return the extra cost of the scale used by \p AM: -1 when the addressing
 /// mode is not legal for \p Ty, 1 when it is legal and uses a scale other
 /// than 0 or 1, and 0 otherwise.
 int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                                 const AddrMode &AM, Type *Ty,
                                                 unsigned AS) const {
   // Scaling factors are not free at all.
   // Operands                     | Rt Latency
   // -------------------------------------------
   // Rt, [Xn, Xm]                 | 4
   // -------------------------------------------
   // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
   // Rt, [Xn, Wm, <extend> #imm]  |
   if (!isLegalAddressingMode(DL, AM, Ty, AS))
     return -1;
 
   // Scale stands for reg2 * scale; charge 1 only for a real scaling factor.
   const bool UsesRealScale = AM.Scale != 0 && AM.Scale != 1;
   return UsesRealScale ? 1 : 0;
 }
 
 /// Return true when the scalar element type of \p VT is f32 or f64; every
 /// other type, including non-simple ones, yields false.
 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   const EVT ScalarVT = VT.getScalarType();
   if (!ScalarVT.isSimple())
     return false;
 
   const MVT::SimpleValueType Ty = ScalarVT.getSimpleVT().SimpleTy;
   return Ty == MVT::f32 || Ty == MVT::f64;
 }
 
 /// Return a pointer to a static table of scratch registers (X16, X17 and LR)
 /// whose last entry is 0. The calling convention argument is ignored.
 const MCPhysReg *
 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
   // LR is a callee-save register, but we must treat it as clobbered by any call
   // site. Hence we include LR in the scratch registers, which are in turn added
   // as implicit-defs for stackmaps and patchpoints.
   static const MCPhysReg ScratchRegs[] = {
     AArch64::X16, AArch64::X17, AArch64::LR, 0
   };
   return ScratchRegs;
 }
 
 /// Return false when the node examined is an unsigned bit-field extract
 /// pattern ((x >> C) & mask) of type i32 or i64, so that it can still be
 /// lowered to UBFX; return true in every other case.
 /// With the added lines of this change the node examined is operand 0 of the
 /// node passed in, and the new Level parameter is not read in this body.
 bool
-AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
+AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
+                                                     CombineLevel Level) const {
+  N = N->getOperand(0).getNode();
   EVT VT = N->getValueType(0);
   // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
   // it with shift to let it be lowered to UBFX.
   if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
       isa<ConstantSDNode>(N->getOperand(1))) {
     uint64_t TruncMask = N->getConstantOperandVal(1);
     // The AND mask must be a contiguous run of low bits and the value being
     // masked must be a logical right shift by a constant amount.
     if (isMask_64(TruncMask) &&
       N->getOperand(0).getOpcode() == ISD::SRL &&
       isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
       return false;
   }
   return true;
 }
 
 /// Decide whether the integer constant \p Imm of type \p Ty should be
 /// materialized as an immediate instead of being loaded. Zero and logical
 /// immediates are always accepted; other values are accepted when the
 /// highest set bit of the (possibly inverted) value lies in one of the three
 /// low 16-bit chunks.
 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                               Type *Ty) const {
   assert(Ty->isIntegerTy());
 
   const unsigned BitSize = Ty->getPrimitiveSizeInBits();
   if (BitSize == 0)
     return false;
 
   int64_t Val = Imm.getSExtValue();
   if (Val == 0)
     return true;
   if (AArch64_AM::isLogicalImmediate(Val, BitSize))
     return true;
 
   // Negative values are judged by their bitwise complement; 32-bit values
   // are reduced to their low 32 bits.
   if (Val < 0)
     Val = ~Val;
   if (BitSize == 32)
     Val &= (1LL << 32) - 1;
 
   // NOTE(review): when the original value is -1, Val is 0 here; LZ is then
   // presumably 64 and (63 - LZ) wraps as unsigned, giving false. Confirm
   // whether that outcome is intended.
   const unsigned LZ = countLeadingZeros((uint64_t)Val);
   const unsigned Shift = (63 - LZ) / 16;
   // MOVZ is free so return true for one or fewer MOVK.
   return Shift < 3;
 }
 
 /// An EXTRACT_SUBVECTOR is reported as cheap only when the operation is legal
 /// or custom for \p ResVT and \p Index is either 0 or the number of elements
 /// of \p ResVT.
 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                     unsigned Index) const {
   const bool Supported =
       isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT);
   if (!Supported)
     return false;
 
   if (Index == 0)
     return true;
   return Index == ResVT.getVectorNumElements();
 }
 
 /// Rewrite a vector sign-bit test of the form
 ///   xor (sra X, elt_size(X)-1), -1
 /// as
 ///   cmge X, X, #0
 /// Returns an empty SDValue when the pattern does not match.
 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
                                          const AArch64Subtarget *Subtarget) {
   EVT VT = N->getValueType(0);
   if (!Subtarget->hasNEON())
     return SDValue();
   if (!VT.isVector())
     return SDValue();
 
   // The xor must act as a 'not' (second operand all ones) applied to an
   // arithmetic shift right that has no other user.
   SDValue Shift = N->getOperand(0);
   SDValue Ones = N->getOperand(1);
   if (Shift.getOpcode() != AArch64ISD::VASHR)
     return SDValue();
   if (!Shift.hasOneUse())
     return SDValue();
   if (!ISD::isBuildVectorAllOnes(Ones.getNode()))
     return SDValue();
 
   // The shift amount must be a constant equal to the element width minus
   // one, i.e. the shift smears the sign bit across each element.
   auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
   EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
   const bool SmearsSignBit =
       ShiftAmt && ShiftAmt->getZExtValue() == ShiftEltTy.getSizeInBits() - 1;
   if (!SmearsSignBit)
     return SDValue();
 
   return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
 }
 
 // Recognize the integer abs idiom XOR(ADD(X, Y), Y) with Y = SRA(X, size(X)-1)
 // and replace it with SUBS + CSEL. Returns an empty SDValue on no match.
 static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDLoc DL(N);
 
   // The root must be an integer XOR.
   if (!VT.isInteger() || N->getOpcode() != ISD::XOR)
     return SDValue();
   // Its first operand must be ADD(X, Y) where Y is the XOR's second operand.
   if (N0.getOpcode() != ISD::ADD || N0.getOperand(1) != N1)
     return SDValue();
   // Y must be an SRA of the same X.
   if (N1.getOpcode() != ISD::SRA || N1.getOperand(0) != N0.getOperand(0))
     return SDValue();
   // The shift amount must be the constant size(X)-1.
   ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
   if (!Y1C)
     return SDValue();
   if (!(Y1C->getAPIntValue() == VT.getSizeInBits() - 1))
     return SDValue();
 
   SDValue X = N0.getOperand(0);
   SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), X);
   // Generate SUBS & CSEL: select X when the PL condition holds on the flags
   // of (X - 0), otherwise select 0 - X.
   SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
                             X, DAG.getConstant(0, DL, VT));
   return DAG.getNode(AArch64ISD::CSEL, DL, VT, X, Neg,
                      DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
                      SDValue(Cmp.getNode(), 1));
 }
 
 /// Target combine for XOR nodes. Nothing is done before operation
 /// legalization; afterwards the vector sign-bit compare fold is tried first
 /// and the integer abs fold second.
 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
   SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
   if (Cmp)
     return Cmp;
 
   return performIntegerAbsCombine(N, DAG);
 }
 
 /// Expand a signed division of N's first operand by \p Divisor, where
 /// \p Divisor is a power of two or the negation of one, into
 /// compare/add/select/shift nodes.
 ///
 /// Returns SDValue(N, 0) when isIntDivCheap reports that division is cheap,
 /// and an empty SDValue when the type is not i32/i64 or the divisor is not
 /// plus or minus a power of two. Intermediate nodes built here are appended
 /// to \p Created.
 SDValue
 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                      SelectionDAG &DAG,
                                      SmallVectorImpl<SDNode *> &Created) const {
   AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
   if (isIntDivCheap(N->getValueType(0), Attr))
     return SDValue(N,0); // Lower SDIV as SDIV
 
   // fold (sdiv X, pow2)
   EVT VT = N->getValueType(0);
   if ((VT != MVT::i32 && VT != MVT::i64) ||
       !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
     return SDValue();
 
   SDLoc DL(N);
   SDValue N0 = N->getOperand(0);
   // Number of trailing zero bits of the divisor, used as the shift amount.
   unsigned Lg2 = Divisor.countTrailingZeros();
   SDValue Zero = DAG.getConstant(0, DL, VT);
   SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
 
   // Add (N0 < 0) ? Pow2 - 1 : 0;
   SDValue CCVal;
   SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
   SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
   SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
 
   Created.push_back(Cmp.getNode());
   Created.push_back(Add.getNode());
   Created.push_back(CSel.getNode());
 
   // Divide by pow2.
   SDValue SRA =
       DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
 
   // If we're dividing by a positive value, we're done.  Otherwise, we must
   // negate the result.
   if (Divisor.isNonNegative())
     return SRA;
 
   // The shift is only recorded in Created when a negation is built on top
   // of it.
   Created.push_back(SRA.getNode());
   return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
 }
 
 /// Rewrite a multiplication by a constant as shift + add/sub, optionally
 /// followed by a negation or a second shift, when the constant has one of
 /// the shapes handled below. Returns an empty SDValue when called before
 /// operation legalization, when the right-hand side is not a constant, or
 /// when the constant has no supported shape.
 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const AArch64Subtarget *Subtarget) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
   // The below optimizations require a constant RHS.
   if (!isa<ConstantSDNode>(N->getOperand(1)))
     return SDValue();
 
   ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
   const APInt &ConstValue = C->getAPIntValue();
 
   // Multiplication of a power of two plus/minus one can be done more
   // cheaply as shift+add/sub. For now, this is true unilaterally. If
   // future CPUs have a cheaper MADD instruction, this may need to be
   // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
   // 64-bit is 5 cycles, so this is always a win.
   // More aggressively, some multiplications N0 * C can be lowered to
   // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
   // e.g. 6=3*2=(2+1)*2.
   // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
   // which equals to (1+2)*16-(1+2).
   SDValue N0 = N->getOperand(0);
   // TrailingZeroes is used to test if the mul can be lowered to
   // shift+add+shift.
   unsigned TrailingZeroes = ConstValue.countTrailingZeros();
   if (TrailingZeroes) {
     // Conservatively do not lower to shift+add+shift if the mul might be
     // folded into smul or umul.
     if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
                             isZeroExtended(N0.getNode(), DAG)))
       return SDValue();
     // Conservatively do not lower to shift+add+shift if the mul might be
     // folded into madd or msub.
     if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
                            N->use_begin()->getOpcode() == ISD::SUB))
       return SDValue();
   }
   // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
   // and shift+add+shift.
   APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
 
   // Both are assigned on every path that does not return early below.
   unsigned ShiftAmt, AddSubOpc;
   // Is the shifted value the LHS operand of the add/sub?
   bool ShiftValUseIsN0 = true;
   // Do we need to negate the result?
   bool NegateResult = false;
 
   if (ConstValue.isNonNegative()) {
     // (mul x, 2^N + 1) => (add (shl x, N), x)
     // (mul x, 2^N - 1) => (sub (shl x, N), x)
     // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
     APInt SCVMinus1 = ShiftedConstValue - 1;
     APInt CVPlus1 = ConstValue + 1;
     if (SCVMinus1.isPowerOf2()) {
       ShiftAmt = SCVMinus1.logBase2();
       AddSubOpc = ISD::ADD;
     } else if (CVPlus1.isPowerOf2()) {
       ShiftAmt = CVPlus1.logBase2();
       AddSubOpc = ISD::SUB;
     } else
       return SDValue();
   } else {
     // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
     // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
     APInt CVNegPlus1 = -ConstValue + 1;
     APInt CVNegMinus1 = -ConstValue - 1;
     if (CVNegPlus1.isPowerOf2()) {
       ShiftAmt = CVNegPlus1.logBase2();
       AddSubOpc = ISD::SUB;
       ShiftValUseIsN0 = false;
     } else if (CVNegMinus1.isPowerOf2()) {
       ShiftAmt = CVNegMinus1.logBase2();
       AddSubOpc = ISD::ADD;
       NegateResult = true;
     } else
       return SDValue();
   }
 
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
   SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
                                    DAG.getConstant(ShiftAmt, DL, MVT::i64));
 
   // Place the shifted value on the side of the add/sub chosen above.
   SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
   SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
   SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
   assert(!(NegateResult && TrailingZeroes) &&
          "NegateResult and TrailingZeroes cannot both be true for now.");
   // Negate the result.
   if (NegateResult)
     return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
   // Shift the result.
   if (TrailingZeroes)
     return DAG.getNode(ISD::SHL, DL, VT, Res,
                        DAG.getConstant(TrailingZeroes, DL, MVT::i64));
   return Res;
 }
 
 /// Vector comparisons produce 0 or -1 in each lane, so a unary operation
 /// applied to a compare result masked with a constant can be applied to the
 /// constant instead:
 ///    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
 ///       AND(VECTOR_CMP(x,y), constant2)
 ///    constant2 = UNARYOP(constant)
 /// Returns an empty SDValue when the pattern does not match.
 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
                                                          SelectionDAG &DAG) {
   // Only vector operations are handled.
   EVT VT = N->getValueType(0);
   if (!VT.isVector())
     return SDValue();
 
   // The operand of the unary operation must be a bitwise AND ...
   SDValue Masked = N->getOperand(0);
   if (Masked->getOpcode() != ISD::AND)
     return SDValue();
 
   // ... whose first operand is a SETCC ...
   SDValue Compare = Masked->getOperand(0);
   if (Compare->getOpcode() != ISD::SETCC)
     return SDValue();
 
   // ... and whose total size matches the size of the result.
   if (VT.getSizeInBits() != Masked->getValueType(0).getSizeInBits())
     return SDValue();
 
   // The other AND operand has to be a constant build vector. Non-constant
   // splats could be transformed too, but it's unclear that would be a
   // benefit as it would not eliminate any operations, just perform one more
   // step in scalar code before moving to the vector unit.
   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Masked->getOperand(1));
   if (!BV)
     return SDValue();
   if (!BV->isConstant())
     return SDValue();
 
   SDLoc DL(N);
   EVT IntVT = BV->getValueType(0);
   // Apply the unary operation to the constant itself.
   SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
   // The AND works on the integer vector type, so bitcast around it.
   SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
   SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Compare, MaskConst);
   return DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
 }
 
 /// Target combine for integer-to-floating-point conversions.
 ///
 /// The vector compare-and-mask fold is tried first. Otherwise, for f32/f64
 /// results whose source has the same width, a single-use, non-volatile
 /// normal load feeding the conversion is replaced by a load of the FP type
 /// followed by AArch64ISD::SITOF or AArch64ISD::UITOF. Returns an empty
 /// SDValue when nothing applies.
 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
                                      const AArch64Subtarget *Subtarget) {
   // First try to optimize away the conversion when it's conditionally from
   // a constant. Vectors only.
   if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
     return Res;
 
   EVT VT = N->getValueType(0);
   if (VT != MVT::f32 && VT != MVT::f64)
     return SDValue();
 
   // Only optimize when the source and destination types have the same width.
   if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
     return SDValue();
 
   // If the result of an integer load is only used by an integer-to-float
   // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
   // This eliminates an "integer-to-vector-move" UOP and improves throughput.
   SDValue N0 = N->getOperand(0);
   if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
       // Do not change the width of a volatile load.
       !cast<LoadSDNode>(N0)->isVolatile()) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     // Re-issue the load with the FP result type, reusing the chain, base
     // pointer, pointer info, alignment and memory operand flags of the
     // original load.
     SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                                LN0->getPointerInfo(), LN0->getAlignment(),
                                LN0->getMemOperand()->getFlags());
 
     // Make sure successors of the original load stay after it by updating them
     // to use the new Chain.
     DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
 
     // Signed conversions map to SITOF, everything else to UITOF.
     unsigned Opcode =
         (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
     return DAG.getNode(Opcode, SDLoc(N), VT, Load);
   }
 
   return SDValue();
 }
 
 /// Fold a floating-point multiply by power of two into floating-point to
 /// fixed-point conversion.
 ///
 /// Matches a vector FP-to-int conversion whose operand is an FMUL by a
 /// constant splat 2^C and emits the aarch64_neon_vcvtfp2fxs/vcvtfp2fxu
 /// intrinsic with C as the fixed-point position, followed by a TRUNCATE when
 /// the integer element is narrower than the float element. Returns an empty
 /// SDValue when the pattern does not match.
 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
   if (!Subtarget->hasNEON())
     return SDValue();
 
   // The result type is read through getSimpleValueType() further down, which
   // requires a simple type; give up early on anything else.
   if (!N->getValueType(0).isSimple())
     return SDValue();
 
   SDValue Op = N->getOperand(0);
   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
       Op.getOpcode() != ISD::FMUL)
     return SDValue();
 
   // The multiplier must be a build vector; whether it is a suitable constant
   // splat is checked below.
   SDValue ConstVec = Op->getOperand(1);
   if (!isa<BuildVectorSDNode>(ConstVec))
     return SDValue();
 
   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
   uint32_t FloatBits = FloatTy.getSizeInBits();
   if (FloatBits != 32 && FloatBits != 64)
     return SDValue();
 
   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
   uint32_t IntBits = IntTy.getSizeInBits();
   if (IntBits != 16 && IntBits != 32 && IntBits != 64)
     return SDValue();
 
   // Avoid conversions where iN is larger than the float (e.g., float -> i64).
   if (IntBits > FloatBits)
     return SDValue();
 
   // C is the log2 of the splat multiplier; -1, 0 and anything above Bits are
   // rejected.
   BitVector UndefElements;
   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
   int32_t Bits = IntBits == 64 ? 64 : 32;
   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
   if (C == -1 || C == 0 || C > Bits)
     return SDValue();
 
   // The conversion result has integer elements as wide as the float elements.
   MVT ResTy;
   unsigned NumLanes = Op.getValueType().getVectorNumElements();
   switch (NumLanes) {
   default:
     return SDValue();
   case 2:
     ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
     break;
   case 4:
     ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
     break;
   }
 
   if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
     return SDValue();
 
   assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
          "Illegal vector type after legalization");
 
   SDLoc DL(N);
   bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
   unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
                                       : Intrinsic::aarch64_neon_vcvtfp2fxu;
   SDValue FixConv =
       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
                   DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
                   Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
   // We can handle smaller integers by generating an extra trunc.
   if (IntBits < FloatBits)
     FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
 
   return FixConv;
 }
 
 /// Fold a floating-point divide by power of two into fixed-point to
 /// floating-point conversion.
 ///
 /// Matches an FDIV whose first operand is a vector SINT_TO_FP/UINT_TO_FP and
 /// whose second operand is a constant splat 2^C, and emits the
 /// aarch64_neon_vcvtfxs2fp/vcvtfxu2fp intrinsic with C as the fixed-point
 /// position. Returns an empty SDValue when the pattern does not match.
 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const AArch64Subtarget *Subtarget) {
   if (!Subtarget->hasNEON())
     return SDValue();
 
   SDValue Op = N->getOperand(0);
   unsigned Opc = Op->getOpcode();
   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
       !Op.getOperand(0).getValueType().isSimple() ||
       (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
     return SDValue();
 
   // The divisor must be a build vector; whether it is a suitable constant
   // splat is checked below.
   SDValue ConstVec = N->getOperand(1);
   if (!isa<BuildVectorSDNode>(ConstVec))
     return SDValue();
 
   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
   int32_t IntBits = IntTy.getSizeInBits();
   if (IntBits != 16 && IntBits != 32 && IntBits != 64)
     return SDValue();
 
   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
   int32_t FloatBits = FloatTy.getSizeInBits();
   if (FloatBits != 32 && FloatBits != 64)
     return SDValue();
 
   // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
   if (IntBits > FloatBits)
     return SDValue();
 
   // C is the log2 of the splat divisor; -1, 0 and anything above FloatBits
   // are rejected.
   BitVector UndefElements;
   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
   if (C == -1 || C == 0 || C > FloatBits)
     return SDValue();
 
   // Integer vector type with elements as wide as the float elements.
   MVT ResTy;
   unsigned NumLanes = Op.getValueType().getVectorNumElements();
   switch (NumLanes) {
   default:
     return SDValue();
   case 2:
     ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
     break;
   case 4:
     ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
     break;
   }
 
   if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
     return SDValue();
 
   SDLoc DL(N);
   SDValue ConvInput = Op.getOperand(0);
   bool IsSigned = Opc == ISD::SINT_TO_FP;
   // Widen a narrower integer input to ResTy, matching the signedness of the
   // original conversion.
   if (IntBits < FloatBits)
     ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
                             ResTy, ConvInput);
 
   unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
                                       : Intrinsic::aarch64_neon_vcvtfxu2fp;
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
                      DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
                      DAG.getConstant(C, DL, MVT::i32));
 }
 
 /// An EXTR instruction is made up of two shifts, ORed together. This helper
 /// classifies one of them: SHL sets \p FromHi to false, SRL sets it to true,
 /// and any other opcode fails. On success \p ShiftAmount and \p Src receive
 /// the constant shift amount and the shifted value. Note that \p FromHi is
 /// written even when the shift amount turns out not to be a constant.
 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
                          bool &FromHi) {
   switch (N.getOpcode()) {
   case ISD::SHL:
     FromHi = false;
     break;
   case ISD::SRL:
     FromHi = true;
     break;
   default:
     return false;
   }
 
   // Only constant shift amounts are of interest.
   const bool HasConstantAmount = isa<ConstantSDNode>(N.getOperand(1));
   if (!HasConstantAmount)
     return false;
 
   ShiftAmount = N->getConstantOperandVal(1);
   Src = N->getOperand(0);
   return true;
 }
 
 /// EXTR instruction extracts a contiguous chunk of bits from two existing
 /// registers viewed as a high/low pair. This function looks for the pattern:
 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
 /// with an EXTR. Can't quite be done in TableGen because the two immediates
 /// aren't independent.
 static SDValue tryCombineToEXTR(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
   assert(N->getOpcode() == ISD::OR && "Unexpected root");
 
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
   if (VT != MVT::i32 && VT != MVT::i64)
     return SDValue();
 
   // Classify both operands of the OR; each has to be a constant shift.
   SDValue LHS, RHS;
   uint32_t ShiftLHS = 0, ShiftRHS = 0;
   bool LHSFromHi = false, RHSFromHi = false;
   if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
     return SDValue();
   if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
     return SDValue();
 
   // One half has to come from a left shift and the other from a right shift.
   if (LHSFromHi == RHSFromHi)
     return SDValue();
 
   // The two shift amounts have to add up to the register width.
   if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
     return SDValue();
 
   // Put the left-shifted value first.
   if (LHSFromHi) {
     std::swap(LHS, RHS);
     std::swap(ShiftLHS, ShiftRHS);
   }
 
   SDValue Amount = DAG.getConstant(ShiftRHS, DL, MVT::i64);
   return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, Amount);
 }
 
 /// Try to turn a vector (or (and A, C0), (and B, C1)) into AArch64ISD::BSL
 /// when C0 and C1 are build vectors of constants and every element of C0 is
 /// the bitwise complement of the matching element of C1 within the element
 /// width. Returns an empty SDValue when no operand pairing matches.
 static SDValue tryCombineToBSL(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
 
   if (!VT.isVector())
     return SDValue();
 
   SDValue N0 = N->getOperand(0);
   if (N0.getOpcode() != ISD::AND)
     return SDValue();
 
   SDValue N1 = N->getOperand(1);
   if (N1.getOpcode() != ISD::AND)
     return SDValue();
 
   // We only have to look for constant vectors here since the general, variable
   // case can be handled in TableGen.
   unsigned Bits = VT.getScalarSizeInBits();
   // All-ones mask of the element width; the 64-bit case is special-cased to
   // avoid shifting by the full width.
   uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
   // Try every pairing of one operand from each AND, starting with operand 1
   // of both.
   for (int i = 1; i >= 0; --i)
     for (int j = 1; j >= 0; --j) {
       BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
       BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
       if (!BVN0 || !BVN1)
         continue;
 
       // Every lane must hold constants that are complements of each other.
       bool FoundMatch = true;
       for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
         ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
         ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
         if (!CN0 || !CN1 ||
             CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
           FoundMatch = false;
           break;
         }
       }
 
       // The first AND's constant is the first BSL operand; the non-constant
       // operands of the two ANDs follow.
       if (FoundMatch)
         return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
                            N0->getOperand(1 - i), N1->getOperand(1 - j));
     }
 
   return SDValue();
 }
 
 /// Target combine for OR nodes of a legal type: first try to form an EXTR
 /// from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)), then try to form a
 /// BSL. Returns an empty SDValue when the type is not legal or neither
 /// attempt succeeds.
 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
   const bool LegalType =
       DAG.getTargetLoweringInfo().isTypeLegal(N->getValueType(0));
   if (!LegalType)
     return SDValue();
 
   SDValue Res = tryCombineToEXTR(N, DCI);
   if (!Res)
     Res = tryCombineToBSL(N, DCI);
   return Res;
 }
 
 /// Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
 /// high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
 /// to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero. Returns an
 /// empty SDValue in every other case.
 static SDValue performSRLCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
   if (VT != MVT::i32 && VT != MVT::i64)
     return SDValue();
 
   // The shifted value has to be a byte swap ...
   SDValue N0 = N->getOperand(0);
   if (N0.getOpcode() != ISD::BSWAP)
     return SDValue();
 
   // ... and the shift amount a constant.
   SDValue N1 = N->getOperand(1);
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1);
   if (!C)
     return SDValue();
 
   const uint64_t ShiftAmt = C->getZExtValue();
   SDValue N00 = N0.getOperand(0);
   const bool Half32 =
       VT == MVT::i32 && ShiftAmt == 16 &&
       DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16));
   const bool Half64 =
       !Half32 && VT == MVT::i64 && ShiftAmt == 32 &&
       DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32));
   if (!Half32 && !Half64)
     return SDValue();
 
   SDLoc DL(N);
   return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
 }
 
 /// Target combine for ISD::BITCAST producing a 64-bit vector.
 ///
 /// Strips the redundant bitcasts around an extract of the low or high half of
 /// a vector, for example
 ///    (v4i16 (bitconvert
 ///             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
 ///  becomes
 ///    (extract_subvector ((v8i16 ...), (i64 4)))
 /// Returns an empty SDValue when the pattern does not match.
 static SDValue performBitcastCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      SelectionDAG &DAG) {
   // Deferred until operations are legalized so that only legal vector types
   // and such are seen here.
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
   // The ultimate result has to be a 64-bit vector.
   const EVT ResultVT = N->getValueType(0);
   if (!ResultVT.isVector() || ResultVT.getSimpleVT().getSizeInBits() != 64)
     return SDValue();
 
   // The operand has to be an extract_subvector, or an EXTRACT_SUBREG machine
   // node (a low half may come through in that form).
   SDValue Extract = N->getOperand(0);
   const bool IsSubvector = Extract->getOpcode() == ISD::EXTRACT_SUBVECTOR;
   const bool IsSubreg =
       Extract->isMachineOpcode() &&
       Extract->getMachineOpcode() == AArch64::EXTRACT_SUBREG;
   if (!IsSubvector && !IsSubreg)
     return SDValue();
 
   uint64_t Index =
       cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
   if (IsSubvector) {
     // Accept only index 0 or an index equal to the extract's own element
     // count, i.e. the start or the halfway point of the vector.
     const bool StartsAtZero = Index == 0;
     const bool StartsAtHalf =
         Extract->getValueType(0).getVectorNumElements() == Index;
     if (!StartsAtZero && !StartsAtHalf)
       return SDValue();
   } else {
     if (Index != AArch64::dsub)
       return SDValue();
     // The dsub reference is equivalent to a lane zero subvector reference.
     Index = 0;
   }
 
   // Look through the bitcast feeding the extract.
   SDValue Inner = Extract->getOperand(0);
   if (Inner->getOpcode() != ISD::BITCAST)
     return SDValue();
   SDValue Source = Inner->getOperand(0);
 
   // When the source has exactly twice as many elements as the result, the
   // whole thing is an extract of the low or high half of the source.
   const EVT SourceVT = Source->getValueType(0);
   const unsigned NumElements = ResultVT.getVectorNumElements();
   if (!SourceVT.isVector() ||
       SourceVT.getVectorNumElements() != NumElements * 2)
     return SDValue();
 
   LLVM_DEBUG(
       dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
 
   // Extract the wanted half straight from the source, without the bitcasts.
   SDLoc dl(N);
   if (Index != 0) {
     SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Source, HalfIdx);
   }
 
   SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
   SDNode *LowHalf = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
                                        ResultVT, Source, SubReg);
   return SDValue(LowHalf, 0);
 }
 
 /// Target combine for ISD::CONCAT_VECTORS.
 ///
 /// Tries three rewrites in order:
 ///  1. concat of two truncates with an illegal intermediate type is turned
 ///     into a truncate of a shuffle (run at any combine stage);
 ///  2. concat of a value with itself producing a two-element vector is turned
 ///     into an AArch64ISD::DUPLANE64 (after operation legalization only);
 ///  3. a bitcast on the right-hand operand is hoisted above the concat (after
 ///     operation legalization only, and only for two-operand concats).
 /// Returns an empty SDValue when none of them applies.
 static SDValue performConcatVectorsCombine(SDNode *N,
                                            TargetLowering::DAGCombinerInfo &DCI,
                                            SelectionDAG &DAG) {
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
 
   // Optimize concat_vectors of truncated vectors, where the intermediate
   // type is illegal, to avoid said illegality,  e.g.,
   //   (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
   //                          (v2i16 (truncate (v2i64)))))
   // ->
   //   (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
   //                                    (v4i32 (bitcast (v2i64))),
   //                                    <0, 2, 4, 6>)))
   // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
   // on both input and result type, so we might generate worse code.
   // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
   if (N->getNumOperands() == 2 &&
       N0->getOpcode() == ISD::TRUNCATE &&
       N1->getOpcode() == ISD::TRUNCATE) {
     SDValue N00 = N0->getOperand(0);
     SDValue N10 = N1->getOperand(0);
     EVT N00VT = N00.getValueType();
 
     if (N00VT == N10.getValueType() &&
         (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
         N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
       MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
       // Select the even-numbered lanes of the two bitcast inputs.
       SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
       for (size_t i = 0; i < Mask.size(); ++i)
         Mask[i] = i * 2;
       return DAG.getNode(ISD::TRUNCATE, dl, VT,
                          DAG.getVectorShuffle(
                              MidVT, dl,
                              DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
                              DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
     }
   }
 
   // Wait 'til after everything is legalized to try this. That way we have
   // legal vector types and such.
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
   // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
   // splat. The indexed instructions are going to be expecting a DUPLANE64, so
   // canonicalise to that.
   if (N0 == N1 && VT.getVectorNumElements() == 2) {
     assert(VT.getScalarSizeInBits() == 64);
     return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
                        DAG.getConstant(0, dl, MVT::i64));
   }
 
   // Canonicalise concat_vectors so that the right-hand vector has as few
   // bit-casts as possible before its real operation. The primary matching
   // destination for these operations will be the narrowing "2" instructions,
   // which depend on the operation being performed on this right-hand vector.
   // For example,
   //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
   // becomes
   //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
 
   // The rewrite below rebuilds the concat from N0 and RHS only, so it is only
   // valid for a two-operand concat; with more operands the rest would be
   // silently dropped.
   if (N->getNumOperands() != 2 || N1->getOpcode() != ISD::BITCAST)
     return SDValue();
   SDValue RHS = N1->getOperand(0);
   MVT RHSTy = RHS.getValueType().getSimpleVT();
   // If the RHS is not a vector, this is not the pattern we're looking for.
   if (!RHSTy.isVector())
     return SDValue();
 
   LLVM_DEBUG(
       dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
 
   MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
                                   RHSTy.getVectorNumElements() * 2);
   return DAG.getNode(ISD::BITCAST, dl, VT,
                      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
                                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
                                  RHS));
 }
 
 /// Transform a scalar fixed-point conversion of a value taken from a lane
 /// extract into a lane extract of the corresponding vector conversion. E.g.,
 /// from foo1 to foo2:
 /// double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
 /// double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
 ///
 /// The second form interacts better with instruction selection and the
 /// register allocator to avoid cross-class register copies that aren't
 /// coalescable due to a lane reference.
 ///
 /// \p N is the intrinsic node: operand 0 is the intrinsic ID, operand 1 the
 /// value being converted and operand 2 the shift (fractional bits) operand.
 /// Returns an empty SDValue when the transform does not apply.
 static SDValue tryCombineFixedPointConvert(SDNode *N,
                                            TargetLowering::DAGCombinerInfo &DCI,
                                            SelectionDAG &DAG) {
   // Wait until after everything is legalized to try this. That way we have
   // legal vector types and such.
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
   // Check the operand and see if it originates from a lane extract.
   SDValue Op1 = N->getOperand(1);
   if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
     return SDValue();
 
   SDValue IID = N->getOperand(0);
   SDValue Shift = N->getOperand(2);
   SDValue Vec = Op1.getOperand(0);
   SDValue Lane = Op1.getOperand(1);
   EVT ResTy = N->getValueType(0);
   SDLoc DL(N);
 
   // Only the two 128-bit integer vector types that have a same-width
   // floating-point counterpart are handled. Any other source (a different
   // width, or an extract that widens a narrower element such as an i32 taken
   // from a v8i16) is left alone rather than treated as unreachable.
   EVT VecResTy;
   EVT VecTy = Vec.getValueType();
   if (VecTy == MVT::v4i32)
     VecResTy = MVT::v4f32;
   else if (VecTy == MVT::v2i64)
     VecResTy = MVT::v2f64;
   else
     return SDValue();
 
   // The final lane extract yields one element of the vector conversion, so
   // the scalar result type must be that element type (e.g. an i32 -> f64
   // conversion cannot be expressed this way).
   if (VecResTy.getVectorElementType() != ResTy)
     return SDValue();
 
   SDValue Convert =
       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
 }
 
 // The AArch64 high-half "long" operations are formed by applying the non-high
 // version to an extract_subvector of each operand that selects the high half:
 //
 //  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
 //
 // Some operands have no explicit extract_high but can be rewritten into one
 // for free, for example:
 //
 //  (dupv64 scalar) --> (extract_high (dup128 scalar))
 //
 // This routine performs that rewrite for DUP nodes and for the immediate
 // DUP-like nodes (MOVI/MVNI variants); outer routines are expected to have
 // checked that everything else is in order. It returns an empty SDValue when
 // N is not one of the handled opcodes or is not a 64-bit vector.
 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
   switch (N.getOpcode()) {
   default:
     // FMOV could be supported, but isn't very useful, as it would only occur
     // if you passed a bitcast' floating point immediate to an eligible long
     // integer op (addl, smull, ...).
     return SDValue();
   case AArch64ISD::DUP:
   case AArch64ISD::DUPLANE8:
   case AArch64ISD::DUPLANE16:
   case AArch64ISD::DUPLANE32:
   case AArch64ISD::DUPLANE64:
   case AArch64ISD::MOVI:
   case AArch64ISD::MOVIshift:
   case AArch64ISD::MOVIedit:
   case AArch64ISD::MOVImsl:
   case AArch64ISD::MVNIshift:
   case AArch64ISD::MVNImsl:
     break;
   }
 
   const MVT HalfVT = N.getSimpleValueType();
   if (!HalfVT.is64BitVector())
     return SDValue();
 
   // Re-create the same node with twice as many lanes ...
   const unsigned HalfLanes = HalfVT.getVectorNumElements();
   const MVT FullVT =
       MVT::getVectorVT(HalfVT.getVectorElementType(), HalfLanes * 2);
 
   SDLoc dl(N);
   SDValue WideDup = DAG.getNode(N->getOpcode(), dl, FullVT, N->ops());
 
   // ... and take its upper half, which starts at lane HalfLanes.
   SDValue HighIdx = DAG.getConstant(HalfLanes, dl, MVT::i64);
   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, WideDup, HighIdx);
 }
 
 /// Returns true if \p N is an EXTRACT_SUBVECTOR, or a BITCAST whose operand is
 /// an EXTRACT_SUBVECTOR.
 static bool isEssentiallyExtractSubvector(SDValue N) {
   // Look through at most one bitcast before testing the opcode.
   SDValue Candidate = N.getOpcode() == ISD::BITCAST ? N.getOperand(0) : N;
   return Candidate.getOpcode() == ISD::EXTRACT_SUBVECTOR;
 }
 
 /// Helper structure to keep track of ISD::SET_CC operands.
 /// isSetCC() fills it from a generic ISD::SETCC node.
 struct GenericSetCCInfo {
   const SDValue *Opnd0; // Points at operand 0 of the SETCC (first compared value).
   const SDValue *Opnd1; // Points at operand 1 of the SETCC (second compared value).
   ISD::CondCode CC;     // Condition code taken from operand 2 of the SETCC.
 };
 
 /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
 /// isSetCC() fills it from an AArch64ISD::CSEL node.
 struct AArch64SetCCInfo {
   const SDValue *Cmp;     // Points at operand 3 of the CSEL.
   AArch64CC::CondCode CC; // Condition from operand 2 of the CSEL; isSetCC()
                           // inverts it when the CSEL's true value is not 1.
 };
 
 /// Helper structure to keep track of SetCC information.
 /// Holds either the generic or the AArch64 description; which member is valid
 /// is recorded separately (see SetCCInfoAndKind::IsAArch64).
 union SetCCInfo {
   GenericSetCCInfo Generic; // Description of a generic ISD::SETCC.
   AArch64SetCCInfo AArch64; // Description of an AArch64ISD::CSEL form.
 };
 
 /// Helper structure to be able to read SetCC information. When the IsAArch64
 /// field is true, Info holds an AArch64SetCCInfo; otherwise it holds a
 /// GenericSetCCInfo.
 struct SetCCInfoAndKind {
   SetCCInfo Info; // The payload; interpret according to IsAArch64.
   bool IsAArch64; // Discriminator selecting the valid member of Info.
 };
 
 /// Check whether or not \p Op is a SET_CC operation, either a generic one or
 /// an AArch64 lowered one (a CSEL choosing between the constants 1 and 0).
 /// \p SetCCInfo is filled accordingly.
 /// \post SetCCInfo is meaningful only when this function returns true.
 /// \return True when Op is a kind of SET_CC operation.
 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
   switch (Op.getOpcode()) {
   case ISD::SETCC:
     // The generic form is straightforward: record operands and condition.
     SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
     SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
     SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
     SetCCInfo.IsAArch64 = false;
     return true;
   case AArch64ISD::CSEL:
     // Handled below; it only qualifies when it is one of
     // - csel 1, 0, cc
     // - csel 0, 1, !cc
     break;
   default:
     return false;
   }
 
   // Record the comparison information before validating the operands.
   // TODO: we want the operands of the Cmp not the csel
   SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
   SetCCInfo.IsAArch64 = true;
   SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
 
   // The selected values must both be constants ...
   ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(Op.getOperand(0));
   ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
   if (!TrueC || !FalseC)
     return false;
 
   // ... and one of them must be 1 while the other is 0. If the true value is
   // not 1, try the swapped arrangement, which corresponds to the inverted
   // condition.
   if (!TrueC->isOne()) {
     std::swap(TrueC, FalseC);
     SetCCInfo.Info.AArch64.CC =
         AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
   }
   if (!TrueC->isOne())
     return false;
   return FalseC->isNullValue();
 }
 
 // Returns true if Op is a setcc (as recognised by isSetCC) or a zero-extend
 // of one; Info is filled by isSetCC in either case.
 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
   if (isSetCC(Op, Info))
     return true;
   if (Op.getOpcode() != ISD::ZERO_EXTEND)
     return false;
   return isSetCC(Op->getOperand(0), Info);
 }
 
 // The folding performed here is:
 // (add x, [zext] (setcc cc ...) )
 //   -->
 // (csel x, (add x, 1), !cc ...)
 //
 // The latter will get matched to a CSINC instruction.
 // Returns an empty SDValue when neither operand of the add is a (possibly
 // zero-extended) SET_CC, or when the comparison is not on i32/i64 values.
 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
   assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
   SDValue SetCCOp = Op->getOperand(0);
   SDValue Addend = Op->getOperand(1);
   SetCCInfoAndKind InfoAndKind;
 
   // Find which operand (if any) is the SET_CC; afterwards SetCCOp is the
   // SET_CC and Addend is the other operand ("x" above).
   if (!isSetCCOrZExtSetCC(SetCCOp, InfoAndKind)) {
     std::swap(SetCCOp, Addend);
     if (!isSetCCOrZExtSetCC(SetCCOp, InfoAndKind))
       return SDValue();
   }
 
   // FIXME: This could be generalized to work for FP comparisons.
   EVT CmpVT;
   if (InfoAndKind.IsAArch64)
     CmpVT = InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType();
   else
     CmpVT = InfoAndKind.Info.Generic.Opnd0->getValueType();
   if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
     return SDValue();
 
   // Produce the inverted condition together with the comparison node.
   SDLoc dl(Op);
   SDValue CCVal;
   SDValue Cmp;
   if (InfoAndKind.IsAArch64) {
     CCVal = DAG.getConstant(
         AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
         MVT::i32);
     Cmp = *InfoAndKind.Info.AArch64.Cmp;
   } else {
     // getAArch64Cmp receives CCVal by name and is relied on to set it.
     Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
                         *InfoAndKind.Info.Generic.Opnd1,
                         ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
                         CCVal, DAG, dl);
   }
 
   // csel x, (add x, 1), !cc
   const EVT VT = Op->getValueType(0);
   SDValue Incremented =
       DAG.getNode(ISD::ADD, dl, VT, Addend, DAG.getConstant(1, dl, VT));
   return DAG.getNode(AArch64ISD::CSEL, dl, VT, Addend, Incremented, CCVal,
                      Cmp);
 }
 
 // The basic add/sub long vector instructions have variants with "2" on the end
 // which act on the high-half of their inputs. They are normally matched by
 // patterns like:
 //
 // (add (zeroext (extract_high LHS)),
 //      (zeroext (extract_high RHS)))
 // -> uaddl2 vD, vN, vM
 //
 // However, if one of the extracts is something like a duplicate, this
 // instruction can still be used profitably. This function puts the DAG into a
 // more appropriate form for those patterns to trigger.
 //
 // N is the add/sub node being combined. For non-128-bit results an ADD is
 // handed to performSetccAddFolding instead. An empty SDValue is returned
 // whenever nothing was changed.
 static SDValue performAddSubLongCombine(SDNode *N,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         SelectionDAG &DAG) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
   MVT VT = N->getSimpleValueType(0);
   if (!VT.is128BitVector()) {
     if (N->getOpcode() == ISD::ADD)
       return performSetccAddFolding(N, DAG);
     return SDValue();
   }
 
   // Make sure both branches are extended in the same way.
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
        LHS.getOpcode() != ISD::SIGN_EXTEND) ||
       LHS.getOpcode() != RHS.getOpcode())
     return SDValue();
 
   unsigned ExtType = LHS.getOpcode();
 
   // It's not worth doing if at least one of the inputs isn't already an
   // extract, but we don't know which it'll be so we have to try both.
   if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
     RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
     if (!RHS.getNode())
       return SDValue();
 
     RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
   } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
     LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
     if (!LHS.getNode())
       return SDValue();
 
     LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
   } else {
     // Neither input is an extract: nothing was rewritten, so report "no
     // change" instead of rebuilding the node from its unchanged operands.
     return SDValue();
   }
 
   return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
 }
 
 // Massage DAGs which we can use the high-half "long" operations on into
 // something isel will recognize better. E.g.
 //
 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
 //   (aarch64_neon_umull (extract_high (v2i64 vec))
 //                       (extract_high (v2i64 (dup128 scalar))))
 //
 // N is the intrinsic node: operand 0 is the intrinsic ID and operands 1 and 2
 // are the two 64-bit vector inputs. IID is accepted for the callers'
 // convenience but is not consulted here. An empty SDValue is returned
 // whenever nothing was changed.
 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        SelectionDAG &DAG) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
   assert(LHS.getValueType().is64BitVector() &&
          RHS.getValueType().is64BitVector() &&
          "unexpected shape for long operation");
 
   // Either node could be a DUP, but it's not worth doing both of them (you'd
   // just as well use the non-high version) so look for a corresponding extract
   // operation on the other "wing".
   if (isEssentiallyExtractSubvector(LHS)) {
     RHS = tryExtendDUPToExtractHigh(RHS, DAG);
     if (!RHS.getNode())
       return SDValue();
   } else if (isEssentiallyExtractSubvector(RHS)) {
     LHS = tryExtendDUPToExtractHigh(LHS, DAG);
     if (!LHS.getNode())
       return SDValue();
   } else {
     // Neither operand is an extract: nothing was rewritten, so report "no
     // change" instead of rebuilding the intrinsic from unchanged operands.
     return SDValue();
   }
 
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
                      N->getOperand(0), LHS, RHS);
 }
 
 /// Rewrite a NEON shift-by-register intrinsic whose shift operand (operand 2)
 /// is a constant, or a constant splat, into the matching immediate-form
 /// AArch64ISD node.
 ///
 /// sqshl/uqshl/sqshlu become left shifts by immediate when the amount lies in
 /// [0, ElemBits). srshl/urshl become right shifts by immediate when the amount
 /// lies in [-ElemBits, -1]; the immediate is then the negated amount.
 /// Returns an empty SDValue otherwise.
 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
   const unsigned ElemBits =
       N->getSimpleValueType(0).getScalarType().getSizeInBits();
 
   // Extract the constant shift amount from a splat build_vector or a scalar
   // constant.
   SDValue ShiftOp = N->getOperand(2);
   int64_t ShiftAmount;
   if (auto *Splat = dyn_cast<BuildVectorSDNode>(ShiftOp)) {
     APInt SplatValue, SplatUndef;
     unsigned SplatBitSize;
     bool HasAnyUndefs;
     const bool IsSplat = Splat->isConstantSplat(
         SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, ElemBits);
     if (!IsSplat || SplatBitSize != ElemBits)
       return SDValue();
     ShiftAmount = SplatValue.getSExtValue();
   } else if (auto *Scalar = dyn_cast<ConstantSDNode>(ShiftOp)) {
     ShiftAmount = Scalar->getSExtValue();
   } else {
     return SDValue();
   }
 
   // Map the intrinsic onto the immediate-form opcode and shift direction.
   unsigned Opcode;
   bool IsRightShift;
   switch (IID) {
   case Intrinsic::aarch64_neon_sqshl:
     Opcode = AArch64ISD::SQSHL_I;
     IsRightShift = false;
     break;
   case Intrinsic::aarch64_neon_uqshl:
     Opcode = AArch64ISD::UQSHL_I;
     IsRightShift = false;
     break;
   case Intrinsic::aarch64_neon_sqshlu:
     Opcode = AArch64ISD::SQSHLU_I;
     IsRightShift = false;
     break;
   case Intrinsic::aarch64_neon_srshl:
     Opcode = AArch64ISD::SRSHR_I;
     IsRightShift = true;
     break;
   case Intrinsic::aarch64_neon_urshl:
     Opcode = AArch64ISD::URSHR_I;
     IsRightShift = true;
     break;
   default:
     llvm_unreachable("Unknown shift intrinsic");
   }
 
   // Validate the amount for the chosen direction and derive the immediate.
   const int64_t Bits = ElemBits;
   int64_t Imm;
   if (IsRightShift) {
     if (ShiftAmount > -1 || ShiftAmount < -Bits)
       return SDValue();
     Imm = -ShiftAmount;
   } else {
     if (ShiftAmount < 0 || ShiftAmount >= Bits)
       return SDValue();
     Imm = ShiftAmount;
   }
 
   SDLoc dl(N);
   return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
                      DAG.getConstant(Imm, dl, MVT::i32));
 }
 
 // The CRC32[BH] instructions ignore the high bits of their data operand. Since
 // the intrinsics must be legal and take an i32, there is almost certainly a
 // zext in the DAG, which can be eliminated.
 //
 // If the data operand (operand 2) is (and X, Mask) with exactly the given
 // constant Mask, the intrinsic is rebuilt to take X directly; otherwise an
 // empty SDValue is returned.
 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
   SDValue Data = N->getOperand(2);
   if (Data.getOpcode() != ISD::AND)
     return SDValue();
 
   auto *MaskC = dyn_cast<ConstantSDNode>(Data.getOperand(1));
   const bool IsExpectedMask = MaskC && MaskC->getZExtValue() == Mask;
   if (!IsExpectedMask)
     return SDValue();
 
   SDValue Unmasked = Data.getOperand(0);
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
                      N->getOperand(0), N->getOperand(1), Unmasked);
 }
 
 /// Rewrite an across-lanes intrinsic as node \p Opc applied to the intrinsic's
 /// vector operand (operand 1) and producing that operand's vector type,
 /// followed by an extract of lane 0 typed as the intrinsic's result.
 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
                                            SelectionDAG &DAG) {
   SDLoc dl(N);
   SDValue Input = N->getOperand(1);
   SDValue Reduced = DAG.getNode(Opc, dl, Input.getSimpleValueType(), Input);
   SDValue LaneZero = DAG.getConstant(0, dl, MVT::i64);
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Reduced,
                      LaneZero);
 }
 
 /// Dispatch an intrinsic node to the combine that handles its intrinsic ID.
 /// Returns an empty SDValue for intrinsics that have no combine here.
 /// \p Subtarget is not consulted by this function.
 static SDValue performIntrinsicCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        const AArch64Subtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
 
   // Replaces a two-input intrinsic by the generic binary node Opc acting on
   // the same inputs (operands 1 and 2).
   auto AsBinaryNode = [&DAG, N](unsigned Opc) {
     return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(1),
                        N->getOperand(2));
   };
 
   const unsigned IID = getIntrinsicID(N);
   switch (IID) {
   // Fixed-point conversions.
   case Intrinsic::aarch64_neon_vcvtfxs2fp:
   case Intrinsic::aarch64_neon_vcvtfxu2fp:
     return tryCombineFixedPointConvert(N, DCI, DAG);
 
   // Across-lanes reductions.
   case Intrinsic::aarch64_neon_saddv:
     return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
   case Intrinsic::aarch64_neon_uaddv:
     return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
   case Intrinsic::aarch64_neon_sminv:
     return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
   case Intrinsic::aarch64_neon_uminv:
     return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
   case Intrinsic::aarch64_neon_smaxv:
     return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
   case Intrinsic::aarch64_neon_umaxv:
     return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
 
   // Floating-point min/max map onto generic ISD nodes.
   case Intrinsic::aarch64_neon_fmax:
     return AsBinaryNode(ISD::FMAXNAN);
   case Intrinsic::aarch64_neon_fmin:
     return AsBinaryNode(ISD::FMINNAN);
   case Intrinsic::aarch64_neon_fmaxnm:
     return AsBinaryNode(ISD::FMAXNUM);
   case Intrinsic::aarch64_neon_fminnm:
     return AsBinaryNode(ISD::FMINNUM);
 
   // Long multiplies that may be able to use the high-half form.
   case Intrinsic::aarch64_neon_smull:
   case Intrinsic::aarch64_neon_umull:
   case Intrinsic::aarch64_neon_pmull:
   case Intrinsic::aarch64_neon_sqdmull:
     return tryCombineLongOpWithDup(IID, N, DCI, DAG);
 
   // Shifts by a constant amount.
   case Intrinsic::aarch64_neon_sqshl:
   case Intrinsic::aarch64_neon_uqshl:
   case Intrinsic::aarch64_neon_sqshlu:
   case Intrinsic::aarch64_neon_srshl:
   case Intrinsic::aarch64_neon_urshl:
     return tryCombineShiftImm(IID, N, DAG);
 
   // CRC32 on bytes and half-words: a mask on the data operand is redundant.
   case Intrinsic::aarch64_crc32b:
   case Intrinsic::aarch64_crc32cb:
     return tryCombineCRC32(0xff, N, DAG);
   case Intrinsic::aarch64_crc32h:
   case Intrinsic::aarch64_crc32ch:
     return tryCombineCRC32(0xffff, N, DAG);
 
   default:
     break;
   }
   return SDValue();
 }
 
 /// Target combine for vector extend nodes.
 ///
 /// After operation legalization it only handles (zext (sabd/uabd ...)), giving
 /// tryCombineLongOpWithDup a chance to expose a high-half form. Before
 /// operation legalization it performs a custom split of an extend from a
 /// 64-bit source vector to an illegal result type. Returns an empty SDValue
 /// when neither applies.
 static SDValue performExtendCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
   const bool AfterLegalizeOps = !DCI.isBeforeLegalizeOps();
   const unsigned ExtOpc = N->getOpcode();
 
   // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
   // the DUP can be converted into another extract_high (of a bigger DUP),
   // which helps the backend decide that an sabdl2 would be useful, saving a
   // real extract_high operation.
   if (AfterLegalizeOps && ExtOpc == ISD::ZERO_EXTEND &&
       N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
     SDNode *ABDNode = N->getOperand(0).getNode();
     const unsigned IID = getIntrinsicID(ABDNode);
     const bool IsABD = IID == Intrinsic::aarch64_neon_sabd ||
                        IID == Intrinsic::aarch64_neon_uabd;
     if (IsABD) {
       SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
       if (!NewABD.getNode())
         return SDValue();
       return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
                          NewABD);
     }
   }
 
   // The remainder is effectively a custom type legalization for AArch64.
   //
   // Type legalization will split an extend of a small, legal, type to a larger
   // illegal type by first splitting the destination type, often creating
   // illegal source types, which then get legalized in isel-confusing ways,
   // leading to really terrible codegen. E.g.,
   //   %result = v8i32 sext v8i8 %value
   // becomes
   //   %losrc = extract_subreg %value, ...
   //   %hisrc = extract_subreg %value, ...
   //   %lo = v4i32 sext v4i8 %losrc
   //   %hi = v4i32 sext v4i8 %hisrc
   // Things go rapidly downhill from there.
   //
   // For AArch64, the [sz]ext vector instructions can only go up one element
   // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
   // take two instructions.
   //
   // This implies that the most efficient way to do the extend from v8i8
   // to two v4i32 values is to first extend the v8i8 to v8i16, then do
   // the normal splitting to happen for the v8i16->v8i32.
 
   // This part runs pre-legalization only, to catch the cases where the
   // default type legalization will create ill-tempered code.
   if (AfterLegalizeOps)
     return SDValue();
 
   // Only non-legal vector result types need cleaning up. If both the source
   // and destination are legal, things just work without any fiddling.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   const EVT ResVT = N->getValueType(0);
   if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
     return SDValue();
 
   // Non-simple value types are beyond the scope of this combine; let
   // legalization do its thing and hope for the best.
   SDValue Src = N->getOperand(0);
   const EVT OrigSrcVT = Src->getValueType(0);
   if (!ResVT.isSimple() || !OrigSrcVT.isSimple())
     return SDValue();
 
   // Only a 64-bit source vector lets us play games and get better results.
   if (OrigSrcVT.getSizeInBits() != 64)
     return SDValue();
 
   // Step 1: extend the source by one element size, keeping the lane count.
   const unsigned WideEltBits = OrigSrcVT.getScalarSizeInBits() * 2;
   const EVT WideSrcVT = MVT::getVectorVT(MVT::getIntegerVT(WideEltBits),
                                          OrigSrcVT.getVectorNumElements());
   SDLoc DL(N);
   Src = DAG.getNode(ExtOpc, DL, WideSrcVT, Src);
 
   // Step 2: split the rest of the operation into two halves, each with a
   // 64-bit source.
   const unsigned NumElements = ResVT.getVectorNumElements();
   assert(!(NumElements & 1) && "Splitting vector, but not in half!");
   const EVT HalfResVT = EVT::getVectorVT(
       *DAG.getContext(), ResVT.getVectorElementType(), NumElements / 2);
   const EVT HalfSrcVT =
       EVT::getVectorVT(*DAG.getContext(), WideSrcVT.getVectorElementType(),
                        HalfResVT.getVectorNumElements());
 
   SDValue LoIdx = DAG.getConstant(0, DL, MVT::i64);
   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfSrcVT, Src, LoIdx);
   SDValue HiIdx =
       DAG.getConstant(HalfSrcVT.getVectorNumElements(), DL, MVT::i64);
   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfSrcVT, Src, HiIdx);
   Lo = DAG.getNode(ExtOpc, DL, HalfResVT, Lo);
   Hi = DAG.getNode(ExtOpc, DL, HalfResVT, Hi);
 
   // Step 3: glue the halves back together so that there is still a single
   // result, as the combiner expects.
   return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
 }
 
 /// Replace the vector store \p St by a chain of \p NumVecElts scalar stores of
 /// \p SplatVal at consecutive element offsets, and return the last store.
 ///
 /// This is at least as good as the code sequence for a split unaligned store,
 /// which is a dup.s, ext.b, and two stores. Most of the time the scalar stores
 /// should be replaced by store pair instructions (stp).
 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
                                SDValue SplatVal, unsigned NumVecElts) {
   SDLoc DL(&St);
   const unsigned OrigAlignment = St.getAlignment();
   const unsigned EltBytes = SplatVal.getValueType().getSizeInBits() / 8;
   const MachinePointerInfo &PtrInfo = St.getPointerInfo();
   const auto MemFlags = St.getMemOperand()->getFlags();
 
   // The first element goes to the original address with the original
   // alignment.
   SDValue BasePtr = St.getBasePtr();
   SDValue LastStore = DAG.getStore(St.getChain(), DL, SplatVal, BasePtr,
                                    PtrInfo, OrigAlignment, MemFlags);
 
   // As this in ISel, we will not merge this add which may degrade results.
   // So when the address is (add base, constant), fold the constant into the
   // offsets computed below.
   uint64_t BaseOffset = 0;
   if (BasePtr->getOpcode() == ISD::ADD) {
     if (auto *AddC = dyn_cast<ConstantSDNode>(BasePtr->getOperand(1))) {
       BaseOffset = AddC->getSExtValue();
       BasePtr = BasePtr->getOperand(0);
     }
   }
 
   // Emit the remaining NumVecElts - 1 stores, each chained on the previous.
   unsigned Offset = EltBytes;
   for (unsigned Remaining = NumVecElts - 1; Remaining != 0; --Remaining) {
     const unsigned Alignment = MinAlign(OrigAlignment, Offset);
     SDValue OffsetVal = DAG.getConstant(BaseOffset + Offset, DL, MVT::i64);
     SDValue OffsetPtr =
         DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, OffsetVal);
     LastStore = DAG.getStore(LastStore.getValue(0), DL, SplatVal, OffsetPtr,
                              PtrInfo.getWithOffset(Offset), Alignment,
                              MemFlags);
     Offset += EltBytes;
   }
   return LastStore;
 }
 
 /// Replace a vector store of an all-zero BUILD_VECTOR by scalar stores of
 /// WZR/XZR. The load store optimizer pass will merge them to store pair
 /// stores. This should be better than a movi to create the vector zero
 /// followed by a vector store if the zero constant is not re-used, since one
 /// instruction and one register live range will be removed.
 ///
 /// For example, the final generated code should be:
 ///
 ///   stp xzr, xzr, [x0]
 ///
 /// instead of:
 ///
 ///   movi v0.2d, #0
 ///   str q0, [x0]
 ///
 /// Returns the last scalar store, or an empty SDValue when the store is not
 /// rewritten.
 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
   SDValue StVal = St.getValue();
   const EVT VT = StVal.getValueType();
 
   // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements
   // or 2, 3 or 4 i32 elements.
   const int NumVecElts = VT.getVectorNumElements();
   const bool TwoOrThree = NumVecElts == 2 || NumVecElts == 3;
   const bool Ok64 =
       TwoOrThree && VT.getVectorElementType().getSizeInBits() == 64;
   const bool Ok32 = (TwoOrThree || NumVecElts == 4) &&
                     VT.getVectorElementType().getSizeInBits() == 32;
   if (!Ok64 && !Ok32)
     return SDValue();
 
   if (StVal.getOpcode() != ISD::BUILD_VECTOR)
     return SDValue();
 
   // If the zero constant has more than one use then the vector store could be
   // better since the constant mov will be amortized and stp q instructions
   // should be able to be formed.
   if (!StVal.hasOneUse())
     return SDValue();
 
   // Bail out if the immediate offset of the address operand is too large for
   // the stp instruction.
   if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
     const int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
     const bool FitsStp = Offset >= -512 && Offset <= 504;
     if (!FitsStp)
       return SDValue();
   }
 
   // Every element has to be an integer or floating-point zero.
   for (int Idx = 0; Idx != NumVecElts; ++Idx) {
     SDValue Elt = StVal.getOperand(Idx);
     const bool IsZero = isNullConstant(Elt) || isNullFPConstant(Elt);
     if (!IsZero)
       return SDValue();
   }
 
   // Use a CopyFromReg WZR/XZR here to prevent
   // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
   SDLoc DL(&St);
   const bool Is32Bit = VT.getVectorElementType().getSizeInBits() == 32;
   const unsigned ZeroReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
   const EVT ZeroVT = Is32Bit ? MVT::i32 : MVT::i64;
   SDValue SplatVal =
       DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
 }
 
 /// Replace a vector store of a scalar splat by scalar stores of the scalar
 /// value. The load store optimizer pass will merge them to store pair stores.
 /// This has better performance than a splat of the scalar followed by a split
 /// vector store. Even if the stores are not merged it is four stores vs a dup,
 /// followed by an ext.b and two stores.
 ///
 /// The splat is recognised as a chain of INSERT_VECTOR_ELT nodes that insert
 /// the same value at every lane of a 2- or 4-element non-floating-point
 /// vector. Returns the last scalar store, or an empty SDValue when the store
 /// is not rewritten.
 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
   SDValue Chain = St.getValue();
   const EVT VT = Chain.getValueType();
 
   // Don't replace floating point stores, they possibly won't be transformed to
   // stp because of the store pair suppress pass.
   if (VT.isFloatingPoint())
     return SDValue();
 
   // A splat can be expressed as store pair(s) only for 2 or 4 elements.
   const unsigned NumVecElts = VT.getVectorNumElements();
   if (NumVecElts != 2 && NumVecElts != 4)
     return SDValue();
 
   // Walk NumVecElts levels of the insert chain, remembering which lanes were
   // written, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
   std::bitset<4> LanesInserted;
   SDValue SplatVal;
   for (unsigned Level = 0; Level != NumVecElts; ++Level) {
     if (Chain.getOpcode() != ISD::INSERT_VECTOR_ELT)
       return SDValue();
 
     // Every insert has to write the same value.
     SDValue Inserted = Chain.getOperand(1);
     if (Level == 0)
       SplatVal = Inserted;
     else if (Inserted != SplatVal)
       return SDValue();
 
     // The lane index has to be a constant that is in range.
     auto *LaneC = dyn_cast<ConstantSDNode>(Chain.getOperand(2));
     if (!LaneC)
       return SDValue();
     const uint64_t Lane = LaneC->getZExtValue();
     if (Lane >= NumVecElts)
       return SDValue();
     LanesInserted.set(Lane);
 
     Chain = Chain.getOperand(0);
   }
 
   // Every lane must have been written at least once.
   if (LanesInserted.count() != NumVecElts)
     return SDValue();
 
   return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
 }
 
 /// Try to replace the vector store \p N by cheaper stores.
 ///
 /// Three rewrites are attempted in order: a store of a zero vector becomes
 /// scalar stores of the zero register (replaceZeroVectorStore); on subtargets
 /// where misaligned 128-bit stores are slow, a store of a splat becomes scalar
 /// stores (replaceSplatVectorStore); otherwise such a store is split into two
 /// stores of half the vector each. Returns the replacement store, or an empty
 /// SDValue when the store is left untouched. \p DCI is not used by this
 /// function.
 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                            SelectionDAG &DAG,
                            const AArch64Subtarget *Subtarget) {
 
   // Volatile and indexed stores are never rewritten.
   StoreSDNode *S = cast<StoreSDNode>(N);
   if (S->isVolatile() || S->isIndexed())
     return SDValue();
 
   // Only stores of vector values are of interest.
   SDValue StVal = S->getValue();
   EVT VT = StVal.getValueType();
   if (!VT.isVector())
     return SDValue();
 
   // If we get a splat of zeros, convert this vector store to a store of
   // scalars. They will be merged into store pairs of xzr thereby removing one
   // instruction and one register.
   if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
     return ReplacedZeroSplat;
 
   // FIXME: The logic for deciding if an unaligned store should be split should
   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
   // a call to that function here.
 
   // Everything below only applies to subtargets that report slow misaligned
   // 128-bit stores.
   if (!Subtarget->isMisaligned128StoreSlow())
     return SDValue();
 
   // Don't split at -Oz.
   if (DAG.getMachineFunction().getFunction().optForMinSize())
     return SDValue();
 
   // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
   // those up regresses performance on micro-benchmarks and olden/bh.
   if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
     return SDValue();
 
   // Split unaligned 16B stores. They are terrible for performance.
   // Don't split stores with alignment of 1 or 2. Code that uses clang vector
   // extensions can use this to mark that it does not want splitting to happen
   // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
   // eliminating alignment hazards is only 1 in 8 for alignment of 2.
   if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
       S->getAlignment() <= 2)
     return SDValue();
 
   // If we get a splat of a scalar convert this vector store to a store of
   // scalars. They will be merged into store pairs thereby removing two
   // instructions.
   if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
     return ReplacedSplat;
 
   SDLoc DL(S);
   unsigned NumElts = VT.getVectorNumElements() / 2;
   // Split VT into two.
   EVT HalfVT =
       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
   // Low half: elements [0, NumElts); high half: elements [NumElts, 2*NumElts).
   SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                    DAG.getConstant(0, DL, MVT::i64));
   SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
                                    DAG.getConstant(NumElts, DL, MVT::i64));
   // The low half is stored at the original address, chained on the original
   // store's chain.
   SDValue BasePtr = S->getBasePtr();
   SDValue NewST1 =
       DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
                    S->getAlignment(), S->getMemOperand()->getFlags());
   // The high half goes 8 bytes further (half of the 128-bit vector) and is
   // chained after the first store.
   // NOTE(review): this second store reuses S->getPointerInfo() unchanged even
   // though its address is BasePtr + 8 -- confirm whether the pointer info
   // should carry the 8 byte offset as well.
   SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                   DAG.getConstant(8, DL, MVT::i64));
   return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
                       S->getPointerInfo(), S->getAlignment(),
                       S->getMemOperand()->getFlags());
 }
 
 /// Target-specific DAG combine function for post-increment LD1 (lane) and
 /// post-increment LD1R.
 ///
 /// \p N consumes a scalar load (operand 1 of \p N when \p IsLaneOp is true,
 /// operand 0 otherwise). If the load address is also used by an ISD::ADD that
 /// is independent of the load and of the vector operand, the load, \p N and
 /// that ADD are replaced through \p DCI by one LD1LANEpost / LD1DUPpost node
 /// that additionally produces the written back address. All replacements are
 /// done with DCI.CombineTo; the function itself always returns an empty
 /// SDValue.
 static SDValue performPostLD1Combine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      bool IsLaneOp) {
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
 
   unsigned LoadIdx = IsLaneOp ? 1 : 0;
   SDNode *LD = N->getOperand(LoadIdx).getNode();
   // If it is not LOAD, can not do such combine.
   if (LD->getOpcode() != ISD::LOAD)
     return SDValue();
 
   // The vector lane must be a constant in the LD1LANE opcode.
   SDValue Lane;
   if (IsLaneOp) {
     Lane = N->getOperand(2);
     auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
     if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
       return SDValue();
   }
 
   LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
   EVT MemVT = LoadSDN->getMemoryVT();
   // Check if memory operand is the same type as the vector element.
   if (MemVT != VT.getVectorElementType())
     return SDValue();
 
   // Check if there are other uses. If so, do not combine as it will introduce
   // an extra load.
   for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
        ++UI) {
     if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
       continue;
     if (*UI != N)
       return SDValue();
   }
 
   SDValue Addr = LD->getOperand(1);
   SDValue Vector = N->getOperand(0);
   // Search for a use of the address operand that is an increment.
   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
        Addr.getNode()->use_end(); UI != UE; ++UI) {
     SDNode *User = *UI;
     if (User->getOpcode() != ISD::ADD
         || UI.getUse().getResNo() != Addr.getResNo())
       continue;
 
     // Check that the add is independent of the load.  Otherwise, folding it
     // would create a cycle.
     if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
       continue;
     // Also check that add is not used in the vector operand.  This would also
     // create a cycle.
     if (User->isPredecessorOf(Vector.getNode()))
       continue;
 
     // If the increment is a constant, it must match the memory ref size.
     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
       // Keep all 64 bits of the increment. Narrowing it to 32 bits would let
       // a constant such as 0x100000008 compare equal to an 8 byte access size
       // and be folded into a post-increment by the wrong amount.
       uint64_t IncVal = CInc->getZExtValue();
       unsigned NumBytes = VT.getScalarSizeInBits() / 8;
       if (IncVal != NumBytes)
         continue;
       // A matching constant increment is represented by XZR in place of an
       // increment register.
       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
     }
 
     // Finally, check that the vector doesn't depend on the load.
     // Again, this would create a cycle.
     // The load depending on the vector is fine, as that's the case for the
     // LD1*post we'll eventually generate anyway.
     if (LoadSDN->isPredecessorOf(Vector.getNode()))
       continue;
 
     SmallVector<SDValue, 8> Ops;
     Ops.push_back(LD->getOperand(0));  // Chain
     if (IsLaneOp) {
       Ops.push_back(Vector);           // The vector to be inserted
       Ops.push_back(Lane);             // The lane to be inserted in the vector
     }
     Ops.push_back(Addr);
     Ops.push_back(Inc);
 
     // Results of the new node: the vector, the written back address and the
     // chain.
     EVT Tys[3] = { VT, MVT::i64, MVT::Other };
     SDVTList SDTys = DAG.getVTList(Tys);
     unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
     SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
                                            MemVT,
                                            LoadSDN->getMemOperand());
 
     // Update the uses.
     SDValue NewResults[] = {
         SDValue(LD, 0),            // The result of load
         SDValue(UpdN.getNode(), 2) // Chain
     };
     DCI.CombineTo(LD, NewResults);
     DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
     DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
 
     // Only the first suitable increment is folded.
     break;
   }
   return SDValue();
 }
 
 /// Try to simplify ``Addr`` knowing that the HW ignores its top byte during
 /// address translation. Returns true if a simplification was found and
 /// committed through ``DCI``, false otherwise.
 static bool performTBISimplification(SDValue Addr,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      SelectionDAG &DAG) {
   // Only the low 56 bits of the 64 bit address are demanded.
   APInt LowAddrBits = APInt::getLowBitsSet(64, 56);
 
   const bool LegalTypes = !DCI.isBeforeLegalize();
   const bool LegalOps = !DCI.isBeforeLegalizeOps();
   TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOps);
 
   KnownBits Known;
   if (!DAG.getTargetLoweringInfo().SimplifyDemandedBits(Addr, LowAddrBits,
                                                         Known, TLO))
     return false;
 
   DCI.CommitTargetLoweringOpt(TLO);
   return true;
 }
 
 // DAG combine for stores: first try to split or scalarize the store, then, on
 // subtargets that ignore the top byte of addresses, try to simplify operand 2
 // of the store accordingly.
 static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
                                    const AArch64Subtarget *Subtarget) {
   SDValue Split = splitStores(N, DCI, DAG, Subtarget);
   if (Split)
     return Split;
 
   if (!Subtarget->supportsAddressTopByteIgnored())
     return SDValue();
 
   // A successful simplification changed N in place; report N itself.
   if (!performTBISimplification(N->getOperand(2), DCI, DAG))
     return SDValue();
   return SDValue(N, 0);
 }
 
 
 /// Target-specific DAG combine function for NEON load/store intrinsics
 /// to merge base address updates.
 ///
 /// \p N is a NEON load/store intrinsic node whose intrinsic id is operand 1
 /// and whose address is the last operand. If that address is also used by an
 /// ISD::ADD that is independent of \p N, the intrinsic and the ADD are
 /// replaced through \p DCI by the matching post-increment node, which
 /// produces the written back address as an extra result. All replacements are
 /// done with DCI.CombineTo; the function itself always returns an empty
 /// SDValue.
 static SDValue performNEONPostLDSTCombine(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           SelectionDAG &DAG) {
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();
 
   unsigned AddrOpIdx = N->getNumOperands() - 1;
   SDValue Addr = N->getOperand(AddrOpIdx);
 
   // Search for a use of the address operand that is an increment.
   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
        UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
     SDNode *User = *UI;
     if (User->getOpcode() != ISD::ADD ||
         UI.getUse().getResNo() != Addr.getResNo())
       continue;
 
     // Check that the add is independent of the load/store.  Otherwise, folding
     // it would create a cycle.
     if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
       continue;
 
     // Find the new opcode for the updating load/store.
     bool IsStore = false;
     bool IsLaneOp = false;
     bool IsDupOp = false;
     unsigned NewOpc = 0;
     unsigned NumVecs = 0;
     unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
     switch (IntNo) {
     default: llvm_unreachable("unexpected intrinsic for Neon base update");
     case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
       NumVecs = 2; break;
     case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
       NumVecs = 3; break;
     case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
       NumVecs = 4; break;
     case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
       NumVecs = 2; IsStore = true; break;
     case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
       NumVecs = 3; IsStore = true; break;
     case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
       NumVecs = 4; IsStore = true; break;
     case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
       NumVecs = 2; break;
     case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
       NumVecs = 3; break;
     case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
       NumVecs = 4; break;
     case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
       NumVecs = 2; IsStore = true; break;
     case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
       NumVecs = 3; IsStore = true; break;
     case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
       NumVecs = 4; IsStore = true; break;
     case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
       NumVecs = 2; IsDupOp = true; break;
     case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
       NumVecs = 3; IsDupOp = true; break;
     case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
       NumVecs = 4; IsDupOp = true; break;
     case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
       NumVecs = 2; IsLaneOp = true; break;
     case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
       NumVecs = 3; IsLaneOp = true; break;
     case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
       NumVecs = 4; IsLaneOp = true; break;
     case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
       NumVecs = 2; IsStore = true; IsLaneOp = true; break;
     case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
       NumVecs = 3; IsStore = true; IsLaneOp = true; break;
     case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
       NumVecs = 4; IsStore = true; IsLaneOp = true; break;
     }
 
     // The vector type is taken from the first stored vector for stores and
     // from the first result for loads.
     EVT VecTy;
     if (IsStore)
       VecTy = N->getOperand(2).getValueType();
     else
       VecTy = N->getValueType(0);
 
     // If the increment is a constant, it must match the memory ref size.
     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
       // Keep all 64 bits of the increment. Narrowing it to 32 bits would let
       // a constant such as 0x100000010 compare equal to a 16 byte access size
       // and be folded into a post-increment by the wrong amount.
       uint64_t IncVal = CInc->getZExtValue();
       unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
       // Lane and dup operations access one element per vector only.
       if (IsLaneOp || IsDupOp)
         NumBytes /= VecTy.getVectorNumElements();
       if (IncVal != NumBytes)
         continue;
       // A matching constant increment is represented by XZR in place of an
       // increment register.
       Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
     }
     SmallVector<SDValue, 8> Ops;
     Ops.push_back(N->getOperand(0)); // Incoming chain
     // Load lane and store have vector list as input.
     if (IsLaneOp || IsStore)
       for (unsigned i = 2; i < AddrOpIdx; ++i)
         Ops.push_back(N->getOperand(i));
     Ops.push_back(Addr); // Base register
     Ops.push_back(Inc);
 
     // Return Types.
     // At most four result vectors plus the write back register and the chain.
     EVT Tys[6];
     unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
     unsigned n;
     for (n = 0; n < NumResultVecs; ++n)
       Tys[n] = VecTy;
     Tys[n++] = MVT::i64;  // Type of write back register
     Tys[n] = MVT::Other;  // Type of the chain
     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
 
     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
                                            MemInt->getMemoryVT(),
                                            MemInt->getMemOperand());
 
     // Update the uses.
     // The old node's results map onto the new node's vector results followed
     // by its chain (result NumResultVecs + 1); the ADD is replaced by the
     // write back register (result NumResultVecs).
     std::vector<SDValue> NewResults;
     for (unsigned i = 0; i < NumResultVecs; ++i) {
       NewResults.push_back(SDValue(UpdN.getNode(), i));
     }
     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
     DCI.CombineTo(N, NewResults);
     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
 
     // Only the first suitable increment is folded.
     break;
   }
   return SDValue();
 }
 
 // Checks to see if the value is the prescribed width and returns information
 // about its extension mode.
 //
 // V is accepted when it is
 //  - a load whose memory type is i8 (width == 8) or i16 (width == 16);
 //    ExtType is set to the extension type of the load,
 //  - an AssertSext / AssertZext of i8 or i16 matching width; ExtType is set
 //    to SEXTLOAD / ZEXTLOAD respectively,
 //  - a constant strictly between -2^(width-1) and 2^(width-1); ExtType stays
 //    NON_EXTLOAD.
 // Any other value is rejected. ExtType is reset to NON_EXTLOAD on entry, so
 // it is NON_EXTLOAD whenever false is returned.
 static
 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
   ExtType = ISD::NON_EXTLOAD;
   switch(V.getNode()->getOpcode()) {
   default:
     return false;
   case ISD::LOAD: {
     LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
     if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
        || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
       ExtType = LoadNode->getExtensionType();
       return true;
     }
     return false;
   }
   case ISD::AssertSext: {
     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
     if ((TypeNode->getVT() == MVT::i8 && width == 8)
        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
       ExtType = ISD::SEXTLOAD;
       return true;
     }
     return false;
   }
   case ISD::AssertZext: {
     VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
     if ((TypeNode->getVT() == MVT::i8 && width == 8)
        || (TypeNode->getVT() == MVT::i16 && width == 16)) {
       ExtType = ISD::ZEXTLOAD;
       return true;
     }
     return false;
   }
   case ISD::Constant:
   case ISD::TargetConstant: {
     // Compare against both bounds instead of taking the absolute value:
     // the absolute value of INT64_MIN is not representable, so computing it
     // overflows and such a constant could be accepted by mistake.
     const int64_t Limit = 1LL << (width - 1);
     const int64_t Value = cast<ConstantSDNode>(V.getNode())->getSExtValue();
     return Value > -Limit && Value < Limit;
   }
   }
 
   // Not reached: every case above returns.
   return true;
 }
 
 // This function does a whole lot of voodoo to determine if the tests are
 // equivalent without and with a mask. Essentially what happens is that given a
 // DAG resembling:
 //
 //  +-------------+ +-------------+ +-------------+ +-------------+
 //  |    Input    | | AddConstant | | CompConstant| |     CC      |
 //  +-------------+ +-------------+ +-------------+ +-------------+
 //           |           |           |               |
 //           V           V           |    +----------+
 //          +-------------+  +----+  |    |
 //          |     ADD     |  |0xff|  |    |
 //          +-------------+  +----+  |    |
 //                  |           |    |    |
 //                  V           V    |    |
 //                 +-------------+   |    |
 //                 |     AND     |   |    |
 //                 +-------------+   |    |
 //                      |            |    |
 //                      +-----+      |    |
 //                            |      |    |
 //                            V      V    V
 //                           +-------------+
 //                           |     CMP     |
 //                           +-------------+
 //
 // The AND node may be safely removed for some combinations of inputs. In
 // particular we need to take into account the extension type of the Input,
 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
 // width of the input (this can work for any width inputs, the above graph is
 // specific to 8 bits).
 //
 // The specific equations were worked out by generating output tables for each
 // AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
 // problem was simplified by working with 4 bit inputs, which means we only
 // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
 // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
 // patterns present in both extensions (0,7). For every distinct set of
 // AddConstant and CompConstant bit patterns we can consider the masked and
 // unmasked versions to be equivalent if the result of this function is true for
 // all 16 distinct bit patterns of the current extension type of Input (w0).
 //
 //   sub      w8, w0, w1
 //   and      w10, w8, #0x0f
 //   cmp      w8, w2
 //   cset     w9, AArch64CC
 //   cmp      w10, w2
 //   cset     w11, AArch64CC
 //   cmp      w9, w11
 //   cset     w0, eq
 //   ret
 //
 // Since the above function shows when the outputs are equivalent it defines
 // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
 // would be expensive to run during compiles. The equations below were written
 // in a test harness that confirmed they gave equivalent outputs to the above
 // function for all inputs, so they can be used to determine if the removal is
 // legal instead.
 //
 // isEquivalentMaskless() is the code for testing if the AND can be removed
 // factored out of the DAG recognition as the DAG can take several forms.
 
 // Returns true if, for condition code CC, comparing the masked sum against
 // CompConstant gives the same answer as comparing the unmasked sum, i.e. the
 // AND with the width-bit mask may be dropped.
 //
 //  CC           - the AArch64CC condition code consuming the comparison.
 //  width        - nominal width in bits of the input.
 //  ExtType      - how the input was extended; SEXTLOAD is folded into
 //                 AddConstant below, every other value is handled alike.
 //  AddConstant  - the constant added to the input.
 //  CompConstant - the constant the result is compared against.
 static bool isEquivalentMaskless(unsigned CC, unsigned width,
                                  ISD::LoadExtType ExtType, int AddConstant,
                                  int CompConstant) {
   // By being careful about our equations and only writing them in terms of
   // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
   // make them generally applicable to all bit widths.
   // Note that MaxUInt is 2^width, i.e. one more than the largest unsigned
   // value of that width.
   int MaxUInt = (1 << width);
 
   // For the purposes of these comparisons sign extending the type is
   // equivalent to zero extending the add and displacing it by half the integer
   // width. Provided we are careful and make sure our equations are valid over
   // the whole range we can just adjust the input and avoid writing equations
   // for sign extended inputs.
   if (ExtType == ISD::SEXTLOAD)
     AddConstant -= (1 << (width-1));
 
   // Each group of condition codes below shares one set of equations; a group
   // whose equations do not hold falls out of the switch and returns false.
   switch(CC) {
   case AArch64CC::LE:
   case AArch64CC::GT:
     if ((AddConstant == 0) ||
         (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
         (AddConstant >= 0 && CompConstant < 0) ||
         (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
       return true;
     break;
   case AArch64CC::LT:
   case AArch64CC::GE:
     if ((AddConstant == 0) ||
         (AddConstant >= 0 && CompConstant <= 0) ||
         (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
       return true;
     break;
   case AArch64CC::HI:
   case AArch64CC::LS:
     if ((AddConstant >= 0 && CompConstant < 0) ||
        (AddConstant <= 0 && CompConstant >= -1 &&
         CompConstant < AddConstant + MaxUInt))
       return true;
    break;
   case AArch64CC::PL:
   case AArch64CC::MI:
     if ((AddConstant == 0) ||
         (AddConstant > 0 && CompConstant <= 0) ||
         (AddConstant < 0 && CompConstant <= AddConstant))
       return true;
     break;
   case AArch64CC::LO:
   case AArch64CC::HS:
     if ((AddConstant >= 0 && CompConstant <= 0) ||
         (AddConstant <= 0 && CompConstant >= 0 &&
          CompConstant <= AddConstant + MaxUInt))
       return true;
     break;
   case AArch64CC::EQ:
   case AArch64CC::NE:
     if ((AddConstant > 0 && CompConstant < 0) ||
         (AddConstant < 0 && CompConstant >= 0 &&
          CompConstant < AddConstant + MaxUInt) ||
         (AddConstant >= 0 && CompConstant >= 0 &&
          CompConstant >= AddConstant) ||
         (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
       return true;
     break;
   // These condition codes are treated as equivalent unconditionally.
   case AArch64CC::VS:
   case AArch64CC::VC:
   case AArch64CC::AL:
   case AArch64CC::NV:
     return true;
   case AArch64CC::Invalid:
     break;
   }
 
   return false;
 }
 
 // Removes a superfluous AND mask in front of a SUBS that feeds the condition
 // of N.
 //
 // N is a node with a condition code at operand CCIndex and a flag-producing
 // compare at operand CmpIndex. The pattern matched is
 //   SUBS (AND (ADD Input, AddConstant), 0xff or 0xffff), CompConstant
 // where all of the inputs are known to fit the mask width. When
 // isEquivalentMaskless() shows that the mask does not change the outcome for
 // the condition code, all uses of the SUBS are redirected to a new SUBS that
 // takes the ADD directly and N is returned; otherwise an empty SDValue is
 // returned. DCI is not used by this function.
 static
 SDValue performCONDCombine(SDNode *N,
                            TargetLowering::DAGCombinerInfo &DCI,
                            SelectionDAG &DAG, unsigned CCIndex,
                            unsigned CmpIndex) {
   unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
   SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
   unsigned CondOpcode = SubsNode->getOpcode();
 
   if (CondOpcode != AArch64ISD::SUBS)
     return SDValue();
 
   // There is a SUBS feeding this condition. Is it fed by a mask we can
   // use?
 
   SDNode *AndNode = SubsNode->getOperand(0).getNode();
   unsigned MaskBits = 0;
 
   if (AndNode->getOpcode() != ISD::AND)
     return SDValue();
 
   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
     // Compare the full 64-bit mask. Narrowing it to 32 bits first would make
     // a mask such as 0x1000000FF look like 0xFF and be dropped incorrectly.
     uint64_t CNV = CN->getZExtValue();
     if (CNV == 255)
       MaskBits = 8;
     else if (CNV == 65535)
       MaskBits = 16;
   }
 
   if (!MaskBits)
     return SDValue();
 
   SDValue AddValue = AndNode->getOperand(0);
 
   if (AddValue.getOpcode() != ISD::ADD)
     return SDValue();
 
   // The basic dag structure is correct, grab the inputs and validate them.
 
   SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
   SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
   SDValue SubsInputValue = SubsNode->getOperand(1);
 
   // The mask is present and the provenance of all the values is a smaller type,
   // lets see if the mask is superfluous.
 
   if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
       !isa<ConstantSDNode>(SubsInputValue.getNode()))
     return SDValue();
 
   ISD::LoadExtType ExtType;
 
   // The order matters: every call overwrites ExtType, so the extension type
   // used below is the one of AddInputValue1, which is checked last.
   if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
       !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
       !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
     return SDValue();
 
   if(!isEquivalentMaskless(CC, MaskBits, ExtType,
                 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
                 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
     return SDValue();
 
   // The AND is not necessary, remove it.
 
   SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
                                SubsNode->getValueType(1));
   SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
 
   SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
   DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
 
   return SDValue(N, 0);
 }
 
 // Fold a compare against zero that feeds an EQ/NE conditional branch into a
 // single CBZ/CBNZ. The replacement is done through DCI.CombineTo; an empty
 // SDValue is returned in every case.
 static SDValue performBRCONDCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     SelectionDAG &DAG) {
   // Let the generic condition combine run first and continue on its result.
   if (SDValue Simplified = performCONDCombine(N, DCI, DAG, 2, 3))
     N = Simplified.getNode();
 
   // Operands of N: 0 = chain, 1 = destination, 2 = condition code,
   // 3 = compare.
   SDValue CCVal = N->getOperand(2);
   assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
   const unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
   const bool BranchIfZero = CC == AArch64CC::EQ;
   if (!BranchIfZero && CC != AArch64CC::NE)
     return SDValue();
 
   // The flags have to come from an ADDS or a SUBS.
   SDValue Cmp = N->getOperand(3);
   switch (Cmp.getOpcode()) {
   case AArch64ISD::ADDS:
   case AArch64ISD::SUBS:
     break;
   default:
     return SDValue();
   }
 
   // Only attempt folding if the value result is unused and the flag result
   // has exactly one use.
   if (!(Cmp->hasNUsesOfValue(0, 0) && Cmp->hasNUsesOfValue(1, 1)))
     return SDValue();
 
   SDValue LHS = Cmp.getOperand(0);
   SDValue RHS = Cmp.getOperand(1);
 
   assert(LHS.getValueType() == RHS.getValueType() &&
          "Expected the value type to be the same for both operands!");
   const EVT OpVT = LHS.getValueType();
   if (OpVT != MVT::i32 && OpVT != MVT::i64)
     return SDValue();
 
   // Move a zero constant to the right-hand side; after that the right-hand
   // side must be zero.
   if (isNullConstant(LHS))
     std::swap(LHS, RHS);
   if (!isNullConstant(RHS))
     return SDValue();
 
   // Values produced by a shift are left alone.
   switch (LHS.getOpcode()) {
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:
     return SDValue();
   default:
     break;
   }
 
   // Fold the compare into the branch instruction.
   const unsigned BranchOpc = BranchIfZero ? AArch64ISD::CBZ : AArch64ISD::CBNZ;
   SDValue BR = DAG.getNode(BranchOpc, SDLoc(N), MVT::Other, N->getOperand(0),
                            LHS, N->getOperand(1));
 
   // Do not add new nodes to DAG combiner worklist.
   DCI.CombineTo(N, BR, false);
 
   return SDValue();
 }
 
 // Look through simple operations feeding a tbz/tbnz. Returns the operand that
 // should be tested instead of Op (Op itself when nothing can be looked
 // through), and updates Bit to the bit to test and Invert to whether the
 // sense of the test has to be flipped. This code is required to catch these
 // cases (as opposed to standard dag combines) because AArch64ISD::TBZ is
 // matched during legalization.
 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
                                  SelectionDAG &DAG) {
 
   // A value with other users is never looked through.
   if (!Op->hasOneUse())
     return Op;
 
   // We don't handle undef/constant-fold cases below, as they should have
   // already been taken care of (e.g. and of 0, test of undefined shifted bits,
   // etc.)
 
   const unsigned Opc = Op->getOpcode();
 
   // (tbz (trunc x), b) -> (tbz x, b)
   // This case is just here to enable more of the below cases to be caught.
   if (Opc == ISD::TRUNCATE && Bit < Op->getValueType(0).getSizeInBits())
     return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
 
   // Everything else is a binary operation with a constant right-hand side.
   if (Op->getNumOperands() != 2)
     return Op;
   auto *Imm = dyn_cast<ConstantSDNode>(Op->getOperand(1));
   if (!Imm)
     return Op;
 
   SDValue Src = Op->getOperand(0);
   switch (Opc) {
   // (tbz (and x, m), b) -> (tbz x, b)
   case ISD::AND: {
     const uint64_t Mask = Imm->getZExtValue();
     if (((Mask >> Bit) & 1) == 0)
       return Op;
     return getTestBitOperand(Src, Bit, Invert, DAG);
   }
 
   // (tbz (shl x, c), b) -> (tbz x, b-c)
   case ISD::SHL: {
     const uint64_t ShAmt = Imm->getZExtValue();
     const bool InRange =
         ShAmt <= Bit && (Bit - ShAmt) < Op->getValueType(0).getSizeInBits();
     if (!InRange)
       return Op;
     Bit = Bit - ShAmt;
     return getTestBitOperand(Src, Bit, Invert, DAG);
   }
 
   // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
   case ISD::SRA: {
     const unsigned Width = Op->getValueType(0).getSizeInBits();
     Bit = Bit + Imm->getZExtValue();
     if (Bit >= Width)
       Bit = Width - 1;
     return getTestBitOperand(Src, Bit, Invert, DAG);
   }
 
   // (tbz (srl x, c), b) -> (tbz x, b+c)
   case ISD::SRL: {
     const uint64_t Shifted = Bit + Imm->getZExtValue();
     if (Shifted >= Op->getValueType(0).getSizeInBits())
       return Op;
     Bit = Shifted;
     return getTestBitOperand(Src, Bit, Invert, DAG);
   }
 
   // (tbz (xor x, -1), b) -> (tbnz x, b)
   case ISD::XOR: {
     const bool FlipsTestedBit = ((Imm->getZExtValue() >> Bit) & 1) != 0;
     if (FlipsTestedBit)
       Invert = !Invert;
     return getTestBitOperand(Src, Bit, Invert, DAG);
   }
 
   default:
     return Op;
   }
 }
 
 // Simplify the tested operand of a TBZ/TBNZ node. When getTestBitOperand finds
 // a simpler operand, a new test-and-branch on that operand is returned, with
 // the opcode swapped if the sense of the test got inverted. Otherwise an
 // empty SDValue is returned.
 static SDValue performTBZCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  SelectionDAG &DAG) {
   SDValue OrigSrc = N->getOperand(1);
   unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
   bool Invert = false;
   SDValue NewSrc = getTestBitOperand(OrigSrc, Bit, Invert, DAG);
 
   // Nothing was looked through.
   if (NewSrc == OrigSrc)
     return SDValue();
 
   unsigned NewOpc = N->getOpcode();
   if (Invert) {
     // Swap TBZ and TBNZ.
     const bool WasTBZ = NewOpc == AArch64ISD::TBZ;
     assert(WasTBZ || NewOpc == AArch64ISD::TBNZ);
     NewOpc = WasTBZ ? AArch64ISD::TBNZ : AArch64ISD::TBZ;
   }
 
   SDLoc DL(N);
   SDValue BitNo = DAG.getConstant(Bit, DL, MVT::i64);
   return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewSrc, BitNo,
                      N->getOperand(3));
 }
 
 // vselect (v1i1 setcc) ->
 //     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
 // FIXME: The type legalizer currently can't handle a VSELECT whose condition
 // is v1i1. Once it legalizes "VSELECT v1i1" correctly this combine is no
 // longer needed.
 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
   SDValue Cond = N->getOperand(0);
   if (Cond.getOpcode() != ISD::SETCC)
     return SDValue();
 
   // Only a single-element i1 condition is rewritten.
   EVT CCVT = Cond.getValueType();
   if (CCVT.getVectorNumElements() != 1 ||
       CCVT.getVectorElementType() != MVT::i1)
     return SDValue();
 
   // Only combine when the result type is of the same size as the compared
   // operands.
   SDValue CmpLHS = Cond.getOperand(0);
   EVT ResVT = N->getValueType(0);
   EVT CmpVT = CmpLHS.getValueType();
   if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
     return SDValue();
 
   // Redo the comparison with the integer version of the operand type as its
   // result type and select on that.
   SDLoc DL(N);
   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   SDValue WideSetCC =
       DAG.getSetCC(DL, CmpVT.changeVectorElementTypeToInteger(), CmpLHS,
                    Cond.getOperand(1), CC);
   return DAG.getNode(ISD::VSELECT, DL, ResVT, WideSetCC, N->getOperand(1),
                      N->getOperand(2));
 }
 
 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
 /// the compare-mask instructions rather than going via NZCV, even if LHS and
 /// RHS are really scalar. This replaces any scalar setcc in the above pattern
 /// with a vector one followed by a DUP shuffle on the result.
 ///
 /// \param N   The ISD::SELECT node being combined; operand 0 is the condition,
 ///            operands 1 and 2 are the values selected between.
 /// \param DCI Combiner state; supplies the SelectionDAG and whether type
 ///            legalization has already run.
 /// \returns The replacement select, or an empty SDValue if the pattern does
 ///          not match or the required vector types do not line up.
 static SDValue performSelectCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
   SelectionDAG &DAG = DCI.DAG;
   SDValue N0 = N->getOperand(0);
   EVT ResVT = N->getValueType(0);
 
   if (N0.getOpcode() != ISD::SETCC)
     return SDValue();
 
   // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
   // scalar SetCCResultType. We also don't expect vectors, because we assume
   // that selects fed by vector SETCCs are canonicalized to VSELECT.
   assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
          "Scalar-SETCC feeding SELECT has unexpected result type!");
 
   // Type of the scalars being compared.
   EVT SrcVT = N0.getOperand(0).getValueType();
 
   // Don't try to do this optimization when the setcc itself has i1 operands.
   // There are no legal vectors of i1, so this would be pointless.
   if (SrcVT == MVT::i1)
     return SDValue();
 
   // If NumMaskElts == 0, the comparison is larger than select result. The
   // largest real NEON comparison is 64-bits per lane, which means the result is
   // at most 32-bits and an illegal vector. Just bail out for now.
   // Scalar (non-vector) select results are rejected here as well.
   int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
   if (!ResVT.isVector() || NumMaskElts == 0)
     return SDValue();
 
   // From here on SrcVT is the vector type the comparison is performed in, and
   // CCVT is the same-shaped integer vector that holds the comparison mask.
   SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
   EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
 
   // Also bail out if the vector CCVT isn't the same size as ResVT.
   // This can happen if the SETCC operand size doesn't divide the ResVT size
   // (e.g., f64 vs v3f32).
   if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
     return SDValue();
 
   // Make sure we didn't create illegal types, if we're not supposed to.
   assert(DCI.isBeforeLegalize() ||
          DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
 
   // First perform a vector comparison, where lane 0 is the one we're interested
   // in.
   SDLoc DL(N0);
   SDValue LHS =
       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
   SDValue RHS =
       DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
   SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
 
   // Now duplicate the comparison mask we want across all other lanes.
   // An all-zero shuffle mask selects lane 0 for every output lane.
   SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
   SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
   // Reinterpret the mask with the lane layout of the select result.
   Mask = DAG.getNode(ISD::BITCAST, DL,
                      ResVT.changeVectorElementTypeToInteger(), Mask);
 
   return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
 }
 
 /// Fold away an NVCAST whose source already has the result type: such a cast
 /// changes nothing, so its operand is returned directly. Any other NVCAST is
 /// left alone (empty SDValue).
 static SDValue performNVCASTCombine(SDNode *N) {
   SDValue Src = N->getOperand(0);
   const bool IsNoOp = (N->getValueType(0) == Src.getValueType());
   return IsNoOp ? Src : SDValue();
 }
 
 // When every user of a globaladdr has the shape (globaladdr + constant), pick
 // the smallest such constant, fold it into the globaladdr's own offset and
 // express the node as (globaladdr + constant) - constant.
 //
 // Returns an empty SDValue when N is not a GlobalAddress node, when the global
 // is not classified as MO_NO_FLAG, when some user is not an ADD with a constant
 // operand, or when the folded offset fails the legality checks below.
 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
                                            const AArch64Subtarget *Subtarget,
                                            const TargetMachine &TM) {
   auto *GN = dyn_cast<GlobalAddressSDNode>(N);
   if (!GN)
     return SDValue();
   if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
       AArch64II::MO_NO_FLAG)
     return SDValue();
 
   // Find the smallest constant added by any user; give up on the first user
   // that is not "add <this>, <constant>" (in either operand order).
   uint64_t SmallestAddend = -1ull;
   for (SDNode *User : GN->uses()) {
     if (User->getOpcode() != ISD::ADD)
       return SDValue();
     auto *Addend = dyn_cast<ConstantSDNode>(User->getOperand(0));
     if (!Addend)
       Addend = dyn_cast<ConstantSDNode>(User->getOperand(1));
     if (!Addend)
       return SDValue();
     SmallestAddend = std::min(SmallestAddend, Addend->getZExtValue());
   }
   const uint64_t OldOffset = uint64_t(GN->getOffset());
   const uint64_t NewOffset = SmallestAddend + GN->getOffset();
 
   // The new offset has to be strictly larger than the current one. Without
   // this we could ping-pong between two equivalent DAGs, for example
   // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
   if (NewOffset <= OldOffset)
     return SDValue();
 
   // Legality of the folded offset: it must stay below 2^21, the largest offset
   // expressible in all object formats, and it must not point outside the
   // referenced object, which would violate the code model.
   //
   // Negative offsets are rejected by the same test because, being unsigned
   // here, they look like huge positive values. They could also break the code
   // model and are too rare to be worth supporting.
   if (NewOffset >= (1 << 21))
     return SDValue();
 
   const GlobalValue *GV = GN->getGlobal();
   Type *ObjTy = GV->getValueType();
   if (!ObjTy->isSized())
     return SDValue();
   if (NewOffset > GV->getParent()->getDataLayout().getTypeAllocSize(ObjTy))
     return SDValue();
 
   SDLoc DL(GN);
   SDValue Folded = DAG.getGlobalAddress(GV, DL, MVT::i64, NewOffset);
   SDValue Compensation = DAG.getConstant(SmallestAddend, DL, MVT::i64);
   return DAG.getNode(ISD::SUB, DL, MVT::i64, Folded, Compensation);
 }
 
 /// Target hook for DAG combining: dispatches on the opcode of \p N to the
 /// matching AArch64-specific combine helper.
 ///
 /// \param N   The node to try to combine.
 /// \param DCI Combiner state (also provides the SelectionDAG).
 /// \returns Whatever the selected helper returns, or an empty SDValue when no
 ///          helper applies to this opcode / intrinsic.
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
   default:
     LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
     break;
   case ISD::ADD:
   case ISD::SUB:
     return performAddSubLongCombine(N, DCI, DAG);
   case ISD::XOR:
     return performXorCombine(N, DAG, DCI, Subtarget);
   case ISD::MUL:
     return performMulCombine(N, DAG, DCI, Subtarget);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
     return performIntToFpCombine(N, DAG, Subtarget);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
     return performFpToIntCombine(N, DAG, DCI, Subtarget);
   case ISD::FDIV:
     return performFDivCombine(N, DAG, DCI, Subtarget);
   case ISD::OR:
     return performORCombine(N, DCI, Subtarget);
   case ISD::SRL:
     return performSRLCombine(N, DCI);
   case ISD::INTRINSIC_WO_CHAIN:
     return performIntrinsicCombine(N, DCI, Subtarget);
   case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::SIGN_EXTEND:
     return performExtendCombine(N, DCI, DAG);
   case ISD::BITCAST:
     return performBitcastCombine(N, DCI, DAG);
   case ISD::CONCAT_VECTORS:
     return performConcatVectorsCombine(N, DCI, DAG);
   case ISD::SELECT:
     return performSelectCombine(N, DCI);
   case ISD::VSELECT:
     return performVSelectCombine(N, DCI.DAG);
   case ISD::LOAD:
     // Operand 1 of the load is handed to the TBI simplification; on success
     // the load node itself is reported as the (updated) result.
     if (performTBISimplification(N->getOperand(1), DCI, DAG))
       return SDValue(N, 0);
     break;
   case ISD::STORE:
     return performSTORECombine(N, DCI, DAG, Subtarget);
   case AArch64ISD::BRCOND:
     return performBRCONDCombine(N, DCI, DAG);
   case AArch64ISD::TBNZ:
   case AArch64ISD::TBZ:
     return performTBZCombine(N, DCI, DAG);
   case AArch64ISD::CSEL:
     return performCONDCombine(N, DCI, DAG, 2, 3);
   case AArch64ISD::DUP:
     return performPostLD1Combine(N, DCI, false);
   case AArch64ISD::NVCAST:
     return performNVCASTCombine(N);
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     // Operand 1 holds the intrinsic ID for chained intrinsics.
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
     case Intrinsic::aarch64_neon_ld2:
     case Intrinsic::aarch64_neon_ld3:
     case Intrinsic::aarch64_neon_ld4:
     case Intrinsic::aarch64_neon_ld1x2:
     case Intrinsic::aarch64_neon_ld1x3:
     case Intrinsic::aarch64_neon_ld1x4:
     case Intrinsic::aarch64_neon_ld2lane:
     case Intrinsic::aarch64_neon_ld3lane:
     case Intrinsic::aarch64_neon_ld4lane:
     case Intrinsic::aarch64_neon_ld2r:
     case Intrinsic::aarch64_neon_ld3r:
     case Intrinsic::aarch64_neon_ld4r:
     case Intrinsic::aarch64_neon_st2:
     case Intrinsic::aarch64_neon_st3:
     case Intrinsic::aarch64_neon_st4:
     case Intrinsic::aarch64_neon_st1x2:
     case Intrinsic::aarch64_neon_st1x3:
     case Intrinsic::aarch64_neon_st1x4:
     case Intrinsic::aarch64_neon_st2lane:
     case Intrinsic::aarch64_neon_st3lane:
     case Intrinsic::aarch64_neon_st4lane:
       return performNEONPostLDSTCombine(N, DCI, DAG);
     default:
       break;
     }
     // Unhandled intrinsics must not fall through into the GlobalAddress case
     // below; leave the outer switch explicitly.
     break;
   case ISD::GlobalAddress:
     return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
   }
   return SDValue();
 }
 
 // Decide whether the single result of N feeds nothing but a return, which is
 // a precondition for emitting a tail call. Target-specific return nodes and
 // other "odd" constructs that the generic analysis would miss are checked
 // here.
 //
 // On success, Chain is updated to the chain to use for the tail call (the
 // CopyToReg's input chain when the value goes through a copy) and true is
 // returned. On failure Chain is left untouched.
 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
                                                SDValue &Chain) const {
   // Exactly one result value with exactly one use is required.
   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
     return false;
 
   SDValue NewChain = Chain;
   SDNode *User = *N->use_begin();
   switch (User->getOpcode()) {
   case ISD::CopyToReg: {
     // A glued copy is conservatively treated as unsafe for tail calls.
     const unsigned LastIdx = User->getNumOperands() - 1;
     if (User->getOperand(LastIdx).getValueType() == MVT::Glue)
       return false;
     NewChain = User->getOperand(0);
     break;
   }
   case ISD::FP_EXTEND:
     break;
   default:
     return false;
   }
 
   // The intermediate node must have at least one user, and every user must
   // be a return.
   if (User->use_empty())
     return false;
   for (SDNode *RetCandidate : User->uses())
     if (RetCandidate->getOpcode() != AArch64ISD::RET_FLAG)
       return false;
 
   Chain = NewChain;
   return true;
 }
 
 // Return whether an instruction can potentially be optimized to a tail
 // call. This will cause the optimizers to attempt to move, or duplicate,
 // return instructions to help enable tail call optimizations for this
 // instruction.
 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   // Only calls already marked as tail calls in the IR qualify.
   return CI->isTailCall();
 }
 
 /// Shared helper for the pre-/post-indexed queries: decompose the address
 /// computation \p Op (an ADD or SUB) into a base and a constant offset.
 ///
 /// \param Op     Candidate address arithmetic node.
 /// \param Base   Set to operand 0 of \p Op as soon as \p Op is an ADD or SUB
 ///               (even if the function goes on to return false).
 /// \param Offset Set to operand 1 of \p Op on success.
 /// \param AM     Not written by this helper.
 /// \param IsInc  Set on success: true for ADD, false for SUB.
 /// \returns true if operand 1 is a constant whose effective (sign-adjusted)
 ///          value fits a signed 9-bit immediate.
 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
                                                    SDValue &Offset,
                                                    ISD::MemIndexedMode &AM,
                                                    bool &IsInc,
                                                    SelectionDAG &DAG) const {
   const unsigned Opc = Op->getOpcode();
   const bool IsAdd = (Opc == ISD::ADD);
   if (!IsAdd && Opc != ISD::SUB)
     return false;
 
   Base = Op->getOperand(0);
 
   auto *Imm = dyn_cast<ConstantSDNode>(Op->getOperand(1));
   if (!Imm)
     return false;
 
   // Every indexed addressing mode instruction takes a signed 9-bit immediate.
   // For a SUB the effective displacement is the negated constant; negate in
   // unsigned arithmetic so that INT64_MIN does not overflow.
   int64_t Displacement = Imm->getSExtValue();
   if (!IsAdd)
     Displacement = -(uint64_t)Displacement;
   if (!isInt<9>(Displacement))
     return false;
 
   IsInc = IsAdd;
   Offset = Op->getOperand(1);
   return true;
 }
 
 /// Query whether the load or store \p N can use a pre-indexed addressing
 /// mode.
 ///
 /// \param N      Memory node to inspect; anything other than a LoadSDNode or
 ///               StoreSDNode is rejected.
 /// \param Base   Receives the base of the address computation.
 /// \param Offset Receives the constant offset operand.
 /// \param AM     Set to ISD::PRE_INC or ISD::PRE_DEC on success.
 /// \returns true if the node's base pointer decomposes via
 ///          getIndexedAddressParts().
 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                       SDValue &Offset,
                                                       ISD::MemIndexedMode &AM,
                                                       SelectionDAG &DAG) const {
   // Only the base pointer of the access is needed here.
   SDValue Ptr;
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
     Ptr = LD->getBasePtr();
   else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
     Ptr = ST->getBasePtr();
   else
     return false;
 
   // IsInc is written by getIndexedAddressParts() whenever it returns true.
   bool IsInc;
   if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
     return false;
   AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
   return true;
 }
 
 /// Query whether the load or store \p N, together with the separate address
 /// update \p Op, can use a post-indexed addressing mode.
 ///
 /// \param N      Memory node to inspect; anything other than a LoadSDNode or
 ///               StoreSDNode is rejected.
 /// \param Op     The ADD/SUB node that updates the address.
 /// \param Base   Receives the base of the address computation.
 /// \param Offset Receives the constant offset operand.
 /// \param AM     Set to ISD::POST_INC or ISD::POST_DEC on success.
 /// \returns true if \p Op decomposes via getIndexedAddressParts() and its
 ///          base is exactly the pointer accessed by \p N.
 bool AArch64TargetLowering::getPostIndexedAddressParts(
     SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
     ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
   // Only the base pointer of the access is needed here.
   SDValue Ptr;
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
     Ptr = LD->getBasePtr();
   else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
     Ptr = ST->getBasePtr();
   else
     return false;
 
   // IsInc is written by getIndexedAddressParts() whenever it returns true.
   bool IsInc;
   if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
     return false;
   // Post-indexing updates the base, so it's not a valid transform
   // if that's not the same as the load's pointer.
   if (Ptr != Base)
     return false;
   AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
   return true;
 }
 
 /// Custom result replacement for an f16 -> i16 BITCAST. The f16 source is
 /// inserted into the hsub sub-register of an otherwise undefined f32 value,
 /// that value is bitcast to i32, and the i32 is truncated to i16. Any other
 /// type combination is left alone (nothing is appended to \p Results).
 static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                   SelectionDAG &DAG) {
   SDValue Src = N->getOperand(0);
   const bool IsF16ToI16 =
       N->getValueType(0) == MVT::i16 && Src.getValueType() == MVT::f16;
   if (!IsF16ToI16)
     return;
 
   SDLoc DL(N);
   SDValue Undef = DAG.getUNDEF(MVT::i32);
   SDValue SubIdx = DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32);
   MachineSDNode *Widened = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
                                               MVT::f32, Undef, Src, SubIdx);
   SDValue AsI32 = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SDValue(Widened, 0));
   Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, AsI32));
 }
 
 /// Replace an across-lanes reduction node by first combining the two halves
 /// of its vector operand element-wise with \p InterOp and then applying the
 /// across-lanes operation \p AcrossOp to the half-width result. The single
 /// replacement value is appended to \p Results.
 static void ReplaceReductionResults(SDNode *N,
                                     SmallVectorImpl<SDValue> &Results,
                                     SelectionDAG &DAG, unsigned InterOp,
                                     unsigned AcrossOp) {
   SDLoc DL(N);
 
   // Only the low half's type is needed: both intermediate nodes use it.
   const EVT HalfVT = DAG.GetSplitDestVTs(N->getValueType(0)).first;
   std::pair<SDValue, SDValue> Halves = DAG.SplitVectorOperand(N, 0);
 
   SDValue Combined =
       DAG.getNode(InterOp, DL, HalfVT, Halves.first, Halves.second);
   Results.push_back(DAG.getNode(AcrossOp, DL, HalfVT, Combined));
 }
 
 /// Split the i128 value \p N into two i64 values and return them as
 /// {low 64 bits, high 64 bits}. The high part is obtained by a logical right
 /// shift by 64 followed by a truncate.
 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
   SDLoc DL(N);
   SDValue LowHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
   SDValue ShiftAmt = DAG.getConstant(64, DL, MVT::i64);
   SDValue Shifted = DAG.getNode(ISD::SRL, DL, MVT::i128, N, ShiftAmt);
   SDValue HighHalf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Shifted);
   return {LowHalf, HighHalf};
 }
 
 // Build a REG_SEQUENCE in the XSeqPairsClass register class that holds the
 // integer value V as an even/odd pair of X registers. On little-endian
 // targets the low 64 bits go into the even (sube64) register and the high 64
 // bits into the odd (subo64) register; on big-endian targets the halves are
 // exchanged.
 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
   SDLoc DL(V.getNode());
 
   SDValue Lo = DAG.getAnyExtOrTrunc(V, DL, MVT::i64);
   SDValue Shifted = DAG.getNode(ISD::SRL, DL, MVT::i128, V,
                                 DAG.getConstant(64, DL, MVT::i64));
   SDValue Hi = DAG.getAnyExtOrTrunc(Shifted, DL, MVT::i64);
 
   const bool IsBE = DAG.getDataLayout().isBigEndian();
   SDValue EvenVal = IsBE ? Hi : Lo;
   SDValue OddVal = IsBE ? Lo : Hi;
 
   // REG_SEQUENCE operands: register class, then (value, sub-register index)
   // pairs.
   const SDValue SeqOps[] = {
       DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, DL, MVT::i32),
       EvenVal, DAG.getTargetConstant(AArch64::sube64, DL, MVT::i32),
       OddVal, DAG.getTargetConstant(AArch64::subo64, DL, MVT::i32)};
   MachineSDNode *Seq =
       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, SeqOps);
   return SDValue(Seq, 0);
 }
 
 /// Custom result replacement for a 128-bit ATOMIC_CMP_SWAP.
 ///
 /// Node operands as used below: 0 = chain, 1 = pointer, 2 = compare value,
 /// 3 = store value. Three values are appended to \p Results: two i64 halves of
 /// the loaded value followed by the output chain.
 ///
 /// With LSE a CASP* machine instruction is selected according to the memory
 /// ordering; otherwise the CMP_SWAP_128 pseudo instruction is emitted.
 static void ReplaceCMP_SWAP_128Results(SDNode *N,
                                        SmallVectorImpl<SDValue> &Results,
                                        SelectionDAG &DAG,
                                        const AArch64Subtarget *Subtarget) {
   assert(N->getValueType(0) == MVT::i128 &&
          "AtomicCmpSwap on types less than 128 should be legal");
 
   if (Subtarget->hasLSE()) {
     // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
     // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
     SDValue Ops[] = {
         createGPRPairNode(DAG, N->getOperand(2)), // Compare value
         createGPRPairNode(DAG, N->getOperand(3)), // Store value
         N->getOperand(1), // Ptr
         N->getOperand(0), // Chain in
     };
 
     // Carry the original memory operand over to the machine node.
     MachineFunction &MF = DAG.getMachineFunction();
     MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
     MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
 
     // Pick the CASP variant matching the requested atomic ordering.
     unsigned Opcode;
     switch (MemOp[0]->getOrdering()) {
     case AtomicOrdering::Monotonic:
       Opcode = AArch64::CASPX;
       break;
     case AtomicOrdering::Acquire:
       Opcode = AArch64::CASPAX;
       break;
     case AtomicOrdering::Release:
       Opcode = AArch64::CASPLX;
       break;
     case AtomicOrdering::AcquireRelease:
     case AtomicOrdering::SequentiallyConsistent:
       Opcode = AArch64::CASPALX;
       break;
     default:
       llvm_unreachable("Unexpected ordering!");
     }
 
     // Result 0 is the untyped register pair, result 1 the chain.
     MachineSDNode *CmpSwap = DAG.getMachineNode(
         Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
     CmpSwap->setMemRefs(MemOp, MemOp + 1);
 
     // Extract the two i64 halves from the pair; the sub-register order is
     // swapped on big-endian targets, mirroring createGPRPairNode().
     unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
     if (DAG.getDataLayout().isBigEndian())
       std::swap(SubReg1, SubReg2);
     Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
                                                  SDValue(CmpSwap, 0)));
     Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
                                                  SDValue(CmpSwap, 0)));
     Results.push_back(SDValue(CmpSwap, 1)); // Chain out
     return;
   }
 
   // No LSE: split both i128 inputs into i64 halves and use the CMP_SWAP_128
   // pseudo. Operand order: pointer, compare lo/hi, new lo/hi, chain.
   auto Desired = splitInt128(N->getOperand(2), DAG);
   auto New = splitInt128(N->getOperand(3), DAG);
   SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
                    New.first,        New.second,    N->getOperand(0)};
   SDNode *CmpSwap = DAG.getMachineNode(
       AArch64::CMP_SWAP_128, SDLoc(N),
       DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
   MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
   cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
 
   // Results 0 and 1 are the two i64 values, result 3 is the chain; the i32
   // result 2 is not forwarded.
   Results.push_back(SDValue(CmpSwap, 0));
   Results.push_back(SDValue(CmpSwap, 1));
   Results.push_back(SDValue(CmpSwap, 3));
 }
 
 /// Target hook that custom-expands the results of node \p N, appending the
 /// replacement values to \p Results. Dispatches on the node's opcode; leaving
 /// \p Results empty hands the node back to the default handling. Opcodes not
 /// listed here are a programming error (llvm_unreachable).
 void AArch64TargetLowering::ReplaceNodeResults(
     SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Don't know how to custom expand this");
   case ISD::BITCAST:
     ReplaceBITCASTResults(N, Results, DAG);
     return;
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_SMAX:
   case ISD::VECREDUCE_SMIN:
   case ISD::VECREDUCE_UMAX:
   case ISD::VECREDUCE_UMIN:
     Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
     return;
 
   // Across-lanes reductions: combine the two vector halves with the matching
   // element-wise operation, then reduce the half-width result.
   case AArch64ISD::SADDV:
     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
     return;
   case AArch64ISD::UADDV:
     ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
     return;
   case AArch64ISD::SMINV:
     ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
     return;
   case AArch64ISD::UMINV:
     ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
     return;
   case AArch64ISD::SMAXV:
     ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
     return;
   case AArch64ISD::UMAXV:
     ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
     return;
   case ISD::FP_TO_UINT:
   case ISD::FP_TO_SINT:
     assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
     // Let normal code take care of it by not adding anything to Results.
     return;
   case ISD::ATOMIC_CMP_SWAP:
     ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
     return;
   }
 }
 
 /// Report whether the LOAD_STACK_GUARD node should be used. Android and
 /// Fuchsia defer to the generic TargetLowering answer; every other target
 /// opts in.
 bool AArch64TargetLowering::useLoadStackGuardNode() const {
   const bool DeferToGeneric =
       Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia();
   return DeferToGeneric ? TargetLowering::useLoadStackGuardNode() : true;
 }
 
 /// Minimum number of FDIVs sharing one divisor before they are turned into
 /// FMULs by the reciprocal.
 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
   // Three or more divisions by the same value make the reciprocal worthwhile.
   const unsigned MinSharedDivisions = 3;
   return MinSharedDivisions;
 }
 
 /// Choose how the type legalizer should treat vector type \p VT. The
 /// single-element types v1i8, v1i16, v1i32 and v1f32 are widened rather than
 /// promoted; everything else uses the generic preference.
 TargetLoweringBase::LegalizeTypeAction
 AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::v1i8:
   case MVT::v1i16:
   case MVT::v1i32:
   case MVT::v1f32:
     // Prefer widening (e.g. to v8i8, v4i16, v2i32) over promotion.
     return TypeWidenVector;
   default:
     return TargetLoweringBase::getPreferredVectorAction(VT);
   }
 }
 
 // Atomic stores narrower than 128 bits are already atomic. Wider ones are
 // doomed anyway, so they are left to the default libcall and the OS takes the
 // blame when things go wrong. Only the exactly-128-bit store is expanded in
 // IR.
 bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   Type *StoredTy = SI->getValueOperand()->getType();
   return StoredTy->getPrimitiveSizeInBits() == 128;
 }
 
 // Atomic loads narrower than 128 bits are already atomic. Wider ones are
 // doomed anyway, so they are left to the default libcall and the OS takes the
 // blame when things go wrong. Only the exactly-128-bit load is expanded, via
 // load-linked/store-conditional.
 TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   if (LI->getType()->getPrimitiveSizeInBits() == 128)
     return AtomicExpansionKind::LLSC;
   return AtomicExpansionKind::None;
 }
 
 // For the real atomic operations, ldxr/stxr are available up to 128 bits.
 // Anything wider is not expanded here. Within that limit the operation is
 // left intact only when LSE can handle it, i.e. it is not a Nand and is
 // narrower than 128 bits; otherwise it becomes an LL/SC loop.
 TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   const unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
   if (Bits > 128)
     return AtomicExpansionKind::None;
 
   // LSE has no Nand, and 128-bit operations are left to LL/SC.
   const bool IsNand = (AI->getOperation() == AtomicRMWInst::Nand);
   const bool HandledByLSE = !IsNand && Bits < 128 && Subtarget->hasLSE();
   return HandledByLSE ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC;
 }
 
 /// Decide whether a cmpxchg should be expanded in IR.
 ///
 /// With LSE the cmpxchg is kept intact for codegen. Without LSE it is
 /// expanded except at -O0: there, fast-regalloc cannot keep the required
 /// vregs live without spilling, and if the exchanged address is also on the
 /// stack close to the spill slot the monitor can be cleared every time so
 /// the operation never succeeds. At -O0 a late-expanded pseudo-instruction is
 /// used instead.
 bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
     AtomicCmpXchgInst *AI) const {
   return !Subtarget->hasLSE() && getTargetMachine().getOptLevel() != 0;
 }
 
 /// Emit a load-linked (exclusive load) of the value pointed to by \p Addr.
 ///
 /// \param Builder IR builder positioned where the load should be inserted.
 /// \param Addr    Pointer to the value; its pointee type is the result type.
 /// \param Ord     Acquire-or-stronger orderings select the acquiring
 ///                intrinsics (ldaxp / ldaxr) over ldxp / ldxr.
 /// \returns The loaded value, of the pointee type of \p Addr.
 Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                              AtomicOrdering Ord) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Type *PointeeTy = cast<PointerType>(Addr->getType())->getElementType();
   const bool Acquiring = isAcquireOrStronger(Ord);
 
   if (PointeeTy->getPrimitiveSizeInBits() == 128) {
     // i128 is not legal and intrinsics are not type-lowered, so the pair-load
     // intrinsic yields {i64, i64}; the halves are glued back together into a
     // single i128 below.
     Intrinsic::ID PairID =
         Acquiring ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
     Function *PairLoad = Intrinsic::getDeclaration(M, PairID);
 
     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
     Value *Pair = Builder.CreateCall(PairLoad, Addr, "lohi");
 
     Value *Lo = Builder.CreateExtractValue(Pair, 0, "lo");
     Value *Hi = Builder.CreateExtractValue(Pair, 1, "hi");
     Lo = Builder.CreateZExt(Lo, PointeeTy, "lo64");
     Hi = Builder.CreateZExt(Hi, PointeeTy, "hi64");
     Value *HiShifted = Builder.CreateShl(Hi, ConstantInt::get(PointeeTy, 64));
     return Builder.CreateOr(Lo, HiShifted, "val64");
   }
 
   // Everything else uses the single-register intrinsic, overloaded on the
   // pointer type; its result is truncated or bitcast to the pointee type.
   Type *OverloadTys[] = { Addr->getType() };
   Intrinsic::ID LoadID =
       Acquiring ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
   Function *ExclLoad = Intrinsic::getDeclaration(M, LoadID, OverloadTys);
 
   Value *Loaded = Builder.CreateCall(ExclLoad, Addr);
   return Builder.CreateTruncOrBitCast(Loaded, PointeeTy);
 }
 
 /// Emit a call to the aarch64.clrex intrinsic at the builder's current
 /// insertion point.
 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
     IRBuilder<> &Builder) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Function *ClearExclusive =
       Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex);
   Builder.CreateCall(ClearExclusive);
 }
 
 /// Emit a store-conditional (exclusive store) of \p Val to \p Addr.
 ///
 /// \param Builder IR builder positioned where the store should be inserted.
 /// \param Val     Value to store.
 /// \param Addr    Destination pointer.
 /// \param Ord     Release-or-stronger orderings select the releasing
 ///                intrinsics (stlxp / stlxr) over stxp / stxr.
 /// \returns The call to the store intrinsic (its result is the status value
 ///          produced by the intrinsic).
 Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                                                    Value *Val, Value *Addr,
                                                    AtomicOrdering Ord) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   const bool Releasing = isReleaseOrStronger(Ord);
 
   if (Val->getType()->getPrimitiveSizeInBits() == 128) {
     // Intrinsics need legal types, so the i128 form takes its value as two
     // i64 parameters; split Val accordingly before the call.
     Intrinsic::ID PairID =
         Releasing ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
     Function *PairStore = Intrinsic::getDeclaration(M, PairID);
     Type *I64Ty = Type::getInt64Ty(M->getContext());
 
     Value *Lo = Builder.CreateTrunc(Val, I64Ty, "lo");
     Value *Shifted = Builder.CreateLShr(Val, 64);
     Value *Hi = Builder.CreateTrunc(Shifted, I64Ty, "hi");
     Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
     return Builder.CreateCall(PairStore, {Lo, Hi, Addr});
   }
 
   // Single-register form, overloaded on the pointer type. The value is
   // zero-extended or bitcast to whatever the intrinsic's first parameter is.
   Intrinsic::ID StoreID =
       Releasing ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
   Type *OverloadTys[] = { Addr->getType() };
   Function *ExclStore = Intrinsic::getDeclaration(M, StoreID, OverloadTys);
 
   Type *ValParamTy = ExclStore->getFunctionType()->getParamType(0);
   Value *Widened = Builder.CreateZExtOrBitCast(Val, ValParamTy);
   return Builder.CreateCall(ExclStore, {Widened, Addr});
 }
 
 /// Returns true if an argument of type \p Ty must be assigned to consecutive
 /// registers. Only array types qualify; \p CallConv and \p isVarArg are not
 /// consulted.
 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
   return Ty->isArrayTy();
 }
 
 /// Always answers false, regardless of context or value type: AArch64 does
 /// not ask for normalization to a select sequence.
 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
                                                             EVT) const {
   return false;
 }
 
 /// Emit IR that computes the address \p Offset elements past the thread
 /// pointer (obtained through the llvm.thread.pointer intrinsic) and return
 /// it cast to a pointer-to-i8* in address space 0.
 static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
   Module *M = IRB.GetInsertBlock()->getParent()->getParent();
   Function *ThreadPointerFunc =
       Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
 
   Value *ThreadPtr = IRB.CreateCall(ThreadPointerFunc);
   Value *SlotAddr = IRB.CreateConstGEP1_32(ThreadPtr, Offset);
   Type *SlotPtrTy = Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0);
   return IRB.CreatePointerCast(SlotAddr, SlotPtrTy);
 }
 
 /// Return the IR value that locates the stack guard. Android and Fuchsia
 /// keep it at a fixed offset from the thread pointer; other targets use the
 /// generic TargetLowering implementation.
 Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
   const bool IsAndroid = Subtarget->isTargetAndroid();
   if (IsAndroid || Subtarget->isTargetFuchsia()) {
     // Android: fixed TLS slot for the stack cookie, see TLS_SLOT_STACK_GUARD
     // in
     // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
     // Fuchsia: <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this
     // value.
     return UseTlsOffset(IRB, IsAndroid ? 0x28 : -0x10);
   }
 
   return TargetLowering::getIRStackGuard(IRB);
 }
 
 /// Return the IR value that locates the SafeStack (unsafe stack) pointer.
 /// Android and Fuchsia keep it at a fixed offset from the thread pointer;
 /// other targets use the generic TargetLowering implementation.
 Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
   const bool IsAndroid = Subtarget->isTargetAndroid();
   if (IsAndroid || Subtarget->isTargetFuchsia()) {
     // Android: fixed TLS slot for the SafeStack pointer, see
     // TLS_SLOT_SAFESTACK in
     // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
     // Fuchsia: <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
     return UseTlsOffset(IRB, IsAndroid ? 0x48 : -0x8);
   }
 
   return TargetLowering::getSafeStackPointerLocation(IRB);
 }
 
 /// Decide whether sinking the 'and' \p AndI next to its compare-with-zero use
 /// is worthwhile.
 ///
 /// Only a single-bit constant mask qualifies, because then and/cmp/br is
 /// likely to fold into one tbz instruction. Sinking might help in other cases
 /// too, but that would require checking that the cmp does not get folded into
 /// the br to form a cbz.
 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
     const Instruction &AndI) const {
   const auto *MaskC = dyn_cast<ConstantInt>(AndI.getOperand(1));
   return MaskC && MaskC->getValue().isPowerOf2();
 }
 
 /// Mark the function containing \p Entry as using split callee-saved
 /// registers by setting IsSplitCSR in its AArch64FunctionInfo.
 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
   MachineFunction *MF = Entry->getParent();
   MF->getInfo<AArch64FunctionInfo>()->setIsSplitCSR(true);
 }
 
 /// For every callee-saved register that is to be preserved via copies, insert
 /// a COPY into a fresh virtual register at the start of \p Entry and a COPY
 /// back into the physical register before the first terminator of each block
 /// in \p Exits. Does nothing when the register info reports no such
 /// registers.
 void AArch64TargetLowering::insertCopiesSplitCSR(
     MachineBasicBlock *Entry,
     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
   const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
   // Zero-terminated list of registers (see the loop condition below).
   const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
   if (!IStart)
     return;
 
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
   MachineBasicBlock::iterator MBBI = Entry->begin();
   for (const MCPhysReg *I = IStart; *I; ++I) {
     // Only 64-bit general-purpose and 64-bit FP registers are expected here.
     const TargetRegisterClass *RC = nullptr;
     if (AArch64::GPR64RegClass.contains(*I))
       RC = &AArch64::GPR64RegClass;
     else if (AArch64::FPR64RegClass.contains(*I))
       RC = &AArch64::FPR64RegClass;
     else
       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
 
     unsigned NewVR = MRI->createVirtualRegister(RC);
     // Create copy from CSR to a virtual register.
     // FIXME: this currently does not emit CFI pseudo-instructions, it works
     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
     // nounwind. If we want to generalize this later, we may need to emit
     // CFI pseudo-instructions.
     assert(Entry->getParent()->getFunction().hasFnAttribute(
                Attribute::NoUnwind) &&
            "Function should be nounwind in insertCopiesSplitCSR!");
     // The physical register must be live-in to the entry block for the copy
     // below to read it.
     Entry->addLiveIn(*I);
     BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
         .addReg(*I);
 
     // Insert the copy-back instructions right before the terminator.
     for (auto *Exit : Exits)
       BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
               TII->get(TargetOpcode::COPY), *I)
           .addReg(NewVR);
   }
 }
 
 /// Report whether an integer division of type \p VT should be treated as
 /// cheap for a function with attributes \p Attr.
 ///
 /// Integer division is expensive on AArch64. When aggressively optimizing for
 /// size (MinSize), though, a div instruction is usually smaller than the
 /// alternative sequence, so it is reported as cheap. Vector division is the
 /// exception: there is no vector integer divide, so keeping the division
 /// would force scalarization and lose even on size, whereas the alternative
 /// sequence can stay in vector form.
 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
   if (VT.isVector())
     return false;
   return Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
 }
 
 /// Aggressive FMA fusion is enabled only for floating-point types, and only
 /// when the subtarget reports the aggressive-FMA feature.
 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
   return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
 }
 
 /// Size of va_list in bits. On Darwin and Windows it is a single pointer;
 /// elsewhere it is three pointers plus two 32-bit fields.
 unsigned
 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
   const unsigned PtrBits = getPointerTy(DL).getSizeInBits();
   const bool SinglePointer =
       Subtarget->isTargetDarwin() || Subtarget->isTargetWindows();
   return SinglePointer ? PtrBits : 3 * PtrBits + 2 * 32;
 }
 
 /// Final lowering step for \p MF: have the frame info compute the maximum
 /// call frame size first, then run the generic TargetLoweringBase
 /// finalization.
 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
   MF.getFrameInfo().computeMaxCallFrameSize(MF);
   TargetLoweringBase::finalizeLowering(MF);
 }
Index: head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h	(revision 344055)
+++ head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h	(revision 344056)
@@ -1,711 +1,712 @@
 //==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file defines the interfaces that AArch64 uses to lower LLVM code into a
 // selection DAG.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
 #define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H
 
 #include "AArch64.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Instruction.h"
 
 namespace llvm {
 
 namespace AArch64ISD {
 
 // AArch64-specific SelectionDAG node opcodes.
 //
 // Numbering starts at ISD::BUILTIN_OP_END (FIRST_NUMBER) and continues
 // implicitly, so the order of the enumerators determines their values. The
 // post-increment load/store group at the end restarts the numbering at
 // ISD::FIRST_TARGET_MEMORY_OPCODE.
 enum NodeType : unsigned {
   FIRST_NUMBER = ISD::BUILTIN_OP_END,
   WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
   CALL,         // Function call.
 
   // Produces the full sequence of instructions for getting the thread pointer
   // offset of a variable into X0, using the TLSDesc model.
   TLSDESC_CALLSEQ,
   ADRP,     // Page address of a TargetGlobalAddress operand.
   ADDlow,   // Add the low 12 bits of a TargetGlobalAddress operand.
   LOADgot,  // Load from automatically generated descriptor (e.g. Global
             // Offset Table, TLS record).
   RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand.
   BRCOND,   // Conditional branch instruction; "b.cond".
 
   // Conditional move instructions.
   CSEL,
   FCSEL,
   CSINV, // Conditional select invert.
   CSNEG, // Conditional select negate.
   CSINC, // Conditional select increment.
 
   // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on
   // ELF.
   THREAD_POINTER,
 
   // adc, sbc instructions
   ADC,
   SBC,
 
   // Arithmetic instructions which write flags.
   ADDS,
   SUBS,
   ADCS,
   SBCS,
   ANDS,
 
   // Conditional compares. Operands: left,right,falsecc,cc,flags
   CCMP,
   CCMN,
   FCCMP,
 
   // Floating point comparison
   FCMP,
 
   // Scalar extract
   EXTR,
 
   // Scalar-to-vector duplication
   DUP,
   DUPLANE8,
   DUPLANE16,
   DUPLANE32,
   DUPLANE64,
 
   // Vector immediate moves
   MOVI,
   MOVIshift,
   MOVIedit,
   MOVImsl,
   FMOV,
   MVNIshift,
   MVNImsl,
 
   // Vector immediate ops
   BICi,
   ORRi,
 
   // Vector bit select: similar to ISD::VSELECT but not all bits within an
   // element must be identical.
   BSL,
 
   // Vector arithmetic negation
   NEG,
 
   // Vector shuffles
   ZIP1,
   ZIP2,
   UZP1,
   UZP2,
   TRN1,
   TRN2,
   REV16,
   REV32,
   REV64,
   EXT,
 
   // Vector shift by scalar
   VSHL,
   VLSHR,
   VASHR,
 
   // Vector shift by scalar (again)
   SQSHL_I,
   UQSHL_I,
   SQSHLU_I,
   SRSHR_I,
   URSHR_I,
 
   // Vector comparisons
   CMEQ,
   CMGE,
   CMGT,
   CMHI,
   CMHS,
   FCMEQ,
   FCMGE,
   FCMGT,
 
   // Vector zero comparisons
   CMEQz,
   CMGEz,
   CMGTz,
   CMLEz,
   CMLTz,
   FCMEQz,
   FCMGEz,
   FCMGTz,
   FCMLEz,
   FCMLTz,
 
   // Vector across-lanes addition
   // Only the lower result lane is defined.
   SADDV,
   UADDV,
 
   // Vector across-lanes min/max
   // Only the lower result lane is defined.
   SMINV,
   UMINV,
   SMAXV,
   UMAXV,
 
   // Vector bitwise negation
   NOT,
 
   // Vector bitwise selection
   BIT,
 
   // Compare-and-branch
   CBZ,
   CBNZ,
   TBZ,
   TBNZ,
 
   // Tail calls
   TC_RETURN,
 
   // Custom prefetch handling
   PREFETCH,
 
   // {s|u}int to FP within a FP register.
   SITOF,
   UITOF,
 
   /// Natural vector cast. ISD::BITCAST is not natural in the big-endian
   /// world w.r.t vectors; which causes additional REV instructions to be
   /// generated to compensate for the byte-swapping. But sometimes we do
   /// need to re-interpret the data in SIMD vector registers in big-endian
   /// mode without emitting such REV instructions.
   NVCAST,
 
   SMULL,
   UMULL,
 
   // Reciprocal estimates and steps.
   FRECPE, FRECPS,
   FRSQRTE, FRSQRTS,
 
   // NEON Load/Store with post-increment base updates.
   // Numbering restarts here at ISD::FIRST_TARGET_MEMORY_OPCODE.
   LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
   LD3post,
   LD4post,
   ST2post,
   ST3post,
   ST4post,
   LD1x2post,
   LD1x3post,
   LD1x4post,
   ST1x2post,
   ST1x3post,
   ST1x4post,
   LD1DUPpost,
   LD2DUPpost,
   LD3DUPpost,
   LD4DUPpost,
   LD1LANEpost,
   LD2LANEpost,
   LD3LANEpost,
   LD4LANEpost,
   ST2LANEpost,
   ST3LANEpost,
   ST4LANEpost
 };
 
 } // end namespace AArch64ISD
 
 namespace {
 
 // Reports whether node N can be assumed to define a 32-bit value whose upper
 // half is zero. Any instruction that defines a 32-bit result zeros out the
 // high half of the register, so every 32-bit operation zero-extends up to 64
 // bits, with three exceptions: a truncate (which can be lowered to
 // EXTRACT_SUBREG), an EXTRACT_SUBREG itself, and a CopyFromReg (which may be
 // copying from a truncate).
 // FIXME: X86 also checks for CMOV here. Do we need something similar?
 static inline bool isDef32(const SDNode &N) {
   const unsigned Opcode = N.getOpcode();
   if (Opcode == ISD::TRUNCATE || Opcode == TargetOpcode::EXTRACT_SUBREG)
     return false;
   return Opcode != ISD::CopyFromReg;
 }
 
 } // end anonymous namespace
 
 class AArch64Subtarget;
 class AArch64TargetMachine;
 
 class AArch64TargetLowering : public TargetLowering {
 public:
   explicit AArch64TargetLowering(const TargetMachine &TM,
                                  const AArch64Subtarget &STI);
 
   /// Selects the correct CCAssignFn for a given CallingConvention value.
   CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
 
   /// Selects the correct return-value CCAssignFn for a given CallingConvention
   /// value.
   CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC) const;
 
   /// Determine which bits of Op are known to be either zero or one and return
   /// them in Known.
   void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
                                      const APInt &DemandedElts,
                                      const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;
 
   bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
                                     TargetLoweringOpt &TLO) const override;
 
   MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
 
   /// Returns true if the target allows unaligned memory accesses of the
   /// specified type.
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
                                       unsigned Align = 1,
                                       bool *Fast = nullptr) const override;
 
   /// Provide custom lowering hooks for some operations.
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
   const char *getTargetNodeName(unsigned Opcode) const override;
 
   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
   /// Returns true if a cast between SrcAS and DestAS is a noop.
   /// This target answers true for every pair; neither argument is inspected.
   bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
     // Addrspacecasts are always noops.
     return true;
   }
 
   /// This method returns a target specific FastISel object, or null if the
   /// target does not support "fast" ISel.
   FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                            const TargetLibraryInfo *libInfo) const override;
 
   bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
 
   bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
 
   /// Return true if the given shuffle mask can be codegen'd directly, or if it
   /// should be stack expanded.
   bool isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
 
   /// Return the ISD::SETCC ValueType.
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                          EVT VT) const override;
 
   SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
   MachineBasicBlock *EmitF128CSEL(MachineInstr &MI,
                                   MachineBasicBlock *BB) const;
 
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *MBB) const override;
 
   bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
                           MachineFunction &MF,
                           unsigned Intrinsic) const override;
 
   bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
                              EVT NewVT) const override;
 
   bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
   bool isTruncateFree(EVT VT1, EVT VT2) const override;
 
   bool isProfitableToHoist(Instruction *I) const override;
 
   bool isZExtFree(Type *Ty1, Type *Ty2) const override;
   bool isZExtFree(EVT VT1, EVT VT2) const override;
   bool isZExtFree(SDValue Val, EVT VT2) const override;
 
   bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
 
   /// Reports 4 as the largest interleave factor this target supports.
   unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
 
   bool lowerInterleavedLoad(LoadInst *LI,
                             ArrayRef<ShuffleVectorInst *> Shuffles,
                             ArrayRef<unsigned> Indices,
                             unsigned Factor) const override;
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
   bool shouldConsiderGEPOffsetSplit() const override;
 
   EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                           bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
                           MachineFunction &MF) const override;
 
   /// Return true if the addressing mode represented by AM is legal for this
   /// target, for a load/store of the specified type.
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS,
                              Instruction *I = nullptr) const override;
 
   /// Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
   /// If the AM is supported, the return value must be >= 0.
   /// If the AM is not supported, it returns a negative value.
   int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                            unsigned AS) const override;
 
   /// Return true if an FMA operation is faster than a pair of fmul and fadd
   /// instructions. fmuladd intrinsics will be expanded to FMAs when this method
   /// returns true, otherwise fmuladd is expanded to fmul + fadd.
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
 
   const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
 
   /// Returns false if N is a bit extraction pattern of (X >> C) & Mask.
-  bool isDesirableToCommuteWithShift(const SDNode *N) const override;
+  bool isDesirableToCommuteWithShift(const SDNode *N,
+                                     CombineLevel Level) const override;
 
   /// Returns true if it is beneficial to convert a load of a constant
   /// to just the constant itself.
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                          Type *Ty) const override;
 
   /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
   /// with this index.
   bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                unsigned Index) const override;
 
   Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                         AtomicOrdering Ord) const override;
   Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                               Value *Addr, AtomicOrdering Ord) const override;
 
   void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
 
   TargetLoweringBase::AtomicExpansionKind
   shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
   bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
   TargetLoweringBase::AtomicExpansionKind
   shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
 
   bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
 
   bool useLoadStackGuardNode() const override;
   TargetLoweringBase::LegalizeTypeAction
   getPreferredVectorAction(EVT VT) const override;
 
   /// If the target has a standard location for the stack protector cookie,
   /// returns the address of that location. Otherwise, returns nullptr.
   Value *getIRStackGuard(IRBuilder<> &IRB) const override;
 
   /// If the target has a standard location for the unsafe stack pointer,
   /// returns the address of that location. Otherwise, returns nullptr.
   Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
 
   /// If a physical register, this returns the register that receives the
   /// exception address on entry to an EH pad.
   /// Always AArch64::X0 here; PersonalityFn is not consulted.
   unsigned
   getExceptionPointerRegister(const Constant *PersonalityFn) const override {
     // FIXME: This is a guess. Has this been defined yet?
     return AArch64::X0;
   }
 
   /// If a physical register, this returns the register that receives the
   /// exception typeid on entry to a landing pad.
   /// Always AArch64::X1 here; PersonalityFn is not consulted.
   unsigned
   getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
     // FIXME: This is a guess. Has this been defined yet?
     return AArch64::X1;
   }
 
   bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
 
   bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                         const SelectionDAG &DAG) const override {
     // Merging is unrestricted unless the function carries the
     // noimplicitfloat attribute. AddressSpace is not consulted.
     const auto &Fn = DAG.getMachineFunction().getFunction();
     if (!Fn.hasFnAttribute(Attribute::NoImplicitFloat))
       return true;
 
     // With noimplicitfloat, do not merge up to the size of a floating-point
     // value: cap the merged store at 64 bits.
     return MemVT.getSizeInBits() <= 64;
   }
 
   /// Reports cttz as cheap to speculate; unconditionally true on this target.
   bool isCheapToSpeculateCttz() const override {
     return true;
   }
 
   /// Reports ctlz as cheap to speculate; unconditionally true on this target.
   bool isCheapToSpeculateCtlz() const override {
     return true;
   }
 
   bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
 
   bool hasAndNotCompare(SDValue V) const override {
     // bics is usable for any scalar integer, so only the value's type matters.
     const EVT Ty = V.getValueType();
     return Ty.isScalarInteger();
   }
 
   bool hasAndNot(SDValue Y) const override {
     const EVT Ty = Y.getValueType();
 
     // Vectors: the vector 'bic' applies once the type is at least 64 bits
     // wide.
     if (Ty.isVector())
       return Ty.getSizeInBits() >= 64;
 
     // Scalars follow the same rule as the and-not compare.
     return hasAndNotCompare(Y);
   }
 
   bool shouldTransformSignedTruncationCheck(EVT XVT,
                                             unsigned KeptBits) const override {
     // For vectors, we don't have a preference.
     if (XVT.isVector())
       return false;
 
     // Accepts exactly i8, i16, i32 and i64.
     auto IsAcceptedWidth = [](EVT VT) -> bool {
       if (VT == MVT::i8 || VT == MVT::i16)
         return true;
       return VT == MVT::i32 || VT == MVT::i64;
     };
 
     if (!IsAcceptedWidth(XVT))
       return false;
 
     // The kept-bits type must be byte/word/dword, which is what SXT supports.
     // XVT will be larger than that type.
     return IsAcceptedWidth(MVT::getIntegerVT(KeptBits));
   }
 
   bool hasBitPreservingFPLogic(EVT VT) const override {
     // Only f32 and f64 are reported as having bit-preserving FP logic.
     // FIXME: Is this always true? It should be true for vectors at least.
     if (VT == MVT::f32)
       return true;
     return VT == MVT::f64;
   }
 
   bool supportSplitCSR(MachineFunction *MF) const override {
     // Split CSR is only supported for nounwind functions that use the
     // CXX_FAST_TLS calling convention.
     const auto &Fn = MF->getFunction();
     if (Fn.getCallingConv() != CallingConv::CXX_FAST_TLS)
       return false;
     return Fn.hasFnAttribute(Attribute::NoUnwind);
   }
   void initializeSplitCSR(MachineBasicBlock *Entry) const override;
   void insertCopiesSplitCSR(
       MachineBasicBlock *Entry,
       const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
 
   /// Reports swifterror support; unconditionally true on this target.
   bool supportSwiftError() const override {
     return true;
   }
 
   /// Enable aggressive FMA fusion on targets that want it.
   bool enableAggressiveFMAFusion(EVT VT) const override;
 
   /// Returns the size of the platform's va_list object.
   unsigned getVaListSizeInBits(const DataLayout &DL) const override;
 
   /// Returns true if \p VecTy is a legal interleaved access type. This
   /// function checks the vector element type and the overall width of the
   /// vector.
   bool isLegalInterleavedAccessType(VectorType *VecTy,
                                     const DataLayout &DL) const;
 
   /// Returns the number of interleaved accesses that will be generated when
   /// lowering accesses of the given type.
   unsigned getNumInterleavedAccesses(VectorType *VecTy,
                                      const DataLayout &DL) const;
 
   MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
 
   bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
                                                  CallingConv::ID CallConv,
                                                  bool isVarArg) const override;
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
   const AArch64Subtarget *Subtarget;
 
   bool isExtFreeImpl(const Instruction *Ext) const override;
 
   void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
   void addDRTypeForNEON(MVT VT);
   void addQRTypeForNEON(MVT VT);
 
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
                                const SmallVectorImpl<ISD::InputArg> &Ins,
                                const SDLoc &DL, SelectionDAG &DAG,
                                SmallVectorImpl<SDValue> &InVals) const override;
 
   SDValue LowerCall(CallLoweringInfo & /*CLI*/,
                     SmallVectorImpl<SDValue> &InVals) const override;
 
   SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                           CallingConv::ID CallConv, bool isVarArg,
                           const SmallVectorImpl<ISD::InputArg> &Ins,
                           const SDLoc &DL, SelectionDAG &DAG,
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                           SDValue ThisVal) const;
 
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
 
   bool isEligibleForTailCallOptimization(
       SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
       const SmallVectorImpl<ISD::OutputArg> &Outs,
       const SmallVectorImpl<SDValue> &OutVals,
       const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
 
   /// Finds the incoming stack arguments which overlap the given fixed stack
   /// object and incorporates their load into the current chain. This prevents
   /// an upcoming store from clobbering the stack argument before it's used.
   SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
                               MachineFrameInfo &MFI, int ClobberedFI) const;
 
   bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
 
   void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL,
                            SDValue &Chain) const;
 
   bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
                       bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       LLVMContext &Context) const override;
 
   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
                       SelectionDAG &DAG) const override;
 
   SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
                         unsigned Flag) const;
   SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG,
                         unsigned Flag) const;
   SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG,
                         unsigned Flag) const;
   SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
                         unsigned Flag) const;
   template <class NodeTy>
   SDValue getGOT(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
   template <class NodeTy>
   SDValue getAddrLarge(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
   template <class NodeTy>
   SDValue getAddr(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const;
   SDValue LowerADDROFRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL,
                                  SelectionDAG &DAG) const;
   SDValue LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, SDValue RHS,
                          SDValue TVal, SDValue FVal, const SDLoc &dl,
                          SelectionDAG &DAG) const;
   SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
                         RTLIB::Libcall Call) const;
   SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
                                          SDValue &Size,
                                          SelectionDAG &DAG) const;
 
   SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
                         SmallVectorImpl<SDNode *> &Created) const override;
   SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                           int &ExtraSteps, bool &UseOneConst,
                           bool Reciprocal) const override;
   SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                            int &ExtraSteps) const override;
   unsigned combineRepeatedFPDivisors() const override;
 
   ConstraintType getConstraintType(StringRef Constraint) const override;
   unsigned getRegisterByName(const char* RegName, EVT VT,
                              SelectionDAG &DAG) const override;
 
   /// Examine constraint string and operand type and determine a weight value.
   /// The operand object must already have been set up with the operand type.
   ConstraintWeight
   getSingleConstraintMatchWeight(AsmOperandInfo &info,
                                  const char *constraint) const override;
 
   std::pair<unsigned, const TargetRegisterClass *>
   getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                StringRef Constraint, MVT VT) const override;
 
   const char *LowerXConstraint(EVT ConstraintVT) const override;
 
   void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                                     std::vector<SDValue> &Ops,
                                     SelectionDAG &DAG) const override;
 
   unsigned getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
     // "Q" is the only memory constraint handled here; everything else is left
     // to the generic implementation.
     // FIXME: clang has code for 'Ump', 'Utf', 'Usa', and 'Ush' but these are
     //        followed by llvm_unreachable so we'll leave them unimplemented in
     //        the backend for now.
     if (ConstraintCode != "Q")
       return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
     return InlineAsm::Constraint_Q;
   }
 
   bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
   bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
   bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
                               ISD::MemIndexedMode &AM, bool &IsInc,
                               SelectionDAG &DAG) const;
   bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
                                  ISD::MemIndexedMode &AM,
                                  SelectionDAG &DAG) const override;
   bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
                                   SDValue &Offset, ISD::MemIndexedMode &AM,
                                   SelectionDAG &DAG) const override;
 
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
 
   bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
 
   void finalizeLowering(MachineFunction &MF) const override;
 };
 
 namespace AArch64 {
 // Free-function factory for a FastISel object; declared here and defined
 // outside this header.
 // NOTE(review): presumably what AArch64TargetLowering::createFastISel
 // forwards to -- confirm in the .cpp.
 FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                          const TargetLibraryInfo *libInfo);
 } // end namespace AArch64
 
 } // end namespace llvm
 
 #endif
Index: head/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- head/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp	(revision 344055)
+++ head/contrib/llvm/lib/Target/ARM/ARMISelLowering.cpp	(revision 344056)
@@ -1,14994 +1,15011 @@
 //===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file defines the interfaces that ARM uses to lower LLVM code into a
 // selection DAG.
 //
 //===----------------------------------------------------------------------===//
 
 #include "ARMISelLowering.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMBaseRegisterInfo.h"
 #include "ARMCallingConv.h"
 #include "ARMConstantPoolValue.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMPerfectShuffle.h"
 #include "ARMRegisterInfo.h"
 #include "ARMSelectionDAGInfo.h"
 #include "ARMSubtarget.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "Utils/ARMBaseInfo.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/User.h"
 #include "llvm/IR/Value.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSchedule.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MachineValueType.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
 #include <cstdlib>
 #include <iterator>
 #include <limits>
 #include <string>
 #include <tuple>
 #include <utility>
 #include <vector>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "arm-isel"
 
 // Statistics counters declared for this file.
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
 STATISTIC(NumConstpoolPromoted,
   "Number of constants with their storage promoted into constant pools");
 
 // Hidden flag "arm-interworking"; initial value is true.
 static cl::opt<bool>
 ARMInterworking("arm-interworking", cl::Hidden,
   cl::desc("Enable / disable ARM interworking (for debugging only)"),
   cl::init(true));
 
 // Hidden flags for constant-pool promotion: an on/off switch (initially off)
 // and two unsigned limits, "-max-size" (initially 64) and "-max-total"
 // (initially 128).
 static cl::opt<bool> EnableConstpoolPromotion(
     "arm-promote-constant", cl::Hidden,
     cl::desc("Enable / disable promotion of unnamed_addr constants into "
              "constant pools"),
     cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
 static cl::opt<unsigned> ConstpoolPromotionMaxSize(
     "arm-promote-constant-max-size", cl::Hidden,
     cl::desc("Maximum size of constant to promote into a constant pool"),
     cl::init(64));
 static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
     "arm-promote-constant-max-total", cl::Hidden,
     cl::desc("Maximum size of ALL constants to promote into a constant pool"),
     cl::init(128));
 
 // The APCS parameter registers: R0 through R3, in that order.
 static const MCPhysReg GPRArgRegs[] = {
   ARM::R0, ARM::R1, ARM::R2, ARM::R3
 };
 
 /// Set up the operation actions for the NEON vector type \p VT.
 ///
 /// Loads and stores are promoted to \p PromotedLdStVT, and for integer vectors
 /// AND/OR/XOR are promoted to \p PromotedBitwiseVT, whenever \p VT differs
 /// from the respective promoted type.
 void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
                                        MVT PromotedBitwiseVT) {
   const MVT EltVT = VT.getVectorElementType();
   const bool IsIntVT = VT.isInteger();
 
   if (VT != PromotedLdStVT) {
     for (auto MemOp : {ISD::LOAD, ISD::STORE}) {
       setOperationAction(MemOp, VT, Promote);
       AddPromotedToType(MemOp, VT, PromotedLdStVT);
     }
   }
 
   // Vector compares are custom lowered for every element type except f64.
   if (EltVT != MVT::f64)
     setOperationAction(ISD::SETCC, VT, Custom);
 
   for (auto EltOp : {ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT})
     setOperationAction(EltOp, VT, Custom);
 
   // Int <-> FP conversions: Custom for i32 elements, Expand for the rest.
   const LegalizeAction ConvAction = (EltVT == MVT::i32) ? Custom : Expand;
   for (auto ConvOp : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT,
                       ISD::FP_TO_UINT})
     setOperationAction(ConvOp, VT, ConvAction);
 
   for (auto Opcode : {ISD::BUILD_VECTOR, ISD::VECTOR_SHUFFLE})
     setOperationAction(Opcode, VT, Custom);
   for (auto Opcode : {ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR})
     setOperationAction(Opcode, VT, Legal);
   for (auto Opcode : {ISD::SELECT, ISD::SELECT_CC, ISD::VSELECT,
                       ISD::SIGN_EXTEND_INREG})
     setOperationAction(Opcode, VT, Expand);
 
   if (IsIntVT) {
     for (auto ShiftOp : {ISD::SHL, ISD::SRA, ISD::SRL})
       setOperationAction(ShiftOp, VT, Custom);
 
     // Promote all bit-wise operations.
     if (VT != PromotedBitwiseVT) {
       for (auto BitOp : {ISD::AND, ISD::OR, ISD::XOR}) {
         setOperationAction(BitOp, VT, Promote);
         AddPromotedToType(BitOp, VT, PromotedBitwiseVT);
       }
     }
   }
 
   // Neon does not support vector divide/remainder operations.
   for (auto DivRemOp : {ISD::SDIV, ISD::UDIV, ISD::FDIV, ISD::SREM, ISD::UREM,
                         ISD::FREM})
     setOperationAction(DivRemOp, VT, Expand);
 
   // ABS and integer min/max are marked Legal only for non-floating-point
   // types other than v2i64 and v1i64.
   if (VT.isFloatingPoint() || VT == MVT::v2i64 || VT == MVT::v1i64)
     return;
   for (auto Opcode : {ISD::ABS, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
     setOperationAction(Opcode, VT, Legal);
 }
 
 /// Register \p VT in ARM::DPRRegClass and configure it through
 /// addTypeForNEON, promoting loads/stores to f64 and bitwise ops to v2i32.
 void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
   const MVT LdStVT = MVT::f64;
   const MVT BitwiseVT = MVT::v2i32;
 
   addRegisterClass(VT, &ARM::DPRRegClass);
   addTypeForNEON(VT, LdStVT, BitwiseVT);
 }
 
 /// Register \p VT in ARM::DPairRegClass and configure it through
 /// addTypeForNEON, promoting loads/stores to v2f64 and bitwise ops to v4i32.
 void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
   const MVT LdStVT = MVT::v2f64;
   const MVT BitwiseVT = MVT::v4i32;
 
   addRegisterClass(VT, &ARM::DPairRegClass);
   addTypeForNEON(VT, LdStVT, BitwiseVT);
 }
 
 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
                                      const ARMSubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
   RegInfo = Subtarget->getRegisterInfo();
   Itins = Subtarget->getInstrItineraryData();
 
   setBooleanContents(ZeroOrOneBooleanContent);
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetIOS() &&
       !Subtarget->isTargetWatchOS()) {
     bool IsHFTarget = TM.Options.FloatABIType == FloatABI::Hard;
     for (int LCID = 0; LCID < RTLIB::UNKNOWN_LIBCALL; ++LCID)
       setLibcallCallingConv(static_cast<RTLIB::Libcall>(LCID),
                             IsHFTarget ? CallingConv::ARM_AAPCS_VFP
                                        : CallingConv::ARM_AAPCS);
   }
 
   if (Subtarget->isTargetMachO()) {
     // Uses VFP for Thumb libfuncs if available.
     if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
         Subtarget->hasARMOps() && !Subtarget->useSoftFloat()) {
       static const struct {
         const RTLIB::Libcall Op;
         const char * const Name;
         const ISD::CondCode Cond;
       } LibraryCalls[] = {
         // Single-precision floating-point arithmetic.
         { RTLIB::ADD_F32, "__addsf3vfp", ISD::SETCC_INVALID },
         { RTLIB::SUB_F32, "__subsf3vfp", ISD::SETCC_INVALID },
         { RTLIB::MUL_F32, "__mulsf3vfp", ISD::SETCC_INVALID },
         { RTLIB::DIV_F32, "__divsf3vfp", ISD::SETCC_INVALID },
 
         // Double-precision floating-point arithmetic.
         { RTLIB::ADD_F64, "__adddf3vfp", ISD::SETCC_INVALID },
         { RTLIB::SUB_F64, "__subdf3vfp", ISD::SETCC_INVALID },
         { RTLIB::MUL_F64, "__muldf3vfp", ISD::SETCC_INVALID },
         { RTLIB::DIV_F64, "__divdf3vfp", ISD::SETCC_INVALID },
 
         // Single-precision comparisons.
         { RTLIB::OEQ_F32, "__eqsf2vfp",    ISD::SETNE },
         { RTLIB::UNE_F32, "__nesf2vfp",    ISD::SETNE },
         { RTLIB::OLT_F32, "__ltsf2vfp",    ISD::SETNE },
         { RTLIB::OLE_F32, "__lesf2vfp",    ISD::SETNE },
         { RTLIB::OGE_F32, "__gesf2vfp",    ISD::SETNE },
         { RTLIB::OGT_F32, "__gtsf2vfp",    ISD::SETNE },
         { RTLIB::UO_F32,  "__unordsf2vfp", ISD::SETNE },
         { RTLIB::O_F32,   "__unordsf2vfp", ISD::SETEQ },
 
         // Double-precision comparisons.
         { RTLIB::OEQ_F64, "__eqdf2vfp",    ISD::SETNE },
         { RTLIB::UNE_F64, "__nedf2vfp",    ISD::SETNE },
         { RTLIB::OLT_F64, "__ltdf2vfp",    ISD::SETNE },
         { RTLIB::OLE_F64, "__ledf2vfp",    ISD::SETNE },
         { RTLIB::OGE_F64, "__gedf2vfp",    ISD::SETNE },
         { RTLIB::OGT_F64, "__gtdf2vfp",    ISD::SETNE },
         { RTLIB::UO_F64,  "__unorddf2vfp", ISD::SETNE },
         { RTLIB::O_F64,   "__unorddf2vfp", ISD::SETEQ },
 
         // Floating-point to integer conversions.
         // i64 conversions are done via library routines even when generating VFP
         // instructions, so use the same ones.
         { RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp",    ISD::SETCC_INVALID },
         { RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp", ISD::SETCC_INVALID },
         { RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp",    ISD::SETCC_INVALID },
         { RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp", ISD::SETCC_INVALID },
 
         // Conversions between floating types.
         { RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp",  ISD::SETCC_INVALID },
         { RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp", ISD::SETCC_INVALID },
 
         // Integer to floating-point conversions.
         // i64 conversions are done via library routines even when generating VFP
         // instructions, so use the same ones.
         // FIXME: There appears to be some naming inconsistency in ARM libgcc:
         // e.g., __floatunsidf vs. __floatunssidfvfp.
         { RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp",    ISD::SETCC_INVALID },
         { RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp", ISD::SETCC_INVALID },
         { RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp",    ISD::SETCC_INVALID },
         { RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp", ISD::SETCC_INVALID },
       };
 
       for (const auto &LC : LibraryCalls) {
         setLibcallName(LC.Op, LC.Name);
         if (LC.Cond != ISD::SETCC_INVALID)
           setCmpLibcallCC(LC.Op, LC.Cond);
       }
     }
   }
 
   // These libcalls are not available in 32-bit.
   setLibcallName(RTLIB::SHL_I128, nullptr);
   setLibcallName(RTLIB::SRL_I128, nullptr);
   setLibcallName(RTLIB::SRA_I128, nullptr);
 
   // RTLIB
   if (Subtarget->isAAPCS_ABI() &&
       (Subtarget->isTargetAEABI() || Subtarget->isTargetGNUAEABI() ||
        Subtarget->isTargetMuslAEABI() || Subtarget->isTargetAndroid())) {
     static const struct {
       const RTLIB::Libcall Op;
       const char * const Name;
       const CallingConv::ID CC;
       const ISD::CondCode Cond;
     } LibraryCalls[] = {
       // Double-precision floating-point arithmetic helper functions
       // RTABI chapter 4.1.2, Table 2
       { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 
       // Double-precision floating-point comparison helper functions
       // RTABI chapter 4.1.2, Table 3
       { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
       { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
 
       // Single-precision floating-point arithmetic helper functions
       // RTABI chapter 4.1.2, Table 4
       { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 
       // Single-precision floating-point comparison helper functions
       // RTABI chapter 4.1.2, Table 5
       { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
       { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
       { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
 
       // Floating-point to integer conversions.
       // RTABI chapter 4.1.2, Table 6
       { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 
       // Conversions between floating types.
       // RTABI chapter 4.1.2, Table 7
       { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 
       // Integer to floating-point conversions.
       // RTABI chapter 4.1.2, Table 8
       { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 
       // Long long helper functions
       // RTABI chapter 4.2, Table 9
       { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
 
       // Integer division functions
       // RTABI chapter 4.3.1
       { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
     };
 
     for (const auto &LC : LibraryCalls) {
       setLibcallName(LC.Op, LC.Name);
       setLibcallCallingConv(LC.Op, LC.CC);
       if (LC.Cond != ISD::SETCC_INVALID)
         setCmpLibcallCC(LC.Op, LC.Cond);
     }
 
     // EABI dependent RTLIB
     if (TM.Options.EABIVersion == EABI::EABI4 ||
         TM.Options.EABIVersion == EABI::EABI5) {
       static const struct {
         const RTLIB::Libcall Op;
         const char *const Name;
         const CallingConv::ID CC;
         const ISD::CondCode Cond;
       } MemOpsLibraryCalls[] = {
         // Memory operations
         // RTABI chapter 4.3.4
         { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
         { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
         { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
       };
 
       for (const auto &LC : MemOpsLibraryCalls) {
         setLibcallName(LC.Op, LC.Name);
         setLibcallCallingConv(LC.Op, LC.CC);
         if (LC.Cond != ISD::SETCC_INVALID)
           setCmpLibcallCC(LC.Op, LC.Cond);
       }
     }
   }
 
   if (Subtarget->isTargetWindows()) {
     static const struct {
       const RTLIB::Libcall Op;
       const char * const Name;
       const CallingConv::ID CC;
     } LibraryCalls[] = {
       { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
       { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
       { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
       { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
       { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
       { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
       { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
       { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
     };
 
     for (const auto &LC : LibraryCalls) {
       setLibcallName(LC.Op, LC.Name);
       setLibcallCallingConv(LC.Op, LC.CC);
     }
   }
 
   // Use divmod compiler-rt calls for iOS 5.0 and later.
   if (Subtarget->isTargetMachO() &&
       !(Subtarget->isTargetIOS() &&
         Subtarget->getTargetTriple().isOSVersionLT(5, 0))) {
     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
   }
 
   // The half <-> float conversion functions are always soft-float on
   // non-watchos platforms, but are needed for some targets which use a
   // hard-float calling convention by default.
   if (!Subtarget->isTargetWatchABI()) {
     if (Subtarget->isAAPCS_ABI()) {
       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_AAPCS);
       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_AAPCS);
       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_AAPCS);
     } else {
       setLibcallCallingConv(RTLIB::FPROUND_F32_F16, CallingConv::ARM_APCS);
       setLibcallCallingConv(RTLIB::FPROUND_F64_F16, CallingConv::ARM_APCS);
       setLibcallCallingConv(RTLIB::FPEXT_F16_F32, CallingConv::ARM_APCS);
     }
   }
 
   // In EABI, these functions have an __aeabi_ prefix, but in GNUEABI they have
   // a __gnu_ prefix (which is the default).
   if (Subtarget->isTargetAEABI()) {
     static const struct {
       const RTLIB::Libcall Op;
       const char * const Name;
       const CallingConv::ID CC;
     } LibraryCalls[] = {
       { RTLIB::FPROUND_F32_F16, "__aeabi_f2h", CallingConv::ARM_AAPCS },
       { RTLIB::FPROUND_F64_F16, "__aeabi_d2h", CallingConv::ARM_AAPCS },
       { RTLIB::FPEXT_F16_F32, "__aeabi_h2f", CallingConv::ARM_AAPCS },
     };
 
     for (const auto &LC : LibraryCalls) {
       setLibcallName(LC.Op, LC.Name);
       setLibcallCallingConv(LC.Op, LC.CC);
     }
   }
 
   if (Subtarget->isThumb1Only())
     addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
   else
     addRegisterClass(MVT::i32, &ARM::GPRRegClass);
 
   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
       !Subtarget->isThumb1Only()) {
     addRegisterClass(MVT::f32, &ARM::SPRRegClass);
     addRegisterClass(MVT::f64, &ARM::DPRRegClass);
   }
 
   if (Subtarget->hasFullFP16()) {
     addRegisterClass(MVT::f16, &ARM::HPRRegClass);
     setOperationAction(ISD::BITCAST, MVT::i16, Custom);
     setOperationAction(ISD::BITCAST, MVT::i32, Custom);
     setOperationAction(ISD::BITCAST, MVT::f16, Custom);
 
     setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
   }
 
   for (MVT VT : MVT::vector_valuetypes()) {
     for (MVT InnerVT : MVT::vector_valuetypes()) {
       setTruncStoreAction(VT, InnerVT, Expand);
       setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
       setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
       setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
     }
 
     setOperationAction(ISD::MULHS, VT, Expand);
     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     setOperationAction(ISD::MULHU, VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 
     setOperationAction(ISD::BSWAP, VT, Expand);
   }
 
   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
   setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 
   setOperationAction(ISD::READ_REGISTER, MVT::i64, Custom);
   setOperationAction(ISD::WRITE_REGISTER, MVT::i64, Custom);
 
   if (Subtarget->hasNEON()) {
     addDRTypeForNEON(MVT::v2f32);
     addDRTypeForNEON(MVT::v8i8);
     addDRTypeForNEON(MVT::v4i16);
     addDRTypeForNEON(MVT::v2i32);
     addDRTypeForNEON(MVT::v1i64);
 
     addQRTypeForNEON(MVT::v4f32);
     addQRTypeForNEON(MVT::v2f64);
     addQRTypeForNEON(MVT::v16i8);
     addQRTypeForNEON(MVT::v8i16);
     addQRTypeForNEON(MVT::v4i32);
     addQRTypeForNEON(MVT::v2i64);
 
     if (Subtarget->hasFullFP16()) {
       addQRTypeForNEON(MVT::v8f16);
       addDRTypeForNEON(MVT::v4f16);
     }
 
     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
     // neither Neon nor VFP support any arithmetic operations on it.
     // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
     // supported for v4f32.
     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
     setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
     // FIXME: Code duplication: FDIV and FREM are expanded always, see
     // ARMTargetLowering::addTypeForNEON method for details.
     setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
     // FIXME: Create unittest.
     // In other words, find a way when "copysign" appears in DAG with vector
     // operands.
     setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
     // FIXME: Code duplication: SETCC has custom operation action, see
     // ARMTargetLowering::addTypeForNEON method for details.
     setOperationAction(ISD::SETCC, MVT::v2f64, Expand);
     // FIXME: Create unittest for FNEG and for FABS.
     setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
     setOperationAction(ISD::FABS, MVT::v2f64, Expand);
     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
     setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
     setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
     setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
     // FIXME: Create unittest for FCEIL, FTRUNC, FRINT, FNEARBYINT, FFLOOR.
     setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
     setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
     setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
     setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
     setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
     setOperationAction(ISD::FMA, MVT::v2f64, Expand);
 
     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
     setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
     setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
     setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
     setOperationAction(ISD::FCEIL, MVT::v4f32, Expand);
     setOperationAction(ISD::FTRUNC, MVT::v4f32, Expand);
     setOperationAction(ISD::FRINT, MVT::v4f32, Expand);
     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Expand);
     setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
 
     // Mark v2f32 intrinsics.
     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
     setOperationAction(ISD::FLOG10, MVT::v2f32, Expand);
     setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
     setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
     setOperationAction(ISD::FCEIL, MVT::v2f32, Expand);
     setOperationAction(ISD::FTRUNC, MVT::v2f32, Expand);
     setOperationAction(ISD::FRINT, MVT::v2f32, Expand);
     setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Expand);
     setOperationAction(ISD::FFLOOR, MVT::v2f32, Expand);
 
     // Neon does not support some operations on v1i64 and v2i64 types.
     setOperationAction(ISD::MUL, MVT::v1i64, Expand);
     // Custom handling for some quad-vector types to detect VMULL.
     setOperationAction(ISD::MUL, MVT::v8i16, Custom);
     setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
     // Custom handling for some vector types to avoid expensive expansions
     setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
     setOperationAction(ISD::SDIV, MVT::v8i8, Custom);
     setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
     setOperationAction(ISD::UDIV, MVT::v8i8, Custom);
     // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
     // a destination type that is wider than the source, and nor does
     // it have a FP_TO_[SU]INT instruction with a narrower destination than
     // source.
     setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
     setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
 
     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
 
     // NEON does not have single instruction CTPOP for vectors with element
     // types wider than 8-bits.  However, custom lowering can leverage the
     // v8i8/v16i8 vcnt instruction.
     setOperationAction(ISD::CTPOP,      MVT::v2i32, Custom);
     setOperationAction(ISD::CTPOP,      MVT::v4i32, Custom);
     setOperationAction(ISD::CTPOP,      MVT::v4i16, Custom);
     setOperationAction(ISD::CTPOP,      MVT::v8i16, Custom);
     setOperationAction(ISD::CTPOP,      MVT::v1i64, Expand);
     setOperationAction(ISD::CTPOP,      MVT::v2i64, Expand);
 
     setOperationAction(ISD::CTLZ,       MVT::v1i64, Expand);
     setOperationAction(ISD::CTLZ,       MVT::v2i64, Expand);
 
     // NEON does not have single instruction CTTZ for vectors.
     setOperationAction(ISD::CTTZ, MVT::v8i8, Custom);
     setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
     setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
     setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
 
     setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
     setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
     setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
     setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
 
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i8, Custom);
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i16, Custom);
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i32, Custom);
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v1i64, Custom);
 
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i8, Custom);
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i16, Custom);
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
 
     // NEON only has FMA instructions as of VFP4.
     if (!Subtarget->hasVFP4()) {
       setOperationAction(ISD::FMA, MVT::v2f32, Expand);
       setOperationAction(ISD::FMA, MVT::v4f32, Expand);
     }
 
     setTargetDAGCombine(ISD::INTRINSIC_VOID);
     setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
     setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
     setTargetDAGCombine(ISD::SHL);
     setTargetDAGCombine(ISD::SRL);
     setTargetDAGCombine(ISD::SRA);
     setTargetDAGCombine(ISD::SIGN_EXTEND);
     setTargetDAGCombine(ISD::ZERO_EXTEND);
     setTargetDAGCombine(ISD::ANY_EXTEND);
     setTargetDAGCombine(ISD::BUILD_VECTOR);
     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
     setTargetDAGCombine(ISD::STORE);
     setTargetDAGCombine(ISD::FP_TO_SINT);
     setTargetDAGCombine(ISD::FP_TO_UINT);
     setTargetDAGCombine(ISD::FDIV);
     setTargetDAGCombine(ISD::LOAD);
 
     // It is legal to extload from v4i8 to v4i16 or v4i32.
     for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
                    MVT::v2i32}) {
       for (MVT VT : MVT::integer_vector_valuetypes()) {
         setLoadExtAction(ISD::EXTLOAD, VT, Ty, Legal);
         setLoadExtAction(ISD::ZEXTLOAD, VT, Ty, Legal);
         setLoadExtAction(ISD::SEXTLOAD, VT, Ty, Legal);
       }
     }
   }
 
   if (Subtarget->isFPOnlySP()) {
     // When targeting a floating-point unit with only single-precision
     // operations, f64 is legal for the few double-precision instructions which
     // are present. However, no double-precision operations other than moves,
     // loads and stores are provided by the hardware.
     setOperationAction(ISD::FADD,       MVT::f64, Expand);
     setOperationAction(ISD::FSUB,       MVT::f64, Expand);
     setOperationAction(ISD::FMUL,       MVT::f64, Expand);
     setOperationAction(ISD::FMA,        MVT::f64, Expand);
     setOperationAction(ISD::FDIV,       MVT::f64, Expand);
     setOperationAction(ISD::FREM,       MVT::f64, Expand);
     setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
     setOperationAction(ISD::FGETSIGN,   MVT::f64, Expand);
     setOperationAction(ISD::FNEG,       MVT::f64, Expand);
     setOperationAction(ISD::FABS,       MVT::f64, Expand);
     setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
     setOperationAction(ISD::FSIN,       MVT::f64, Expand);
     setOperationAction(ISD::FCOS,       MVT::f64, Expand);
     setOperationAction(ISD::FPOW,       MVT::f64, Expand);
     setOperationAction(ISD::FLOG,       MVT::f64, Expand);
     setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
     setOperationAction(ISD::FLOG10,     MVT::f64, Expand);
     setOperationAction(ISD::FEXP,       MVT::f64, Expand);
     setOperationAction(ISD::FEXP2,      MVT::f64, Expand);
     setOperationAction(ISD::FCEIL,      MVT::f64, Expand);
     setOperationAction(ISD::FTRUNC,     MVT::f64, Expand);
     setOperationAction(ISD::FRINT,      MVT::f64, Expand);
     setOperationAction(ISD::FNEARBYINT, MVT::f64, Expand);
     setOperationAction(ISD::FFLOOR,     MVT::f64, Expand);
     setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
     setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
     setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
     setOperationAction(ISD::FP_TO_SINT, MVT::f64, Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::f64, Custom);
     setOperationAction(ISD::FP_ROUND,   MVT::f32, Custom);
     setOperationAction(ISD::FP_EXTEND,  MVT::f64, Custom);
   }
 
   computeRegisterProperties(Subtarget->getRegisterInfo());
 
   // ARM does not have floating-point extending loads.
   for (MVT VT : MVT::fp_valuetypes()) {
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
     setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
   }
 
   // ... or truncating stores
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
 
   // ARM does not have i1 sign extending load.
   for (MVT VT : MVT::integer_valuetypes())
     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
 
   // ARM supports all 4 flavors of integer indexed load / store.
   if (!Subtarget->isThumb1Only()) {
     for (unsigned im = (unsigned)ISD::PRE_INC;
          im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
       setIndexedLoadAction(im,  MVT::i1,  Legal);
       setIndexedLoadAction(im,  MVT::i8,  Legal);
       setIndexedLoadAction(im,  MVT::i16, Legal);
       setIndexedLoadAction(im,  MVT::i32, Legal);
       setIndexedStoreAction(im, MVT::i1,  Legal);
       setIndexedStoreAction(im, MVT::i8,  Legal);
       setIndexedStoreAction(im, MVT::i16, Legal);
       setIndexedStoreAction(im, MVT::i32, Legal);
     }
   } else {
     // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
     setIndexedLoadAction(ISD::POST_INC, MVT::i32,  Legal);
     setIndexedStoreAction(ISD::POST_INC, MVT::i32,  Legal);
   }
 
   setOperationAction(ISD::SADDO, MVT::i32, Custom);
   setOperationAction(ISD::UADDO, MVT::i32, Custom);
   setOperationAction(ISD::SSUBO, MVT::i32, Custom);
   setOperationAction(ISD::USUBO, MVT::i32, Custom);
 
   setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
   setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
 
   // i64 operation support.
   setOperationAction(ISD::MUL,     MVT::i64, Expand);
   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
   if (Subtarget->isThumb1Only()) {
     setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
     setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
   }
   if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
       || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
     setOperationAction(ISD::MULHS, MVT::i32, Expand);
 
   setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
   setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
   setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
   setOperationAction(ISD::SRL,       MVT::i64, Custom);
   setOperationAction(ISD::SRA,       MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
 
   // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
   if (Subtarget->isThumb1Only()) {
     setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
     setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
     setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
   }
 
   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
     setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
 
   // ARM does not have ROTL.
   setOperationAction(ISD::ROTL, MVT::i32, Expand);
   for (MVT VT : MVT::vector_valuetypes()) {
     setOperationAction(ISD::ROTL, VT, Expand);
     setOperationAction(ISD::ROTR, VT, Expand);
   }
   setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
   setOperationAction(ISD::CTPOP, MVT::i32, Expand);
   if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
 
   // @llvm.readcyclecounter requires the Performance Monitors extension.
   // Default to the 0 expansion on unsupported platforms.
   // FIXME: Technically there are older ARM CPUs that have
   // implementation-specific ways of obtaining this information.
   if (Subtarget->hasPerfMon())
     setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
 
   // Only ARMv6 has BSWAP.
   if (!Subtarget->hasV6Ops())
     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
 
   bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
                                         : Subtarget->hasDivideInARMMode();
   if (!hasDivide) {
     // These are expanded into libcalls if the cpu doesn't have HW divider.
     setOperationAction(ISD::SDIV,  MVT::i32, LibCall);
     setOperationAction(ISD::UDIV,  MVT::i32, LibCall);
   }
 
   if (Subtarget->isTargetWindows() && !Subtarget->hasDivideInThumbMode()) {
     setOperationAction(ISD::SDIV, MVT::i32, Custom);
     setOperationAction(ISD::UDIV, MVT::i32, Custom);
 
     setOperationAction(ISD::SDIV, MVT::i64, Custom);
     setOperationAction(ISD::UDIV, MVT::i64, Custom);
   }
 
   setOperationAction(ISD::SREM,  MVT::i32, Expand);
   setOperationAction(ISD::UREM,  MVT::i32, Expand);
 
   // Register based DivRem for AEABI (RTABI 4.2)
   if (Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
       Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
       Subtarget->isTargetWindows()) {
     setOperationAction(ISD::SREM, MVT::i64, Custom);
     setOperationAction(ISD::UREM, MVT::i64, Custom);
     HasStandaloneRem = false;
 
     if (Subtarget->isTargetWindows()) {
       const struct {
         const RTLIB::Libcall Op;
         const char * const Name;
         const CallingConv::ID CC;
       } LibraryCalls[] = {
         { RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS },
         { RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS },
         { RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS },
         { RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS },
 
         { RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS },
         { RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS },
         { RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS },
         { RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS },
       };
 
       for (const auto &LC : LibraryCalls) {
         setLibcallName(LC.Op, LC.Name);
         setLibcallCallingConv(LC.Op, LC.CC);
       }
     } else {
       const struct {
         const RTLIB::Libcall Op;
         const char * const Name;
         const CallingConv::ID CC;
       } LibraryCalls[] = {
         { RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
         { RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
         { RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS },
         { RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS },
 
         { RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
         { RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
         { RTLIB::UDIVREM_I32, "__aeabi_uidivmod", CallingConv::ARM_AAPCS },
         { RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS },
       };
 
       for (const auto &LC : LibraryCalls) {
         setLibcallName(LC.Op, LC.Name);
         setLibcallCallingConv(LC.Op, LC.CC);
       }
     }
 
     setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
     setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
     setOperationAction(ISD::SDIVREM, MVT::i64, Custom);
     setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
   } else {
     setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
     setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
   }
 
   if (Subtarget->isTargetWindows() && Subtarget->getTargetTriple().isOSMSVCRT())
     for (auto &VT : {MVT::f32, MVT::f64})
       setOperationAction(ISD::FPOWI, VT, Custom);
 
   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
   setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
   setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
 
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
   // Use the default implementation.
   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
   setOperationAction(ISD::VAARG,              MVT::Other, Expand);
   setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
   setOperationAction(ISD::VAEND,              MVT::Other, Expand);
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 
   if (Subtarget->isTargetWindows())
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
   else
     setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
 
   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
   // the default expansion.
   InsertFencesForAtomic = false;
   if (Subtarget->hasAnyDataBarrier() &&
       (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
     // ATOMIC_FENCE needs custom lowering; the others should have been expanded
     // to ldrex/strex loops already.
     setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
     if (!Subtarget->isThumb() || !Subtarget->isMClass())
       setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
 
     // On v8, we have particularly efficient implementations of atomic fences
     // if they can be combined with nearby atomic loads and stores.
     if (!Subtarget->hasV8Ops() || getTargetMachine().getOptLevel() == 0) {
       // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
       InsertFencesForAtomic = true;
     }
   } else {
     // If there's anything we can use as a barrier, go through custom lowering
     // for ATOMIC_FENCE.
     // If target has DMB in thumb, Fences can be inserted.
     if (Subtarget->hasDataBarrier())
       InsertFencesForAtomic = true;
 
     setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
                        Subtarget->hasAnyDataBarrier() ? Custom : Expand);
 
     // Set them all for expansion, which will force libcalls.
     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_AND,  MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_OR,   MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_XOR,  MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
     // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
     // Unordered/Monotonic case.
     if (!InsertFencesForAtomic) {
       setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Custom);
       setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Custom);
     }
   }
 
   setOperationAction(ISD::PREFETCH,         MVT::Other, Custom);
 
   // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
   if (!Subtarget->hasV6Ops()) {
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
   }
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 
   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
       !Subtarget->isThumb1Only()) {
     // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
     // iff target supports vfp2.
     setOperationAction(ISD::BITCAST, MVT::i64, Custom);
     setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
   }
 
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
   setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
   if (Subtarget->useSjLjEH())
     setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
 
   setOperationAction(ISD::SETCC,     MVT::i32, Expand);
   setOperationAction(ISD::SETCC,     MVT::f32, Expand);
   setOperationAction(ISD::SETCC,     MVT::f64, Expand);
   setOperationAction(ISD::SELECT,    MVT::i32, Custom);
   setOperationAction(ISD::SELECT,    MVT::f32, Custom);
   setOperationAction(ISD::SELECT,    MVT::f64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
   if (Subtarget->hasFullFP16()) {
     setOperationAction(ISD::SETCC,     MVT::f16, Expand);
     setOperationAction(ISD::SELECT,    MVT::f16, Custom);
     setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
   }
 
   setOperationAction(ISD::SETCCCARRY, MVT::i32, Custom);
 
   setOperationAction(ISD::BRCOND,    MVT::Other, Custom);
   setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
   if (Subtarget->hasFullFP16())
       setOperationAction(ISD::BR_CC, MVT::f16,   Custom);
   setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
   setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
   setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
 
   // We don't support sin/cos/fmod/copysign/pow
   setOperationAction(ISD::FSIN,      MVT::f64, Expand);
   setOperationAction(ISD::FSIN,      MVT::f32, Expand);
   setOperationAction(ISD::FCOS,      MVT::f32, Expand);
   setOperationAction(ISD::FCOS,      MVT::f64, Expand);
   setOperationAction(ISD::FSINCOS,   MVT::f64, Expand);
   setOperationAction(ISD::FSINCOS,   MVT::f32, Expand);
   setOperationAction(ISD::FREM,      MVT::f64, Expand);
   setOperationAction(ISD::FREM,      MVT::f32, Expand);
   if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2() &&
       !Subtarget->isThumb1Only()) {
     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
   }
   setOperationAction(ISD::FPOW,      MVT::f64, Expand);
   setOperationAction(ISD::FPOW,      MVT::f32, Expand);
 
   if (!Subtarget->hasVFP4()) {
     setOperationAction(ISD::FMA, MVT::f64, Expand);
     setOperationAction(ISD::FMA, MVT::f32, Expand);
   }
 
   // Various VFP goodness
   if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
     // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
     if (!Subtarget->hasFPARMv8() || Subtarget->isFPOnlySP()) {
       setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
       setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
     }
 
     // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
     if (!Subtarget->hasFP16()) {
       setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
       setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
     }
   }
 
   // Use __sincos_stret if available.
   if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
       getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
     setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
     setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
   }
 
   // FP-ARMv8 implements a lot of rounding-like FP operations.
   if (Subtarget->hasFPARMv8()) {
     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
     setOperationAction(ISD::FCEIL, MVT::f32, Legal);
     setOperationAction(ISD::FROUND, MVT::f32, Legal);
     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
     setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
     setOperationAction(ISD::FRINT, MVT::f32, Legal);
     setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
     setOperationAction(ISD::FMINNUM, MVT::v2f32, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::v2f32, Legal);
     setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal);
     setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal);
 
     if (!Subtarget->isFPOnlySP()) {
       setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
       setOperationAction(ISD::FCEIL, MVT::f64, Legal);
       setOperationAction(ISD::FROUND, MVT::f64, Legal);
       setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
       setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
       setOperationAction(ISD::FRINT, MVT::f64, Legal);
       setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
       setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
     }
   }
 
   if (Subtarget->hasNEON()) {
     // vmin and vmax aren't available in a scalar form, so we use
     // a NEON instruction with an undef lane instead.
     setOperationAction(ISD::FMINNAN, MVT::f16, Legal);
     setOperationAction(ISD::FMAXNAN, MVT::f16, Legal);
     setOperationAction(ISD::FMINNAN, MVT::f32, Legal);
     setOperationAction(ISD::FMAXNAN, MVT::f32, Legal);
     setOperationAction(ISD::FMINNAN, MVT::v2f32, Legal);
     setOperationAction(ISD::FMAXNAN, MVT::v2f32, Legal);
     setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal);
     setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal);
   }
 
   // We have target-specific dag combine patterns for the following nodes:
   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
   setTargetDAGCombine(ISD::ADD);
   setTargetDAGCombine(ISD::SUB);
   setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::XOR);
 
   if (Subtarget->hasV6Ops())
     setTargetDAGCombine(ISD::SRL);
 
   setStackPointerRegisterToSaveRestore(ARM::SP);
 
   if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
       !Subtarget->hasVFP2())
     setSchedulingPreference(Sched::RegPressure);
   else
     setSchedulingPreference(Sched::Hybrid);
 
   //// temporary - rewrite interface to use type
   MaxStoresPerMemset = 8;
   MaxStoresPerMemsetOptSize = 4;
   MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
   MaxStoresPerMemcpyOptSize = 2;
   MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
   MaxStoresPerMemmoveOptSize = 2;
 
   // On ARM arguments smaller than 4 bytes are extended, so all arguments
   // are at least 4 bytes aligned.
   setMinStackArgumentAlignment(4);
 
   // Prefer likely predicted branches to selects on out-of-order cores.
   PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
 
   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
 }
 
 /// Report whether software floating point is in use. The answer is taken
 /// verbatim from the subtarget; no additional logic is applied here.
 bool ARMTargetLowering::useSoftFloat() const {
   const bool IsSoftFloat = Subtarget->useSoftFloat();
   return IsSoftFloat;
 }
 
 // FIXME: It might make sense to define the representative register class as the
 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
 // a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
 // SPR's representative would be DPR_VFP2. This should work well if register
 // pressure tracking were modified such that a register use would increment the
 // pressure of the register class's representative and all of its super
 // classes' representatives transitively. We have not implemented this because
 // of the difficulty prior to coalescing of modeling operand register classes
 // due to the common occurrence of cross class copies and subregister insertions
 // and extractions.
 std::pair<const TargetRegisterClass *, uint8_t>
 ARMTargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                            MVT VT) const {
   const TargetRegisterClass *RRC = nullptr;
   uint8_t Cost = 1;
   switch (VT.SimpleTy) {
   default:
     return TargetLowering::findRepresentativeClass(TRI, VT);
   // Use DPR as representative register class for all floating point
   // and vector types. Since there are 32 SPR registers and 32 DPR registers so
   // the cost is 1 for both f32 and f64.
   case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
   case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
     RRC = &ARM::DPRRegClass;
     // When NEON is used for SP, only half of the register file is available
     // because operations that define both SP and DP results will be constrained
     // to the VFP2 class (D0-D15). We currently model this constraint prior to
     // coalescing by double-counting the SP regs. See the FIXME above.
     if (Subtarget->useNEONForSinglePrecisionFP())
       Cost = 2;
     break;
   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
   case MVT::v4f32: case MVT::v2f64:
     RRC = &ARM::DPRRegClass;
     Cost = 2;
     break;
   case MVT::v4i64:
     RRC = &ARM::DPRRegClass;
     Cost = 4;
     break;
   case MVT::v8i64:
     RRC = &ARM::DPRRegClass;
     Cost = 8;
     break;
   }
   return std::make_pair(RRC, Cost);
 }
 
 // Map an ARM-specific SelectionDAG opcode to a printable name.
 //
 // \p Opcode is cast to ARMISD::NodeType and matched against the cases below;
 // each matched case returns the string literal "ARMISD::<Name>". nullptr is
 // returned for ARMISD::FIRST_NUMBER and for any value that matches no case.
 //
 // NOTE(review): the switch deliberately has no default label; presumably this
 // lets the compiler warn when a new ARMISD enumerator is added without a name
 // here -- confirm before adding a default.
 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((ARMISD::NodeType)Opcode) {
   // FIRST_NUMBER has no name of its own: fall out of the switch to the
   // nullptr return at the bottom.
   case ARMISD::FIRST_NUMBER:  break;
   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
   case ARMISD::COPY_STRUCT_BYVAL: return "ARMISD::COPY_STRUCT_BYVAL";
   case ARMISD::CALL:          return "ARMISD::CALL";
   case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
   case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
   case ARMISD::BRCOND:        return "ARMISD::BRCOND";
   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
   case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
   case ARMISD::CMP:           return "ARMISD::CMP";
   case ARMISD::CMN:           return "ARMISD::CMN";
   case ARMISD::CMPZ:          return "ARMISD::CMPZ";
   case ARMISD::CMPFP:         return "ARMISD::CMPFP";
   case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
   case ARMISD::BCC_i64:       return "ARMISD::BCC_i64";
   case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
 
   case ARMISD::CMOV:          return "ARMISD::CMOV";
 
   case ARMISD::SSAT:          return "ARMISD::SSAT";
   case ARMISD::USAT:          return "ARMISD::USAT";
 
   case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
   case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
   case ARMISD::RRX:           return "ARMISD::RRX";
 
   case ARMISD::ADDC:          return "ARMISD::ADDC";
   case ARMISD::ADDE:          return "ARMISD::ADDE";
   case ARMISD::SUBC:          return "ARMISD::SUBC";
   case ARMISD::SUBE:          return "ARMISD::SUBE";
 
   case ARMISD::VMOVRRD:       return "ARMISD::VMOVRRD";
   case ARMISD::VMOVDRR:       return "ARMISD::VMOVDRR";
   case ARMISD::VMOVhr:        return "ARMISD::VMOVhr";
   case ARMISD::VMOVrh:        return "ARMISD::VMOVrh";
   case ARMISD::VMOVSR:        return "ARMISD::VMOVSR";
 
   case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
   case ARMISD::EH_SJLJ_LONGJMP: return "ARMISD::EH_SJLJ_LONGJMP";
   case ARMISD::EH_SJLJ_SETUP_DISPATCH: return "ARMISD::EH_SJLJ_SETUP_DISPATCH";
 
   case ARMISD::TC_RETURN:     return "ARMISD::TC_RETURN";
 
   case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
 
   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
 
   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
 
   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
 
   case ARMISD::WIN__CHKSTK:   return "ARMISD::WIN__CHKSTK";
   case ARMISD::WIN__DBZCHK:   return "ARMISD::WIN__DBZCHK";
 
   case ARMISD::VCEQ:          return "ARMISD::VCEQ";
   case ARMISD::VCEQZ:         return "ARMISD::VCEQZ";
   case ARMISD::VCGE:          return "ARMISD::VCGE";
   case ARMISD::VCGEZ:         return "ARMISD::VCGEZ";
   case ARMISD::VCLEZ:         return "ARMISD::VCLEZ";
   case ARMISD::VCGEU:         return "ARMISD::VCGEU";
   case ARMISD::VCGT:          return "ARMISD::VCGT";
   case ARMISD::VCGTZ:         return "ARMISD::VCGTZ";
   case ARMISD::VCLTZ:         return "ARMISD::VCLTZ";
   case ARMISD::VCGTU:         return "ARMISD::VCGTU";
   case ARMISD::VTST:          return "ARMISD::VTST";
 
   case ARMISD::VSHL:          return "ARMISD::VSHL";
   case ARMISD::VSHRs:         return "ARMISD::VSHRs";
   case ARMISD::VSHRu:         return "ARMISD::VSHRu";
   case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
   case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
   case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
   case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
   case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
   case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
   case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
   case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
   case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
   case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
   case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
   case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
   case ARMISD::VSLI:          return "ARMISD::VSLI";
   case ARMISD::VSRI:          return "ARMISD::VSRI";
   case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
   case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
   case ARMISD::VMOVIMM:       return "ARMISD::VMOVIMM";
   case ARMISD::VMVNIMM:       return "ARMISD::VMVNIMM";
   case ARMISD::VMOVFPIMM:     return "ARMISD::VMOVFPIMM";
   case ARMISD::VDUP:          return "ARMISD::VDUP";
   case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
   case ARMISD::VEXT:          return "ARMISD::VEXT";
   case ARMISD::VREV64:        return "ARMISD::VREV64";
   case ARMISD::VREV32:        return "ARMISD::VREV32";
   case ARMISD::VREV16:        return "ARMISD::VREV16";
   case ARMISD::VZIP:          return "ARMISD::VZIP";
   case ARMISD::VUZP:          return "ARMISD::VUZP";
   case ARMISD::VTRN:          return "ARMISD::VTRN";
   case ARMISD::VTBL1:         return "ARMISD::VTBL1";
   case ARMISD::VTBL2:         return "ARMISD::VTBL2";
   case ARMISD::VMULLs:        return "ARMISD::VMULLs";
   case ARMISD::VMULLu:        return "ARMISD::VMULLu";
   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
   case ARMISD::SMLALBB:       return "ARMISD::SMLALBB";
   case ARMISD::SMLALBT:       return "ARMISD::SMLALBT";
   case ARMISD::SMLALTB:       return "ARMISD::SMLALTB";
   case ARMISD::SMLALTT:       return "ARMISD::SMLALTT";
   case ARMISD::SMULWB:        return "ARMISD::SMULWB";
   case ARMISD::SMULWT:        return "ARMISD::SMULWT";
   case ARMISD::SMLALD:        return "ARMISD::SMLALD";
   case ARMISD::SMLALDX:       return "ARMISD::SMLALDX";
   case ARMISD::SMLSLD:        return "ARMISD::SMLSLD";
   case ARMISD::SMLSLDX:       return "ARMISD::SMLSLDX";
   case ARMISD::SMMLAR:        return "ARMISD::SMMLAR";
   case ARMISD::SMMLSR:        return "ARMISD::SMMLSR";
   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
   case ARMISD::BFI:           return "ARMISD::BFI";
   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
   case ARMISD::VBSL:          return "ARMISD::VBSL";
   case ARMISD::MEMCPY:        return "ARMISD::MEMCPY";
   case ARMISD::VLD1DUP:       return "ARMISD::VLD1DUP";
   case ARMISD::VLD2DUP:       return "ARMISD::VLD2DUP";
   case ARMISD::VLD3DUP:       return "ARMISD::VLD3DUP";
   case ARMISD::VLD4DUP:       return "ARMISD::VLD4DUP";
   case ARMISD::VLD1_UPD:      return "ARMISD::VLD1_UPD";
   case ARMISD::VLD2_UPD:      return "ARMISD::VLD2_UPD";
   case ARMISD::VLD3_UPD:      return "ARMISD::VLD3_UPD";
   case ARMISD::VLD4_UPD:      return "ARMISD::VLD4_UPD";
   case ARMISD::VLD2LN_UPD:    return "ARMISD::VLD2LN_UPD";
   case ARMISD::VLD3LN_UPD:    return "ARMISD::VLD3LN_UPD";
   case ARMISD::VLD4LN_UPD:    return "ARMISD::VLD4LN_UPD";
   case ARMISD::VLD1DUP_UPD:   return "ARMISD::VLD1DUP_UPD";
   case ARMISD::VLD2DUP_UPD:   return "ARMISD::VLD2DUP_UPD";
   case ARMISD::VLD3DUP_UPD:   return "ARMISD::VLD3DUP_UPD";
   case ARMISD::VLD4DUP_UPD:   return "ARMISD::VLD4DUP_UPD";
   case ARMISD::VST1_UPD:      return "ARMISD::VST1_UPD";
   case ARMISD::VST2_UPD:      return "ARMISD::VST2_UPD";
   case ARMISD::VST3_UPD:      return "ARMISD::VST3_UPD";
   case ARMISD::VST4_UPD:      return "ARMISD::VST4_UPD";
   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
   }
   // Reached for FIRST_NUMBER and for opcodes that match none of the cases.
   return nullptr;
 }
 
 /// Return the value type produced by a SETCC whose operands have type \p VT.
 /// For a vector \p VT the result is the same vector with its element type
 /// changed to integer; for anything else it is the pointer type for \p DL.
 EVT ARMTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
                                           EVT VT) const {
   if (VT.isVector())
     return VT.changeVectorElementTypeToInteger();
 
   return getPointerTy(DL);
 }
 
 /// getRegClassFor - Select the register class used to hold values of type
 /// \p VT.
 ///
 /// With NEON available, v4i64 is mapped to the QQ register class and v8i64
 /// to the QQQQ register class without making either type legal; those two
 /// types are only used for REG_SEQUENCE to load / store 4 to 8 consecutive
 /// D registers. Every other query is forwarded to the generic
 /// TargetLowering implementation.
 const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const {
   const bool HasNEON = Subtarget->hasNEON();
   if (HasNEON && VT == MVT::v4i64)
     return &ARM::QQPRRegClass;
   if (HasNEON && VT == MVT::v8i64)
     return &ARM::QQQQPRRegClass;
   return TargetLowering::getRegClassFor(VT);
 }
 
 // Memory intrinsics such as memcpy typically try to use LDM/STM when the
 // source/dest is aligned and the copy is large enough, so objects passed to
 // them are worth aligning. For a memory-intrinsic call this stores a minimum
 // size of 8 in MinSize and the preferred alignment in PrefAlign and returns
 // true; for any other call it returns false and writes neither output.
 bool ARMTargetLowering::shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                                unsigned &PrefAlign) const {
   if (isa<MemIntrinsic>(CI)) {
     // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
     // cycle faster than 4-byte aligned LDM.
     const bool Prefer8ByteAlign =
         Subtarget->hasV6Ops() && !Subtarget->isMClass();
     MinSize = 8;
     PrefAlign = Prefer8ByteAlign ? 8 : 4;
     return true;
   }
   return false;
 }
 
 // Create a fast isel object. This is a pure forwarding wrapper: both
 // arguments are handed unchanged to ARM::createFastISel and its result is
 // returned as-is.
 FastISel *
 ARMTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                   const TargetLibraryInfo *libInfo) const {
   return ARM::createFastISel(funcInfo, libInfo);
 }
 
 /// Choose between register-pressure and ILP scheduling for node \p N.
 ///
 /// ILP is returned when any result other than glue/chain is floating point
 /// or a vector, or when \p N is a machine node that defines something and
 /// the itinerary reports a cycle greater than 2 for operand 0 of its
 /// scheduling class. Everything else is scheduled for register pressure.
 Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
   const unsigned NumResults = N->getNumValues();
   if (NumResults == 0)
     return Sched::RegPressure;
 
   for (unsigned Idx = 0; Idx != NumResults; ++Idx) {
     const EVT ResultVT = N->getValueType(Idx);
     // Glue and chain results carry no data and are ignored.
     const bool IsGlueOrChain =
         ResultVT == MVT::Glue || ResultVT == MVT::Other;
     if (!IsGlueOrChain &&
         (ResultVT.isFloatingPoint() || ResultVT.isVector()))
       return Sched::ILP;
   }
 
   if (!N->isMachineOpcode())
     return Sched::RegPressure;
 
   // Loads are scheduled for latency even if their instruction itinerary
   // is not available.
   const MCInstrDesc &Desc =
       Subtarget->getInstrInfo()->get(N->getMachineOpcode());
   if (Desc.getNumDefs() == 0)
     return Sched::RegPressure;
 
   const bool HasLongLatency =
       !Itins->isEmpty() &&
       Itins->getOperandCycle(Desc.getSchedClass(), 0) > 2;
   return HasLongLatency ? Sched::ILP : Sched::RegPressure;
 }
 
 //===----------------------------------------------------------------------===//
 // Lowering Code
 //===----------------------------------------------------------------------===//
 
 /// Return true if \p Op is an ISD::SRL whose shift amount (operand 1) is the
 /// constant 16.
 static bool isSRL16(const SDValue &Op) {
   const auto *ShiftAmt = Op.getOpcode() == ISD::SRL
                              ? dyn_cast<ConstantSDNode>(Op.getOperand(1))
                              : nullptr;
   return ShiftAmt && ShiftAmt->getZExtValue() == 16;
 }
 
 /// Return true if \p Op is an ISD::SRA whose shift amount (operand 1) is the
 /// constant 16.
 static bool isSRA16(const SDValue &Op) {
   const auto *ShiftAmt = Op.getOpcode() == ISD::SRA
                              ? dyn_cast<ConstantSDNode>(Op.getOperand(1))
                              : nullptr;
   return ShiftAmt && ShiftAmt->getZExtValue() == 16;
 }
 
 /// Return true if \p Op is an ISD::SHL whose shift amount (operand 1) is the
 /// constant 16.
 static bool isSHL16(const SDValue &Op) {
   const auto *ShiftAmt = Op.getOpcode() == ISD::SHL
                              ? dyn_cast<ConstantSDNode>(Op.getOperand(1))
                              : nullptr;
   return ShiftAmt && ShiftAmt->getZExtValue() == 16;
 }
 
 // Check for a signed 16-bit value. An SRA by 16 is special-cased: it only
 // qualifies when the value being shifted is itself a SHL by 16. Handling SRA
 // explicitly keeps things simpler when also looking for SRAs that aren't
 // sign extending a smaller value; without it, extra care would be needed
 // with the checking order for some operations. Any other node qualifies
 // when the DAG reports exactly 17 sign bits for it.
 static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
   if (!isSRA16(Op))
     return DAG.ComputeNumSignBits(Op) == 17;
   return isSHL16(Op.getOperand(0));
 }
 
 /// IntCCToARMCC - Translate a DAG integer condition code into the matching
 /// ARM condition code. Any code other than the ten comparisons listed here
 /// is treated as unreachable.
 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
   switch (CC) {
   // Equality.
   case ISD::SETEQ:
     return ARMCC::EQ;
   case ISD::SETNE:
     return ARMCC::NE;
 
   // Signed orderings.
   case ISD::SETLT:
     return ARMCC::LT;
   case ISD::SETLE:
     return ARMCC::LE;
   case ISD::SETGT:
     return ARMCC::GT;
   case ISD::SETGE:
     return ARMCC::GE;
 
   // Unsigned orderings.
   case ISD::SETULT:
     return ARMCC::LO;
   case ISD::SETULE:
     return ARMCC::LS;
   case ISD::SETUGT:
     return ARMCC::HI;
   case ISD::SETUGE:
     return ARMCC::HS;
 
   default:
     llvm_unreachable("Unknown condition code!");
   }
 }
 
 /// FPCCToARMCC - Translate a DAG floating-point condition code into ARM
 /// condition codes.
 ///
 /// \p CondCode receives the primary ARM condition. \p CondCode2 receives a
 /// second condition for SETONE and SETUEQ and is ARMCC::AL for every other
 /// input. \p InvalidOnQNaN is set to true except for the EQ/OEQ, ONE, UEQ
 /// and NE/UNE predicates, for which it is set to false. Condition codes not
 /// listed in the switch are treated as unreachable.
 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                         ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) {
   // Defaults; individual cases below override them where needed.
   InvalidOnQNaN = true;
   CondCode2 = ARMCC::AL;
 
   switch (CC) {
   case ISD::SETEQ:
   case ISD::SETOEQ:
     InvalidOnQNaN = false;
     CondCode = ARMCC::EQ;
     break;
   case ISD::SETNE:
   case ISD::SETUNE:
     InvalidOnQNaN = false;
     CondCode = ARMCC::NE;
     break;
 
   // Predicates that need a second condition.
   case ISD::SETONE:
     InvalidOnQNaN = false;
     CondCode = ARMCC::MI;
     CondCode2 = ARMCC::GT;
     break;
   case ISD::SETUEQ:
     InvalidOnQNaN = false;
     CondCode = ARMCC::EQ;
     CondCode2 = ARMCC::VS;
     break;
 
   case ISD::SETGT:
   case ISD::SETOGT:
     CondCode = ARMCC::GT;
     break;
   case ISD::SETGE:
   case ISD::SETOGE:
     CondCode = ARMCC::GE;
     break;
   case ISD::SETOLT:
     CondCode = ARMCC::MI;
     break;
   case ISD::SETOLE:
     CondCode = ARMCC::LS;
     break;
 
   case ISD::SETO:
     CondCode = ARMCC::VC;
     break;
   case ISD::SETUO:
     CondCode = ARMCC::VS;
     break;
 
   case ISD::SETUGT:
     CondCode = ARMCC::HI;
     break;
   case ISD::SETUGE:
     CondCode = ARMCC::PL;
     break;
   case ISD::SETLT:
   case ISD::SETULT:
     CondCode = ARMCC::LT;
     break;
   case ISD::SETLE:
   case ISD::SETULE:
     CondCode = ARMCC::LE;
     break;
 
   default:
     llvm_unreachable("Unknown FP condition!");
   }
 }
 
 //===----------------------------------------------------------------------===//
 //                      Calling Convention Implementation
 //===----------------------------------------------------------------------===//
 
 #include "ARMGenCallingConv.inc"
 
 /// getEffectiveCallingConv - Map the requested calling convention \p CC to
 /// the one actually used, taking into account the presence of floating point
 /// hardware and calling convention limitations such as support for variadic
 /// functions (\p isVarArg). Conventions not handled below are a fatal error.
 CallingConv::ID
 ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
                                            bool isVarArg) const {
   // Shared precondition of every VFP-based choice below: the subtarget has
   // VFP2 and is not Thumb1-only.
   auto HasUsableVFP = [this] {
     return Subtarget->hasVFP2() && !Subtarget->isThumb1Only();
   };
 
   switch (CC) {
   case CallingConv::ARM_AAPCS:
   case CallingConv::ARM_APCS:
   case CallingConv::GHC:
     // Used exactly as requested.
     return CC;
 
   case CallingConv::PreserveMost:
     return CallingConv::PreserveMost;
 
   case CallingConv::ARM_AAPCS_VFP:
   case CallingConv::Swift:
     // Variadic calls fall back to the base AAPCS convention.
     if (isVarArg)
       return CallingConv::ARM_AAPCS;
     return CallingConv::ARM_AAPCS_VFP;
 
   case CallingConv::C: {
     if (!Subtarget->isAAPCS_ABI())
       return CallingConv::ARM_APCS;
     const bool UseHardFloat =
         HasUsableVFP() &&
         getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
         !isVarArg;
     return UseHardFloat ? CallingConv::ARM_AAPCS_VFP
                         : CallingConv::ARM_AAPCS;
   }
 
   case CallingConv::Fast:
   case CallingConv::CXX_FAST_TLS: {
     const bool IsAAPCS = Subtarget->isAAPCS_ABI();
     const bool UseVFPRegs = HasUsableVFP() && !isVarArg;
     if (!IsAAPCS)
       return UseVFPRegs ? CallingConv::Fast : CallingConv::ARM_APCS;
     return UseVFPRegs ? CallingConv::ARM_AAPCS_VFP : CallingConv::ARM_AAPCS;
   }
 
   default:
     report_fatal_error("Unsupported calling convention");
   }
 }
 
 /// Return the CCAssignFn used to assign locations to outgoing call
 /// arguments. Forwards to CCAssignFnForNode with Return == false.
 CCAssignFn *ARMTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
                                                  bool isVarArg) const {
   return CCAssignFnForNode(CC, false, isVarArg);
 }
 
 /// Return the CCAssignFn used to assign locations to returned values.
 /// Forwards to CCAssignFnForNode with Return == true.
 CCAssignFn *ARMTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
                                                    bool isVarArg) const {
   return CCAssignFnForNode(CC, true, isVarArg);
 }
 
 /// CCAssignFnForNode - Pick the CCAssignFn for the given calling convention.
 ///
 /// \p CC is first reduced to its effective convention for \p isVarArg; the
 /// return-value assigner of that convention is returned when \p Return is
 /// true and the argument assigner otherwise. An effective convention with no
 /// entry below is a fatal error.
 CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
                                                  bool Return,
                                                  bool isVarArg) const {
   CCAssignFn *ArgFn = nullptr;
   CCAssignFn *RetFn = nullptr;
 
   switch (getEffectiveCallingConv(CC, isVarArg)) {
   case CallingConv::ARM_APCS:
     ArgFn = CC_ARM_APCS;
     RetFn = RetCC_ARM_APCS;
     break;
   case CallingConv::ARM_AAPCS:
     ArgFn = CC_ARM_AAPCS;
     RetFn = RetCC_ARM_AAPCS;
     break;
   case CallingConv::ARM_AAPCS_VFP:
     ArgFn = CC_ARM_AAPCS_VFP;
     RetFn = RetCC_ARM_AAPCS_VFP;
     break;
   case CallingConv::Fast:
     ArgFn = FastCC_ARM_APCS;
     RetFn = RetFastCC_ARM_APCS;
     break;
   case CallingConv::GHC:
     // GHC has its own argument assigner but shares the APCS return one.
     ArgFn = CC_ARM_APCS_GHC;
     RetFn = RetCC_ARM_APCS;
     break;
   case CallingConv::PreserveMost:
     ArgFn = CC_ARM_AAPCS;
     RetFn = RetCC_ARM_AAPCS;
     break;
   default:
     report_fatal_error("Unsupported calling convention");
   }
 
   return Return ? RetFn : ArgFn;
 }
 
 /// LowerCallResult - Lower the result values of a call into the
 /// appropriate copies out of appropriate physical registers.
 ///
 /// One value per result is appended to \p InVals and the updated chain is
 /// returned. When \p isThisReturn is set, the first result is not copied out
 /// of a register at all; \p ThisVal is pushed in its place.
 SDValue ARMTargetLowering::LowerCallResult(
     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
     SDValue ThisVal) const {
   // Ask the calling convention where each returned value lives.
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());
   CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
 
   // Emit a CopyFromReg of Loc's register with type Ty, then advance Chain and
   // InFlag to the copy's results 1 and 2 so later copies are ordered after it.
   auto CopyOutOfReg = [&](const CCValAssign &Loc, EVT Ty) {
     SDValue Copy = DAG.getCopyFromReg(Chain, dl, Loc.getLocReg(), Ty, InFlag);
     Chain = Copy.getValue(1);
     InFlag = Copy.getValue(2);
     return Copy;
   };
 
   // Copy all of the result registers out of their specified physreg.
   for (unsigned Idx = 0; Idx != RVLocs.size(); ++Idx) {
     CCValAssign Loc = RVLocs[Idx];
 
     // Pass 'this' value directly from the argument to return value, to avoid
     // reg unit interference
     if (Idx == 0 && isThisReturn) {
       assert(!Loc.needsCustom() && Loc.getLocVT() == MVT::i32 &&
              "unexpected return calling convention register assignment");
       InVals.push_back(ThisVal);
       continue;
     }
 
     // Rebuild one f64 from the i32 register at Loc and the i32 register at
     // the following location. Leaves Loc/Idx on that following location.
     auto ReadF64FromPair = [&]() {
       SDValue Lo = CopyOutOfReg(Loc, MVT::i32);
       Loc = RVLocs[++Idx]; // skip ahead to next loc
       SDValue Hi = CopyOutOfReg(Loc, MVT::i32);
       if (!Subtarget->isLittle())
         std::swap(Lo, Hi);
       return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
     };
 
     SDValue Val;
     if (!Loc.needsCustom()) {
       Val = CopyOutOfReg(Loc, Loc.getLocVT());
     } else {
       // Handle f64 or half of a v2f64.
       Val = ReadF64FromPair();
 
       if (Loc.getLocVT() == MVT::v2f64) {
         // The f64 just read is element 0; read a second one for element 1.
         SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
         Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                           DAG.getConstant(0, dl, MVT::i32));
 
         Loc = RVLocs[++Idx]; // skip ahead to next loc
         Val = ReadF64FromPair();
         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                           DAG.getConstant(1, dl, MVT::i32));
       }
     }
 
     // Convert from the location type back to the value type if required.
     switch (Loc.getLocInfo()) {
     case CCValAssign::Full:
       break;
     case CCValAssign::BCvt:
       Val = DAG.getNode(ISD::BITCAST, dl, Loc.getValVT(), Val);
       break;
     default:
       llvm_unreachable("Unknown loc info!");
     }
 
     InVals.push_back(Val);
   }
 
   return Chain;
 }
 
 /// LowerMemOpCallTo - Store the argument \p Arg to its outgoing stack slot.
 ///
 /// The slot address is \p StackPtr plus the memory offset recorded in \p VA.
 /// Returns the store node; \p Flags is not consulted.
 SDValue ARMTargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                             SDValue Arg, const SDLoc &dl,
                                             SelectionDAG &DAG,
                                             const CCValAssign &VA,
                                             ISD::ArgFlagsTy Flags) const {
   const unsigned SlotOffset = VA.getLocMemOffset();
   SDValue OffsetNode = DAG.getIntPtrConstant(SlotOffset, dl);
   SDValue SlotAddr =
       DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), StackPtr,
                   OffsetNode);
   MachinePointerInfo SlotInfo =
       MachinePointerInfo::getStack(DAG.getMachineFunction(), SlotOffset);
   return DAG.getStore(Chain, dl, Arg, SlotAddr, SlotInfo);
 }
 
 void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
                                          SDValue Chain, SDValue &Arg,
                                          RegsToPassVector &RegsToPass,
                                          CCValAssign &VA, CCValAssign &NextVA,
                                          SDValue &StackPtr,
                                          SmallVectorImpl<SDValue> &MemOpChains,
                                          ISD::ArgFlagsTy Flags) const {
   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
   unsigned id = Subtarget->isLittle() ? 0 : 1;
   RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
 
   if (NextVA.isRegLoc())
     RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
   else {
     assert(NextVA.isMemLoc());
     if (!StackPtr.getNode())
       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
                                     getPointerTy(DAG.getDataLayout()));
 
     MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
                                            dl, DAG, NextVA,
                                            Flags));
   }
 }
 
 /// LowerCall - Lowering a call into a callseq_start <-
 /// ARMISD::CALL <- callseq_end chain. Also add input and output parameter
 /// nodes.
 ///
 /// The steps are, in order: decide whether the call can be a tail call,
 /// assign a location to every outgoing argument, emit the register copies
 /// and stack stores for those arguments, turn the callee into a target node
 /// (or load its address), pick the call opcode, emit the call node and
 /// finally copy the results out via LowerCallResult. For a tail call an
 /// ARMISD::TC_RETURN node is returned instead and no results are lowered.
 ///
 /// \param CLI    Description of the call. CLI.IsTailCall is updated in place
 ///               because isTailCall below is a reference to it.
 /// \param InVals Receives the lowered result values (non-tail calls only).
 /// \returns the output chain, or the TC_RETURN node for a tail call.
 SDValue
 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
   SDLoc &dl                             = CLI.DL;
   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
   bool &isTailCall                      = CLI.IsTailCall;
   CallingConv::ID CallConv              = CLI.CallConv;
   bool doesNotRet                       = CLI.DoesNotReturn;
   bool isVarArg                         = CLI.IsVarArg;
 
   MachineFunction &MF = DAG.getMachineFunction();
   // The call returns a struct through memory if its first outgoing argument
   // carries the sret flag.
   bool isStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
   bool isThisReturn   = false;
   bool isSibCall      = false;
   auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
 
   // Disable tail calls if they're not supported.
   if (!Subtarget->supportsTailCall() || Attr.getValueAsString() == "true")
     isTailCall = false;
 
   if (isTailCall) {
     // Check if it's really possible to do a tail call.
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                     isVarArg, isStructRet, MF.getFunction().hasStructRetAttr(),
                                                    Outs, OutVals, Ins, DAG);
     // A musttail call site that cannot be tail-called is a hard error.
     if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
       report_fatal_error("failed to perform tail call elimination on a call "
                          "site marked musttail");
     // We don't support GuaranteedTailCallOpt for ARM, only automatically
     // detected sibcalls.
     if (isTailCall) {
       ++NumTailCalls;
       isSibCall = true;
     }
   }
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                  *DAG.getContext());
   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
 
   // For tail calls, memory operands are available in our caller's stack.
   if (isSibCall)
     NumBytes = 0;
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
   if (!isSibCall)
     Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
 
   SDValue StackPtr =
       DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
 
   // RegsToPass collects (register, value) pairs for register arguments;
   // MemOpChains collects the chains of the loads/stores emitted for
   // arguments that touch memory.
   RegsToPassVector RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
 
   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   // of tail call optimization, arguments are handled later.
   // Note: i indexes ArgLocs and realArgIdx indexes Outs/OutVals. They diverge
   // because a custom (f64 / v2f64) argument consumes several ArgLocs entries
   // via the ++i expressions in the loop body.
   for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
        i != e;
        ++i, ++realArgIdx) {
     CCValAssign &VA = ArgLocs[i];
     SDValue Arg = OutVals[realArgIdx];
     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
     bool isByVal = Flags.isByVal();
 
     // Promote the value if needed.
     switch (VA.getLocInfo()) {
     default: llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full: break;
     case CCValAssign::SExt:
       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
       break;
     case CCValAssign::ZExt:
       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
       break;
     case CCValAssign::AExt:
       Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
       break;
     case CCValAssign::BCvt:
       Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
       break;
     }
 
     // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
     if (VA.needsCustom()) {
       if (VA.getLocVT() == MVT::v2f64) {
         // Split the vector into its two f64 elements and pass each one
         // separately.
         SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                   DAG.getConstant(0, dl, MVT::i32));
         SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                   DAG.getConstant(1, dl, MVT::i32));
 
         PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
                          VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
 
         VA = ArgLocs[++i]; // skip ahead to next loc
         if (VA.isRegLoc()) {
           PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
                            VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
         } else {
           // The second element did not get a register: store it whole.
           assert(VA.isMemLoc());
 
           MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
                                                  dl, DAG, VA, Flags));
         }
       } else {
         PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
                          StackPtr, MemOpChains, Flags);
       }
     } else if (VA.isRegLoc()) {
       // Detect a first i32 argument marked 'returned' (and not swiftself);
       // isThisReturn is used further down to pick the register mask and to
       // tell LowerCallResult to reuse the argument as the first result.
       if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
           Outs[0].VT == MVT::i32) {
         assert(VA.getLocVT() == MVT::i32 &&
                "unexpected calling convention register assignment");
         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
                "unexpected use of 'returned'");
         isThisReturn = true;
       }
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
     } else if (isByVal) {
       assert(VA.isMemLoc());
       // Number of registers used for the leading part of this byval
       // argument; stays 0 when none of it is passed in registers.
       unsigned offset = 0;
 
       // ByValArgsCount is the number of byval arguments that have a register
       // range recorded in CCInfo and CurByValIdx is how many of those have
       // been handled so far; this aggregate has a register part only while
       // CurByValIdx < ByValArgsCount.
       unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
       unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
 
       if (CurByValIdx < ByValArgsCount) {
 
         unsigned RegBegin, RegEnd;
         CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
 
         EVT PtrVT =
             DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
         // Note: this i deliberately shadows the outer loop index; it counts
         // 4-byte words within the aggregate while j walks the registers.
         unsigned int i, j;
         for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
           SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
           SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                      MachinePointerInfo(),
                                      DAG.InferPtrAlignment(AddArg));
           MemOpChains.push_back(Load.getValue(1));
           RegsToPass.push_back(std::make_pair(j, Load));
         }
 
         // If the parameter is larger than the register area, "offset" lets
         // the code below compute where the remaining part starts.
         offset = RegEnd - RegBegin;
 
         CCInfo.nextInRegsParam();
       }
 
       // Copy whatever did not fit in registers (everything past the first
       // 4*offset bytes) to the argument's stack slot.
       if (Flags.getByValSize() > 4*offset) {
         auto PtrVT = getPointerTy(DAG.getDataLayout());
         unsigned LocMemOffset = VA.getLocMemOffset();
         SDValue StkPtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
         SDValue Dst = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, StkPtrOff);
         SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
         SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
         SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
                                            MVT::i32);
         SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
                                             MVT::i32);
 
         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
                                           Ops));
       }
     } else if (!isSibCall) {
       // Plain stack argument. Nothing is stored for sibcalls.
       assert(VA.isMemLoc());
 
       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                              dl, DAG, VA, Flags));
     }
   }
 
   // Join all argument loads/stores into a single chain.
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   // Build a sequence of copy-to-reg nodes chained together with token chain
   // and flag operands which copy the outgoing args into the appropriate regs.
   SDValue InFlag;
   // Tail call byval lowering might overwrite argument registers so in case of
   // tail call optimization the copies to registers are lowered later.
   if (!isTailCall)
     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                                RegsToPass[i].second, InFlag);
       InFlag = Chain.getValue(1);
     }
 
   // For tail calls lower the arguments to the 'real' stack slot.
   if (isTailCall) {
     // Force all the incoming stack arguments to be loaded from the stack
     // before any new outgoing arguments are stored to the stack, because the
     // outgoing stack slots may alias the incoming argument stack slots, and
     // the alias isn't otherwise explicit. This is slightly more conservative
     // than necessary, because it means that each store effectively depends
     // on every argument instead of just those arguments it would clobber.
 
     // Do not flag preceding copytoreg stuff together with the following stuff.
     InFlag = SDValue();
     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                                RegsToPass[i].second, InFlag);
       InFlag = Chain.getValue(1);
     }
     // Drop the glue again so the tail-call node below is not glued to the
     // copies.
     InFlag = SDValue();
   }
 
   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
   // node so that legalize doesn't hack it.
   bool isDirect = false;
 
   const TargetMachine &TM = getTargetMachine();
   const Module *Mod = MF.getFunction().getParent();
   // GV stays null unless the callee is a GlobalAddress node.
   const GlobalValue *GV = nullptr;
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
     GV = G->getGlobal();
   // A stub is needed on MachO when the callee cannot be assumed DSO-local.
   bool isStub =
       !TM.shouldAssumeDSOLocal(*Mod, GV) && Subtarget->isTargetMachO();
 
   bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
   bool isLocalARMFunc = false;
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   auto PtrVt = getPointerTy(DAG.getDataLayout());
 
   if (Subtarget->genLongCalls()) {
     assert((!isPositionIndependent() || Subtarget->isTargetWindows()) &&
            "long-calls codegen is not position independent!");
     // Handle a global address or an external symbol. If it's not one of
     // those, the target's already in a register, so we don't need to do
     // anything extra.
     if (isa<GlobalAddressSDNode>(Callee)) {
       // Create a constant pool entry for the callee address
       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
       ARMConstantPoolValue *CPV =
         ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
 
       // Get the address of the callee into a register
       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
       Callee = DAG.getLoad(
           PtrVt, dl, DAG.getEntryNode(), CPAddr,
           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
     } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
       const char *Sym = S->getSymbol();
 
       // Create a constant pool entry for the callee address
       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
       ARMConstantPoolValue *CPV =
         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
                                       ARMPCLabelIndex, 0);
       // Get the address of the callee into a register
       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
       Callee = DAG.getLoad(
           PtrVt, dl, DAG.getEntryNode(), CPAddr,
           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
     }
   } else if (isa<GlobalAddressSDNode>(Callee)) {
     // If we're optimizing for minimum size and the function is called three or
     // more times in this block, we can improve codesize by calling indirectly
     // as BLXr has a 16-bit encoding.
     // Note: this GV shadows the outer GV declared above.
     // NOTE(review): CLI.CS is dereferenced here without the null check used
     // for the musttail test above - confirm it is always set on this path.
     auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
     auto *BB = CLI.CS.getParent();
     bool PreferIndirect =
         Subtarget->isThumb() && MF.getFunction().optForMinSize() &&
         count_if(GV->users(), [&BB](const User *U) {
           return isa<Instruction>(U) && cast<Instruction>(U)->getParent() == BB;
         }) > 2;
 
     // When the indirect form is preferred the callee is left untouched and
     // isDirect stays false.
     if (!PreferIndirect) {
       isDirect = true;
       bool isDef = GV->isStrongDefinitionForLinker();
 
       // ARM call to a local ARM function is predicable.
       isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
       // tBX takes a register source operand.
       if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
         assert(Subtarget->isTargetMachO() && "WrapperPIC use on non-MachO?");
         Callee = DAG.getNode(
             ARMISD::WrapperPIC, dl, PtrVt,
             DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, ARMII::MO_NONLAZY));
         Callee = DAG.getLoad(
             PtrVt, dl, DAG.getEntryNode(), Callee,
             MachinePointerInfo::getGOT(DAG.getMachineFunction()),
             /* Alignment = */ 0, MachineMemOperand::MODereferenceable |
                                      MachineMemOperand::MOInvariant);
       } else if (Subtarget->isTargetCOFF()) {
         assert(Subtarget->isTargetWindows() &&
                "Windows is the only supported COFF target");
         unsigned TargetFlags = GV->hasDLLImportStorageClass()
                                    ? ARMII::MO_DLLIMPORT
                                    : ARMII::MO_NO_FLAG;
         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, /*Offset=*/0,
                                             TargetFlags);
         // A dllimport callee is called through a load of the wrapped
         // address.
         if (GV->hasDLLImportStorageClass())
           Callee =
               DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
                           DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
                           MachinePointerInfo::getGOT(DAG.getMachineFunction()));
       } else {
         Callee = DAG.getTargetGlobalAddress(GV, dl, PtrVt, 0, 0);
       }
     }
   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     isDirect = true;
     // tBX takes a register source operand.
     const char *Sym = S->getSymbol();
     if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
       // Load the symbol's address from the constant pool and add the PIC
       // label so the call can go through a register.
       unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
       ARMConstantPoolValue *CPV =
         ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
                                       ARMPCLabelIndex, 4);
       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
       Callee = DAG.getLoad(
           PtrVt, dl, DAG.getEntryNode(), CPAddr,
           MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
       SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
       Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
     } else {
       Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
     }
   }
 
   // Select the call opcode.
   // FIXME: handle tail calls differently.
   unsigned CallOpc;
   if (Subtarget->isThumb()) {
     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
       CallOpc = ARMISD::CALL_NOLINK;
     else
       CallOpc = ARMISD::CALL;
   } else {
     if (!isDirect && !Subtarget->hasV5TOps())
       CallOpc = ARMISD::CALL_NOLINK;
     else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
              // Emit regular call when code size is the priority
              !MF.getFunction().optForMinSize())
       // "mov lr, pc; b _foo" to avoid confusing the RSP
       CallOpc = ARMISD::CALL_NOLINK;
     else
       CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
   }
 
   // Operand list of the call node: chain, callee, argument registers, then
   // (non-tail calls only) the register mask, then the glue if there is one.
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Callee);
 
   // Add argument registers to the end of the list so that they are known live
   // into the call.
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                   RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
   if (!isTailCall) {
     const uint32_t *Mask;
     const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
     if (isThisReturn) {
       // For 'this' returns, use the R0-preserving mask if applicable
       Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
       if (!Mask) {
         // Set isThisReturn to false if the calling convention is not one that
         // allows 'returned' to be modeled in this way, so LowerCallResult does
         // not try to pass 'this' straight through
         isThisReturn = false;
         Mask = ARI->getCallPreservedMask(MF, CallConv);
       }
     } else
       Mask = ARI->getCallPreservedMask(MF, CallConv);
 
     assert(Mask && "Missing call preserved mask for calling convention");
     Ops.push_back(DAG.getRegisterMask(Mask));
   }
 
   if (InFlag.getNode())
     Ops.push_back(InFlag);
 
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   if (isTailCall) {
     // Tail calls end here: no callseq_end and no result lowering.
     MF.getFrameInfo().setHasTailCall();
     return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
   }
 
   // Returns a chain and a flag for retval copy to use.
   Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, dl, true),
                              DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
   // Only pick up callseq_end's glue when there are results to copy out.
   if (!Ins.empty())
     InFlag = Chain.getValue(1);
 
   // Handle result values, copying them out of physregs into vregs that we
   // return.
   return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                          InVals, isThisReturn,
                          isThisReturn ? OutVals[0] : SDValue());
 }
 
 /// HandleByVal - Decide how a byval parameter is distributed between the
 /// GPR argument registers and the stack.
 ///
 /// Every parameter *after* a byval parameter is passed on the stack.
 /// Remember the next parameter register to allocate, and then confiscate
 /// the rest of the parameter registers to ensure this.
 ///
 /// \param State  Calling-convention state used to allocate registers and to
 ///               record the register range assigned to the byval parameter.
 /// \param Size   In: size of the byval parameter in bytes. Out: number of
 ///               bytes that still have to be passed in memory (zero when the
 ///               whole parameter fits in registers). Left untouched when no
 ///               register is assigned to the parameter.
 /// \param Align  Requested alignment in bytes; raised to at least 4.
 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                     unsigned Align) const {
   // Byval (as with any stack) slots are always at least 4 byte aligned.
   Align = std::max(Align, 4U);
 
   unsigned Reg = State->AllocateReg(GPRArgRegs);
   if (!Reg)
     return;
 
   // Skip registers until the first one used satisfies the alignment,
   // expressed in units of 4-byte registers.
   unsigned AlignInRegs = Align / 4;
   unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
   for (unsigned i = 0; i < Waste; ++i)
     Reg = State->AllocateReg(GPRArgRegs);
 
   if (!Reg)
     return;
 
   // Number of bytes that fit in the registers from Reg up to (not
   // including) R4.
   unsigned Excess = 4 * (ARM::R4 - Reg);
 
   // Special case when NSAA != SP and the parameter size is greater than the
   // size of all remaining GPR regs. In that case we can't split the
   // parameter, we must send it to the stack. We also must set NCRN to R4, so
   // waste all remaining registers.
   const unsigned NSAAOffset = State->getNextStackOffset();
   if (NSAAOffset != 0 && Size > Excess) {
     while (State->AllocateReg(GPRArgRegs))
       ;
     return;
   }
 
   // First register for byval parameter is the first register that wasn't
   // allocated before this method call, so it would be "reg".
   // If parameter is small enough to be saved in range [reg, r4), then
   // the end (first after last) register would be reg + param-size-in-regs,
   // else parameter would be split between registers and stack,
   // end register would be r4 in this case.
   unsigned ByValRegBegin = Reg;
   unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
   State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
   // Note, first register is allocated in the beginning of function already,
   // allocate the remaining registers we need.
   for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
     State->AllocateReg(GPRArgRegs);
   // A byval parameter that is split between registers and memory needs its
   // size truncated here.
   // In the case where the entire structure fits in registers, we set the
   // size in memory to zero.
   // Saturating subtraction done entirely in unsigned arithmetic: this avoids
   // depending on unsigned wraparound followed by a narrowing conversion to
   // int, which would also turn very large remainders into zero.
   Size = Size > Excess ? Size - Excess : 0;
 }
 
 /// MatchingStackOffset - Tell whether an outgoing stack argument is already
 /// stored at the matching (relative) position in the caller's own incoming
 /// argument area.
 ///
 /// \p Arg must either be a CopyFromReg of a virtual register defined by a
 /// stack-slot load, or a load whose base pointer is a frame index; byval
 /// arguments never match. The frame object found this way has to be a fixed
 /// object whose offset equals \p Offset and whose size equals the size of
 /// \p Arg in bytes.
 static
 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
                          MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
                          const TargetInstrInfo *TII) {
   const unsigned ArgBytes = Arg.getValueSizeInBits() / 8;
   // Sentinel; replaced by a real frame index on every path that survives.
   int FrameIdx = std::numeric_limits<int>::max();
 
   if (Arg.getOpcode() == ISD::CopyFromReg) {
     const unsigned VReg = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
     if (!TargetRegisterInfo::isVirtualRegister(VReg))
       return false;
     MachineInstr *DefMI = MRI->getVRegDef(VReg);
     if (!DefMI)
       return false;
     // Only a non-byval value reloaded from a stack slot qualifies;
     // isLoadFromStackSlot fills in FrameIdx on success.
     if (Flags.isByVal() || !TII->isLoadFromStackSlot(*DefMI, FrameIdx))
       return false;
   } else {
     LoadSDNode *Load = dyn_cast<LoadSDNode>(Arg);
     if (!Load)
       return false;
     // ByVal argument is passed in as a pointer but it's now being
     // dereferenced. e.g.
     // define @foo(%struct.X* %A) {
     //   tail call @bar(%struct.X* byval %A)
     // }
     if (Flags.isByVal())
       return false;
     SDValue BasePtr = Load->getBasePtr();
     FrameIndexSDNode *FrameNode = dyn_cast<FrameIndexSDNode>(BasePtr);
     if (!FrameNode)
       return false;
     FrameIdx = FrameNode->getIndex();
   }
 
   assert(FrameIdx != std::numeric_limits<int>::max());
   return MFI.isFixedObjectIndex(FrameIdx) &&
          Offset == MFI.getObjectOffset(FrameIdx) &&
          ArgBytes == MFI.getObjectSize(FrameIdx);
 }
 
 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
 /// for tail call optimization. Targets which want to do tail call
 /// optimization should implement this function.
 ///
 /// The checks are purely conservative: any condition that is not known to be
 /// safe makes the function return false.
 ///
 /// \param Callee            The node describing the call target.
 /// \param CalleeCC          Calling convention of the callee.
 /// \param isVarArg          Whether the call is variadic.
 /// \param isCalleeStructRet Whether the callee uses struct-return semantics.
 /// \param isCallerStructRet Whether the caller uses struct-return semantics.
 /// \param Outs              Descriptions of the outgoing arguments.
 /// \param OutVals           Values of the outgoing arguments.
 /// \param Ins               Descriptions of the values the call returns.
 /// \param DAG               The selection DAG being built.
 /// \returns true if none of the checks below rejects the call.
 bool
 ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                      CallingConv::ID CalleeCC,
                                                      bool isVarArg,
                                                      bool isCalleeStructRet,
                                                      bool isCallerStructRet,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
                                                      SelectionDAG& DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   const Function &CallerF = MF.getFunction();
   CallingConv::ID CallerCC = CallerF.getCallingConv();
 
   assert(Subtarget->supportsTailCall());
 
   // Tail calls to function pointers cannot be optimized for Thumb1 if the args
   // to the call take up r0-r3. The reason is that there are no legal registers
   // left to hold the pointer to the function to be called.
   if (Subtarget->isThumb1Only() && Outs.size() >= 4 &&
       !isa<GlobalAddressSDNode>(Callee.getNode()))
       return false;
 
   // Look for obvious safe cases to perform tail call optimization that do not
   // require ABI changes. This is what gcc calls sibcall.
 
   // Exception-handling functions need a special set of instructions to indicate
   // a return to the hardware. Tail-calling another function would probably
   // break this.
   if (CallerF.hasFnAttribute("interrupt"))
     return false;
 
   // Also avoid sibcall optimization if either caller or callee uses struct
   // return semantics.
   if (isCalleeStructRet || isCallerStructRet)
     return false;
 
   // Externally-defined functions with weak linkage should not be
   // tail-called on ARM when the OS does not support dynamic
   // pre-emption of symbols, as the AAELF spec requires normal calls
   // to undefined weak functions to be replaced with a NOP or jump to the
   // next instruction. The behaviour of branch instructions in this
   // situation (as used for tail calls) is implementation-defined, so we
   // cannot rely on the linker replacing the tail call with a return.
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     const GlobalValue *GV = G->getGlobal();
     const Triple &TT = getTargetMachine().getTargetTriple();
     if (GV->hasExternalWeakLinkage() &&
         (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
       return false;
   }
 
   // Check that the call results are passed in the same way.
   LLVMContext &C = *DAG.getContext();
   if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                   CCAssignFnForReturn(CalleeCC, isVarArg),
                                   CCAssignFnForReturn(CallerCC, isVarArg)))
     return false;
   // The callee has to preserve all registers the caller needs to preserve.
   // When both conventions are identical the masks are necessarily the same,
   // so the subset test is only run when they differ.
   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
   if (CalleeCC != CallerCC) {
     const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
     if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
       return false;
   }
 
   // If Caller's vararg or byval argument has been split between registers and
   // stack, do not perform tail call, since part of the argument is in caller's
   // local frame.
   const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
   if (AFI_Caller->getArgRegsSaveSize())
     return false;
 
   // If the callee takes no arguments then go on to check the results of the
   // call.
   if (!Outs.empty()) {
     // Check if stack adjustment is needed. For now, do not do this if any
     // argument is passed on the stack.
     SmallVector<CCValAssign, 16> ArgLocs;
     CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
     if (CCInfo.getNextStackOffset()) {
       // Check if the arguments are already laid out in the right way as
       // the caller's fixed stack objects.
       MachineFrameInfo &MFI = MF.getFrameInfo();
       const MachineRegisterInfo *MRI = &MF.getRegInfo();
       const TargetInstrInfo *TII = Subtarget->getInstrInfo();
       // i indexes ArgLocs while realArgIdx indexes Outs/OutVals. The two
       // diverge because a custom-lowered argument occupies several
       // consecutive ArgLocs entries, which are skipped with ++i below.
       for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
            i != e;
            ++i, ++realArgIdx) {
         CCValAssign &VA = ArgLocs[i];
         EVT RegVT = VA.getLocVT();
         SDValue Arg = OutVals[realArgIdx];
         ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
         if (VA.getLocInfo() == CCValAssign::Indirect)
           return false;
         if (VA.needsCustom()) {
           // f64 and vector types are split into multiple registers or
           // register/stack-slot combinations.  The types will not match
           // the registers; give up on memory f64 refs until we figure
           // out what to do about this.
           if (!VA.isRegLoc())
             return false;
           if (!ArgLocs[++i].isRegLoc())
             return false;
           // v2f64 takes two further locations on top of the two checked
           // above.
           if (RegVT == MVT::v2f64) {
             if (!ArgLocs[++i].isRegLoc())
               return false;
             if (!ArgLocs[++i].isRegLoc())
               return false;
           }
         } else if (!VA.isRegLoc()) {
           // Plain stack argument: it must already live at the same offset in
           // the caller's incoming argument area.
           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                    MFI, MRI, TII))
             return false;
         }
       }
     }
 
     // Delegate the check on arguments assigned to registers that the caller
     // must preserve to parametersInCSRMatch.
     const MachineRegisterInfo &MRI = MF.getRegInfo();
     if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
       return false;
   }
 
   return true;
 }
 
 /// Report whether the return values described by \p Outs can be lowered for
 /// calling convention \p CallConv. The decision is delegated to
 /// CCState::CheckReturn using the return-value assignment function selected
 /// for \p CallConv and \p isVarArg.
 bool
 ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                   MachineFunction &MF, bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   LLVMContext &Context) const {
   // Scratch list that receives the location assigned to each return value.
   SmallVector<CCValAssign, 16> ReturnLocs;
   CCState ReturnState(CallConv, isVarArg, MF, ReturnLocs, Context);
 
   const bool Lowerable =
       ReturnState.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
   return Lowerable;
 }
 
 /// Build the INTRET_FLAG node that returns from a function carrying the
 /// "interrupt" attribute.
 ///
 /// The LR adjustment implied by the attribute value is inserted into
 /// \p RetOps as operand #1 (directly after the chain) before the node is
 /// created. An unrecognised attribute value is a fatal error.
 static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
                                     const SDLoc &DL, SelectionDAG &DAG) {
   const Function &Fn = DAG.getMachineFunction().getFunction();
   const StringRef Kind = Fn.getFnAttribute("interrupt").getValueAsString();
 
   // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
   // version of the "preferred return address". These offsets affect the return
   // instruction if this is a return from PL1 without hypervisor extensions.
   //    IRQ/FIQ: +4     "subs pc, lr, #4"
   //    SWI:     0      "subs pc, lr, #0"
   //    ABORT:   +4     "subs pc, lr, #4"
   //    UNDEF:   +4/+2  "subs pc, lr, #0"
   // UNDEF varies depending on where the exception came from ARM or Thumb
   // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
 
   // An empty attribute value is treated like IRQ.
   const bool UsesOffset4 = Kind.empty() || Kind == "IRQ" || Kind == "FIQ" ||
                            Kind == "ABORT";
   const bool UsesOffset0 = Kind == "SWI" || Kind == "UNDEF";
   if (!UsesOffset4 && !UsesOffset0)
     report_fatal_error("Unsupported interrupt attribute. If present, value "
                        "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
 
   const int64_t LROffset = UsesOffset4 ? 4 : 0;
   SDValue OffsetOp = DAG.getConstant(LROffset, DL, MVT::i32, false);
   RetOps.insert(RetOps.begin() + 1, OffsetOp);
 
   return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
 }
 
 /// Lower a function return: copy every outgoing value into the register
 /// chosen by the calling convention and emit the return node.
 ///
 /// \param Chain    Incoming chain; each emitted copy is threaded onto it.
 /// \param CallConv Calling convention of the function being compiled.
 /// \param isVarArg Whether that function is variadic.
 /// \param Outs     Descriptions of the values being returned.
 /// \param OutVals  The values being returned.
 /// \param dl       Debug location attached to the created nodes.
 /// \param DAG      The selection DAG being built.
 /// \returns an ARMISD::RET_FLAG node, or the node produced by
 ///          LowerInterruptReturn for "interrupt" functions on non-M-class
 ///          subtargets.
 SDValue
 ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
                                const SmallVectorImpl<ISD::OutputArg> &Outs,
                                const SmallVectorImpl<SDValue> &OutVals,
                                const SDLoc &dl, SelectionDAG &DAG) const {
   // CCValAssign - represent the assignment of the return value to a location.
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slots.
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());
 
   // Analyze outgoing return values.
   CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
 
   SDValue Flag;
   SmallVector<SDValue, 4> RetOps;
   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
   bool isLittleEndian = Subtarget->isLittle();
 
   // Record how many locations carry the return value.
   MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   AFI->setReturnRegsCount(RVLocs.size());
 
   // Copy the result values into the output registers.
   // i indexes RVLocs and realRVLocIdx indexes OutVals; they diverge because a
   // custom-lowered value consumes several RVLocs entries (see ++i below).
   for (unsigned i = 0, realRVLocIdx = 0;
        i != RVLocs.size();
        ++i, ++realRVLocIdx) {
     CCValAssign &VA = RVLocs[i];
     assert(VA.isRegLoc() && "Can only return in registers!");
 
     SDValue Arg = OutVals[realRVLocIdx];
     bool ReturnF16 = false;
 
     if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
       // Half-precision return values can be returned like this:
       //
       // t11 f16 = fadd ...
       // t12: i16 = bitcast t11
       //   t13: i32 = zero_extend t12
       // t14: f32 = bitcast t13  <~~~~~~~ Arg
       //
       // to avoid code generation for bitcasts, we simply set Arg to the node
       // that produces the f16 value, t11 in this case.
       //
       if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
         SDValue ZE = Arg.getOperand(0);
         if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
           SDValue BC = ZE.getOperand(0);
           if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
             Arg = BC.getOperand(0);
             ReturnF16 = true;
           }
         }
       }
     }
 
     switch (VA.getLocInfo()) {
     default: llvm_unreachable("Unknown loc info!");
     case CCValAssign::Full: break;
     case CCValAssign::BCvt:
       // The f16 shortcut above already picked the final value, so no bitcast
       // is emitted in that case.
       if (!ReturnF16)
         Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
       break;
     }
 
     if (VA.needsCustom()) {
       // NOTE(review): VA is a reference to RVLocs[i], so each
       // "VA = RVLocs[++i]" below copies the next location over the current
       // element instead of rebinding VA. The overwritten entries are not
       // read again in this function.
       if (VA.getLocVT() == MVT::v2f64) {
         // Extract the first half and return it in two registers.
         SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                                    DAG.getConstant(0, dl, MVT::i32));
         SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                        DAG.getVTList(MVT::i32, MVT::i32), Half);
 
         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                  HalfGPRs.getValue(isLittleEndian ? 0 : 1),
                                  Flag);
         Flag = Chain.getValue(1);
         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
         VA = RVLocs[++i]; // skip ahead to next loc
         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                  HalfGPRs.getValue(isLittleEndian ? 1 : 0),
                                  Flag);
         Flag = Chain.getValue(1);
         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
         VA = RVLocs[++i]; // skip ahead to next loc
 
         // Extract the 2nd half and fall through to handle it as an f64 value.
         Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
                           DAG.getConstant(1, dl, MVT::i32));
       }
       // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
       // available.
       SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                                   DAG.getVTList(MVT::i32, MVT::i32), Arg);
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                fmrrd.getValue(isLittleEndian ? 0 : 1),
                                Flag);
       Flag = Chain.getValue(1);
       RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
       VA = RVLocs[++i]; // skip ahead to next loc
       // The glue update and register operand for this last copy are added by
       // the common code after the if/else.
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
                                fmrrd.getValue(isLittleEndian ? 1 : 0),
                                Flag);
     } else
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
 
     // Guarantee that all emitted copies are
     // stuck together, avoiding something bad.
     Flag = Chain.getValue(1);
     RetOps.push_back(DAG.getRegister(VA.getLocReg(),
                                      ReturnF16 ? MVT::f16 : VA.getLocVT()));
   }
   // Add the registers reported by getCalleeSavedRegsViaCopy (a
   // zero-terminated list, possibly null) as extra operands of the return.
   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const MCPhysReg *I =
       TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
   if (I) {
     for (; *I; ++I) {
       if (ARM::GPRRegClass.contains(*I))
         RetOps.push_back(DAG.getRegister(*I, MVT::i32));
       else if (ARM::DPRRegClass.contains(*I))
         RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
       else
         llvm_unreachable("Unexpected register class in CSRsViaCopy!");
     }
   }
 
   // Update chain and glue.
   RetOps[0] = Chain;
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
   // CPUs which aren't M-class use a special sequence to return from
   // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
   // though we use "subs pc, lr, #N").
   //
   // M-class CPUs actually use a normal return sequence with a special
   // (hardware-provided) value in LR, so the normal code path works.
   if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
       !Subtarget->isMClass()) {
     if (Subtarget->isThumb1Only())
       report_fatal_error("interrupt attribute is not supported in Thumb1");
     return LowerInterruptReturn(RetOps, dl, DAG);
   }
 
   return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
 }
 
 /// Check whether the single result of \p N is consumed only by a return,
 /// reached through one of three patterns: a direct CopyToReg, a VMOVRRD
 /// feeding CopyToReg nodes, or a BITCAST feeding one CopyToReg.
 ///
 /// On success \p Chain is replaced by the chain operand of the (first)
 /// CopyToReg and true is returned; otherwise \p Chain is left untouched and
 /// false is returned.
 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
     return false;
 
   // If a copy has a glue operand, we conservatively assume it isn't safe to
   // perform a tail call. Glue, when present, is the last operand.
   auto HasTrailingGlue = [](const SDNode *Node) {
     const unsigned LastIdx = Node->getNumOperands() - 1;
     return Node->getOperand(LastIdx).getValueType() == MVT::Glue;
   };
 
   SDValue NewChain = Chain;
   SDNode *Copy = *N->use_begin();
   switch (Copy->getOpcode()) {
   case ISD::CopyToReg:
     if (HasTrailingGlue(Copy))
       return false;
     NewChain = Copy->getOperand(0);
     break;
 
   case ARMISD::VMOVRRD: {
     // f64 returned in a pair of GPRs.
     SDNode *VMov = Copy;
     SmallPtrSet<SDNode*, 2> Copies;
     for (SDNode *User : VMov->uses()) {
       if (User->getOpcode() != ISD::CopyToReg)
         return false;
       Copies.insert(User);
     }
     if (Copies.size() > 2)
       return false;
 
     for (SDNode *User : VMov->uses()) {
       SDValue UserChain = User->getOperand(0);
       if (Copies.count(UserChain.getNode())) {
         // Second CopyToReg: it is chained onto the other copy.
         Copy = User;
         continue;
       }
       // First CopyToReg: we are at the top of this chain.
       if (HasTrailingGlue(User))
         return false;
       NewChain = UserChain;
     }
     break;
   }
 
   case ISD::BITCAST:
     // f32 returned in a single GPR.
     if (!Copy->hasOneUse())
       return false;
     Copy = *Copy->use_begin();
     if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
       return false;
     if (HasTrailingGlue(Copy))
       return false;
     NewChain = Copy->getOperand(0);
     break;
 
   default:
     return false;
   }
 
   // The final copy needs at least one user, and every user must be a return.
   if (Copy->use_empty())
     return false;
   for (SDNode *User : Copy->uses()) {
     const unsigned Opc = User->getOpcode();
     if (Opc != ARMISD::RET_FLAG && Opc != ARMISD::INTRET_FLAG)
       return false;
   }
 
   Chain = NewChain;
   return true;
 }
 
 bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   if (!Subtarget->supportsTailCall())
     return false;
 
   auto Attr =
       CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
   if (!CI->isTailCall() || Attr.getValueAsString() == "true")
     return false;
 
   return true;
 }
 
 // A WRITE_REGISTER of a 64 bit value cannot be emitted directly: the value
 // is split into two 32 bit halves, and the node is rebuilt with the low and
 // high parts passed as separate operands.
 static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
   SDValue Value64 = Op->getOperand(2);
 
   // This function is only supposed to be called for i64 type argument.
   assert(Value64.getValueType() == MVT::i64
           && "LowerWRITE_REGISTER called for non-i64 type argument.");
 
   // Extracts the 32 bit element with index Idx from the 64 bit value.
   auto ExtractHalf = [&](unsigned Idx) {
     return DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Value64,
                        DAG.getConstant(Idx, DL, MVT::i32));
   };
   SDValue Lo = ExtractHalf(0);
   SDValue Hi = ExtractHalf(1);
 
   // Operands 0 and 1 of the original node are forwarded unchanged.
   SDValue NewOps[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
   return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, NewOps);
 }
 
 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol nodes are
 // lowered to their target counterpart wrapped in an ARMISD::Wrapper node.
 // The wrapping is required because Select(N) returns N for such a node N, so
 // the raw TargetGlobalAddress nodes, etc. can only be used to form an
 // addressing mode. The wrapped nodes will be selected into MOVi.
 SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
                                              SelectionDAG &DAG) const {
   EVT PtrVT = Op.getValueType();
   // FIXME there is no actual debug info here
   SDLoc dl(Op);
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
 
   // When generating execute-only code Constant Pools must be promoted to the
   // global data section. It's a bit ugly that we can't share them across basic
   // blocks, but this way we guarantee that execute-only behaves correct with
   // position-independent addressing modes.
   if (Subtarget->genExecuteOnly()) {
     MachineFunction &MF = DAG.getMachineFunction();
     ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
     Type *ValueTy = const_cast<Type*>(CP->getType());
     Constant *Init = const_cast<Constant*>(CP->getConstVal());
     Module *Mod = const_cast<Module*>(MF.getFunction().getParent());
     // The global is named <private prefix>CP<function number>_<label id>.
     // The Twine expression is kept inside the call so that its temporaries
     // stay alive while the name is read.
     GlobalVariable *PromotedCP = new GlobalVariable(
         *Mod, ValueTy, /*isConst=*/true, GlobalVariable::InternalLinkage,
         Init,
         Twine(DAG.getDataLayout().getPrivateGlobalPrefix()) + "CP" +
         Twine(MF.getFunctionNumber()) + "_" +
         Twine(FuncInfo->createPICLabelUId()));
     SDValue PromotedAddr = DAG.getTargetGlobalAddress(PromotedCP, dl, PtrVT);
     return LowerGlobalAddress(PromotedAddr, DAG);
   }
 
   // Regular path: emit the matching target constant-pool node and wrap it.
   SDValue PoolNode =
       CP->isMachineConstantPoolEntry()
           ? DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
                                       CP->getAlignment())
           : DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
                                       CP->getAlignment());
   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, PoolNode);
 }
 
 /// Return the jump table entry encoding used by this target, which is
 /// always MachineJumpTableInfo::EK_Inline.
 unsigned ARMTargetLowering::getJumpTableEncoding() const {
   return MachineJumpTableInfo::EK_Inline;
 }
 
 /// Lower a BlockAddress node to a load of the address from the constant
 /// pool. In position-independent or ROPI mode the pool entry is a
 /// PC-relative ARMConstantPoolConstant, and the loaded value is completed
 /// with an ARMISD::PIC_ADD against a fresh PIC label.
 SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDLoc DL(Op);
   MachineFunction &MF = DAG.getMachineFunction();
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   const bool NeedsPCRelative = isPositionIndependent() || Subtarget->isROPI();
 
   unsigned PICLabelId = 0;
   SDValue PoolEntry;
   if (NeedsPCRelative) {
     // PC adjustment applied to the entry: 4 in Thumb mode, 8 otherwise.
     const unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
     PICLabelId = MF.getInfo<ARMFunctionInfo>()->createPICLabelUId();
     ARMConstantPoolValue *CPV =
       ARMConstantPoolConstant::Create(BA, PICLabelId,
                                       ARMCP::CPBlockAddress, PCAdj);
     PoolEntry = DAG.getTargetConstantPool(CPV, PtrVT, 4);
   } else {
     PoolEntry = DAG.getTargetConstantPool(BA, PtrVT, 4);
   }
 
   SDValue PoolAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, PoolEntry);
   SDValue Loaded = DAG.getLoad(
       PtrVT, DL, DAG.getEntryNode(), PoolAddr,
       MachinePointerInfo::getConstantPool(MF));
   if (!NeedsPCRelative)
     return Loaded;
 
   SDValue LabelOp = DAG.getConstant(PICLabelId, DL, MVT::i32);
   return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Loaded, LabelOp);
 }
 
 /// Convert a TLS address reference into the correct sequence of loads
 /// and calls to compute the variable's address for Darwin, and return an
 /// SDValue containing the final node.
 ///
 /// Darwin only has one TLS scheme which must be capable of dealing with the
 /// fully general situation, in the worst case. This means:
 ///     + "extern __thread" declaration.
 ///     + Defined in a possibly unknown dynamic library.
 ///
 /// The general system is that each __thread variable has a [3 x i32] descriptor
 /// which contains information used by the runtime to calculate the address. The
 /// only part of this the compiler needs to know about is the first word, which
 /// contains a function pointer that must be called with the address of the
 /// entire descriptor in "r0".
 ///
 /// Since this descriptor may be in a different unit, in general access must
 /// proceed along the usual ARM rules. A common sequence to produce is:
 ///
 ///     movw rT1, :lower16:_var$non_lazy_ptr
 ///     movt rT1, :upper16:_var$non_lazy_ptr
 ///     ldr r0, [rT1]
 ///     ldr rT2, [r0]
 ///     blx rT2
 ///     [...address now in r0...]
 ///
 /// \param Op  The TLS address node being lowered; it is handed unchanged to
 ///            LowerGlobalAddressDarwin to obtain the descriptor address.
 /// \param DAG The selection DAG being built.
 /// \returns the CopyFromReg node that reads the variable's address from R0
 ///          after the call.
 SDValue
 ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
                                                SelectionDAG &DAG) const {
   assert(Subtarget->isTargetDarwin() &&
          "This function expects a Darwin target");
   SDLoc DL(Op);
 
   // First step is to get the address of the actual global symbol. This is
   // where the TLS descriptor lives.
   SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
 
   // The first entry in the descriptor is a function pointer that we must call
   // to obtain the address of the variable.
   SDValue Chain = DAG.getEntryNode();
   SDValue FuncTLVGet = DAG.getLoad(
       MVT::i32, DL, Chain, DescAddr,
       MachinePointerInfo::getGOT(DAG.getMachineFunction()),
       /* Alignment = */ 4,
       MachineMemOperand::MONonTemporal | MachineMemOperand::MODereferenceable |
           MachineMemOperand::MOInvariant);
   Chain = FuncTLVGet.getValue(1);
 
   // Record that this function adjusts the stack, since a call is emitted
   // below.
   MachineFunction &F = DAG.getMachineFunction();
   MachineFrameInfo &MFI = F.getFrameInfo();
   MFI.setAdjustsStack(true);
 
   // TLS calls preserve all registers except those that absolutely must be
   // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
   // silly).
   auto TRI =
       getTargetMachine().getSubtargetImpl(F.getFunction())->getRegisterInfo();
   auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
   const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
 
   // Finally, we can make the call. This is just a degenerate version of a
   // normal ARM call node: r0 takes the address of the descriptor, and
   // returns the address of the variable in this thread.
   // The copy into R0, the call and the copy out of R0 are tied together by
   // passing each node's glue result (value #1) to the next one.
   Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
   Chain =
       DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
                   Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
                   DAG.getRegisterMask(Mask), Chain.getValue(1));
   return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
 }
 
 /// Lower a TLS address reference for Windows targets.
 ///
 /// The address is computed as
 ///   *(*(TEB + 0x2c) + (_tls_index << 2)) + SECREL(variable)
 /// that is: read the TLS array pointer from the thread environment block,
 /// index it with the module's TLS index, and add the variable's
 /// section-relative offset loaded from the constant pool.
 ///
 /// \param Op  The GlobalAddress node of the thread-local variable.
 /// \param DAG The selection DAG being built.
 /// \returns the ADD node producing the variable's address.
 SDValue
 ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
                                                 SelectionDAG &DAG) const {
   assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
 
   SDValue Chain = DAG.getEntryNode();
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   SDLoc DL(Op);
 
   // Load the current TEB (thread environment block)
   // This is done with the arm_mrc intrinsic, called with the operands
   // (15, 0, 13, 0, 2).
   SDValue Ops[] = {Chain,
                    DAG.getConstant(Intrinsic::arm_mrc, DL, MVT::i32),
                    DAG.getConstant(15, DL, MVT::i32),
                    DAG.getConstant(0, DL, MVT::i32),
                    DAG.getConstant(13, DL, MVT::i32),
                    DAG.getConstant(0, DL, MVT::i32),
                    DAG.getConstant(2, DL, MVT::i32)};
   SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
                                    DAG.getVTList(MVT::i32, MVT::Other), Ops);
 
   // Value #0 is the TEB pointer, value #1 the output chain; every load below
   // is chained after the intrinsic through it.
   SDValue TEB = CurrentTEB.getValue(0);
   Chain = CurrentTEB.getValue(1);
 
   // Load the ThreadLocalStoragePointer from the TEB
   // A pointer to the TLS array is located at offset 0x2c from the TEB.
   SDValue TLSArray =
       DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
   TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
 
   // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
   // offset into the TLSArray.
 
   // Load the TLS index from the C runtime
   SDValue TLSIndex =
       DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
   TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
   TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
 
   // Scale the index by 4 (shift left by 2) and load the slot it selects.
   SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
                               DAG.getConstant(2, DL, MVT::i32));
   SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
                             DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
                             MachinePointerInfo());
 
   // Get the offset of the start of the .tls section (section base)
   // The offset is materialised as an ARMCP::SECREL constant-pool entry for
   // the global and loaded from there.
   const auto *GA = cast<GlobalAddressSDNode>(Op);
   auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
   SDValue Offset = DAG.getLoad(
       PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
                                     DAG.getTargetConstantPool(CPV, PtrVT, 4)),
       MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
 
   return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
 }
 
 // Lower ISD::GlobalTLSAddress using the "general dynamic" model: load a
 // PC-relative TLSGD constant-pool entry for the variable, complete it with a
 // PIC_ADD, and pass the result to __tls_get_addr, whose return value is the
 // lowered address.
 SDValue
 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                  SelectionDAG &DAG) const {
   SDLoc dl(GA);
   MachineFunction &MF = DAG.getMachineFunction();
   LLVMContext &Ctx = *DAG.getContext();
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
 
   // PC adjustment applied to the entry: 4 in Thumb mode, 8 otherwise.
   const unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
   const unsigned PICLabelId =
       MF.getInfo<ARMFunctionInfo>()->createPICLabelUId();
 
   // Load the constant-pool entry describing the variable.
   ARMConstantPoolValue *CPV =
     ARMConstantPoolConstant::Create(GA->getGlobal(), PICLabelId,
                                     ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
   SDValue PoolEntry = DAG.getTargetConstantPool(CPV, PtrVT, 4);
   SDValue PoolAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, PoolEntry);
   SDValue Loaded = DAG.getLoad(
       PtrVT, dl, DAG.getEntryNode(), PoolAddr,
       MachinePointerInfo::getConstantPool(MF));
   SDValue Chain = Loaded.getValue(1);
 
   // Complete the loaded value against the PIC label to form the argument.
   SDValue LabelOp = DAG.getConstant(PICLabelId, dl, MVT::i32);
   SDValue TLSArg = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Loaded, LabelOp);
 
   // call __tls_get_addr.
   ArgListTy Args;
   ArgListEntry Entry;
   Entry.Node = TLSArg;
   Entry.Ty = Type::getInt32Ty(Ctx);
   Args.push_back(Entry);
 
   // FIXME: is there useful debug info available here?
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
       CallingConv::C, Type::getInt32Ty(Ctx),
       DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
 
   // Only the returned value is needed; the output chain is dropped.
   return LowerCallTo(CLI).first;
 }
 
 // Lower ISD::GlobalTLSAddress using the "initial exec" or
 // "local exec" model. Both produce THREAD_POINTER + Offset; they differ only
 // in how Offset is obtained from the constant pool.
 SDValue
 ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                         SelectionDAG &DAG,
                                         TLSModel::Model model) const {
   SDLoc dl(GA);
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   MachineFunction &MF = DAG.getMachineFunction();
   const GlobalValue *GV = GA->getGlobal();
   const SDValue EntryChain = DAG.getEntryNode();
 
   // Get the Thread Pointer
   SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
 
   // Pointer-sized load tagged with constant-pool pointer info, as used by
   // every load emitted below.
   auto LoadFromPool = [&](SDValue ChainIn, SDValue Addr) {
     return DAG.getLoad(PtrVT, dl, ChainIn, Addr,
                        MachinePointerInfo::getConstantPool(MF));
   };
 
   SDValue Offset;
   if (model != TLSModel::InitialExec) {
     // local exec model
     assert(model == TLSModel::LocalExec);
     ARMConstantPoolValue *CPV =
         ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
     Offset = LoadFromPool(EntryChain, CPAddr);
   } else {
     // Initial exec model.
     const unsigned ARMPCLabelIndex =
         MF.getInfo<ARMFunctionInfo>()->createPICLabelUId();
     const unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
     ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
         GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true);
     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
 
     // First load fetches the pool entry; its chain orders the second load.
     SDValue PoolEntry = LoadFromPool(EntryChain, CPAddr);
     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
     SDValue SlotAddr =
         DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, PoolEntry, PICLabel);
     Offset = LoadFromPool(PoolEntry.getValue(1), SlotAddr);
   }
 
   // The address of the thread local variable is the add of the thread
   // pointer with the offset of the variable.
   return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
 }
 
 /// Pick the lowering for an ISD::GlobalTLSAddress node: emulated TLS first,
 /// then the Darwin and Windows schemes, and finally the ELF TLS models.
 SDValue
 ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
   auto *TLSNode = cast<GlobalAddressSDNode>(Op);
 
   // Emulated TLS is checked before any target-specific scheme.
   if (DAG.getTarget().useEmulatedTLS())
     return LowerToTLSEmulatedModel(TLSNode, DAG);
 
   if (Subtarget->isTargetDarwin())
     return LowerGlobalTLSAddressDarwin(Op, DAG);
 
   if (Subtarget->isTargetWindows())
     return LowerGlobalTLSAddressWindows(Op, DAG);
 
   // TODO: implement the "local dynamic" model
   assert(Subtarget->isTargetELF() && "Only ELF implemented here");
   const TLSModel::Model SelectedModel =
       getTargetMachine().getTLSModel(TLSNode->getGlobal());
 
   switch (SelectedModel) {
     case TLSModel::InitialExec:
     case TLSModel::LocalExec:
       return LowerToTLSExecModels(TLSNode, DAG, SelectedModel);
     case TLSModel::GeneralDynamic:
     case TLSModel::LocalDynamic:
       // Local dynamic currently shares the general dynamic lowering.
       return LowerToTLSGeneralDynamicModel(TLSNode, DAG);
   }
   llvm_unreachable("bogus TLS model");
 }
 
 /// Return true if all users of V are within function F, looking through
 /// ConstantExprs.
 static bool allUsersAreInFunction(const Value *V, const Function *F) {
   SmallVector<const User*,4> Worklist;
   for (auto *U : V->users())
     Worklist.push_back(U);
   while (!Worklist.empty()) {
     auto *U = Worklist.pop_back_val();
     if (isa<ConstantExpr>(U)) {
       for (auto *UU : U->users())
         Worklist.push_back(UU);
       continue;
     }
 
     auto *I = dyn_cast<Instruction>(U);
     if (!I || I->getParent()->getParent() != F)
       return false;
   }
   return true;
 }
 
 /// Return true if all users of V are within some (any) function, looking through
 /// ConstantExprs. In other words, are there any global constant users?
 static bool allUsersAreInFunctions(const Value *V) {
   SmallVector<const User*,4> Worklist;
   for (auto *U : V->users())
     Worklist.push_back(U);
   while (!Worklist.empty()) {
     auto *U = Worklist.pop_back_val();
     if (isa<ConstantExpr>(U)) {
       for (auto *UU : U->users())
         Worklist.push_back(UU);
       continue;
     }
 
     if (!isa<Instruction>(U))
       return false;
   }
   return true;
 }
 
 // Return true if T is an integer or floating-point type, or an array/vector
 // whose element type is one of those. Only one level of nesting is examined.
 static bool isSimpleType(Type *T) {
   auto IsIntOrFP = [](Type *Ty) {
     return Ty->isIntegerTy() || Ty->isFloatingPointTy();
   };
 
   if (IsIntOrFP(T))
     return true;
   if (T->isArrayTy())
     return IsIntOrFP(T->getArrayElementType());
   if (T->isVectorTy())
     return IsIntOrFP(T->getVectorElementType());
   return false;
 }
 
 /// Try to emit the initializer of global \p GV directly into the current
 /// function's constant pool instead of referencing the global itself.
 ///
 /// \returns an ARMISD::Wrapper node around the new constant-pool entry on
 /// success, or an empty SDValue when promotion is disabled or \p GV does not
 /// satisfy the checks below.
 static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
                                      EVT PtrVT, const SDLoc &dl) {
   // If we're creating a pool entry for a constant global with unnamed address,
   // and the global is small enough, we can emit it inline into the constant pool
   // to save ourselves an indirection.
   //
   // This is a win if the constant is only used in one function (so it doesn't
   // need to be duplicated) or duplicating the constant wouldn't increase code
   // size (implying the constant is no larger than 4 bytes).
   const Function &F = DAG.getMachineFunction().getFunction();
 
   // We rely on this decision to inline being idempotent and unrelated to the
   // use-site. We know that if we inline a variable at one use site, we'll
   // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
   // doesn't know about this optimization, so bail out if it's enabled else
   // we could decide to inline here (and thus never emit the GV) but require
   // the GV from fast-isel generated code.
   if (!EnableConstpoolPromotion ||
       DAG.getMachineFunction().getTarget().Options.EnableFastISel)
       return SDValue();
 
   // Only constant, initialized, local-linkage global variables with a global
   // unnamed_addr are candidates.
   auto *GVar = dyn_cast<GlobalVariable>(GV);
   if (!GVar || !GVar->hasInitializer() ||
       !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
       !GVar->hasLocalLinkage())
     return SDValue();
 
   // Ensure that we don't try and inline any type that contains pointers. If
   // we inline a value that contains relocations, we move the relocations from
   // .data to .text which is not ideal.
   auto *Init = GVar->getInitializer();
   if (!isSimpleType(Init->getType()))
     return SDValue();
 
   // The constant islands pass can only really deal with alignment requests
   // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
   // any type wanting greater alignment requirements than 4 bytes. We also
   // can only promote constants that are multiples of 4 bytes in size or
   // are paddable to a multiple of 4. Currently we only try and pad constants
   // that are strings for simplicity.
   auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
   unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
   unsigned Align = GVar->getAlignment();
   // RequiredPadding is in [1, 4]; the value 4 means Size is already a
   // multiple of 4 and no padding is needed.
   unsigned RequiredPadding = 4 - (Size % 4);
   bool PaddingPossible =
     RequiredPadding == 4 || (CDAInit && CDAInit->isString());
   if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
       Size == 0)
     return SDValue();
 
   // Size after padding; computed here because the padding loop further down
   // consumes RequiredPadding.
   unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
   MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
   // We can't bloat the constant pool too much, else the ConstantIslands pass
   // may fail to converge. If we haven't promoted this global yet (it may have
   // multiple uses), and promoting it would increase the constant pool size
   // (Size > 4), ensure we have space to do so up to
   // ConstpoolPromotionMaxTotal.
   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
     if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
         ConstpoolPromotionMaxTotal)
       return SDValue();
 
   // This is only valid if all users are in a single function OR it has users
   // in multiple functions but is no larger than a pointer. We also check if
   // GVar has constant (non-ConstantExpr) users. If so, it essentially has its
   // address taken.
   if (!allUsersAreInFunction(GVar, &F) &&
       !(Size <= 4 && allUsersAreInFunctions(GVar)))
     return SDValue();
 
   // We're going to inline this global. Pad it out if needed.
   // RequiredPadding != 4 implies CDAInit is a non-null string initializer,
   // because PaddingPossible was checked above.
   if (RequiredPadding != 4) {
     StringRef S = CDAInit->getAsString();
 
     // Copy the string bytes and append RequiredPadding zero bytes.
     SmallVector<uint8_t,16> V(S.size());
     std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
     while (RequiredPadding--)
       V.push_back(0);
     Init = ConstantDataArray::get(*DAG.getContext(), V);
   }
 
   auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
   SDValue CPAddr =
     DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
   // Record the promotion and charge its size increase only the first time
   // this global is promoted in the current function.
   if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
     AFI->markGlobalAsPromotedToConstantPool(GVar);
     AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
                                       PaddedSize - 4);
   }
   ++NumConstpoolPromoted;
   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
 }
 
 bool ARMTargetLowering::isReadOnly(const GlobalValue *GV) const {
   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
     GV = GA->getBaseObject();
   return (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
          isa<Function>(GV);
 }
 
 /// Dispatch global-address lowering on the object format of the target
 /// triple. Formats other than ELF, MachO and COFF are not handled.
 SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
   const auto Format = Subtarget->getTargetTriple().getObjectFormat();
   switch (Format) {
   case Triple::ELF:
     return LowerGlobalAddressELF(Op, DAG);
   case Triple::MachO:
     return LowerGlobalAddressDarwin(Op, DAG);
   case Triple::COFF:
     return LowerGlobalAddressWindows(Op, DAG);
   default:
     break;
   }
   llvm_unreachable("unknown object format");
 }
 
 /// Lower a global address for ELF targets. In order, this tries constant-pool
 /// promotion, the PIC sequence, the ROPI (PC-relative) and RWPI (R9-relative)
 /// sequences, and finally an absolute address via movw/movt or a literal-pool
 /// load.
 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                  SelectionDAG &DAG) const {
   SDLoc dl(Op);
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   MachineFunction &MF = DAG.getMachineFunction();
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   const bool IsRO = isReadOnly(GV);
   const bool IsDSOLocal =
       getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
 
   // promoteToConstantPool only if not generating XO text section
   if (IsDSOLocal && !Subtarget->genExecuteOnly())
     if (SDValue V = promoteToConstantPool(GV, DAG, PtrVT, dl))
       return V;
 
   if (isPositionIndependent()) {
     // Globals not assumed DSO-local are reached through a GOT entry.
     const bool UseGOT_PREL = !IsDSOLocal;
     SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
                                            UseGOT_PREL ? ARMII::MO_GOT : 0);
     SDValue Addr = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
     if (!UseGOT_PREL)
       return Addr;
     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Addr,
                        MachinePointerInfo::getGOT(MF));
   }
 
   if (Subtarget->isROPI() && IsRO) {
     // PC-relative.
     SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
     return DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
   }
 
   if (Subtarget->isRWPI() && !IsRO) {
     // SB-relative.
     SDValue RelAddr;
     if (Subtarget->useMovt(MF)) {
       ++NumMovwMovt;
       SDValue G =
           DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
       RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
     } else { // use literal pool for address constant
       ARMConstantPoolValue *CPV =
           ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
       SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
       CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
       RelAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                             MachinePointerInfo::getConstantPool(MF));
     }
     // The base is read from R9 and added to the relative address.
     SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
     return DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
   }
 
   // If we have T2 ops, we can materialize the address directly via movt/movw
   // pair. This is always cheaper.
   if (Subtarget->useMovt(MF)) {
     ++NumMovwMovt;
     // FIXME: Once remat is capable of dealing with instructions with register
     // operands, expand this into two nodes.
     return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
                        DAG.getTargetGlobalAddress(GV, dl, PtrVT));
   }
 
   // Otherwise load the address from a constant-pool entry.
   SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
   CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                      MachinePointerInfo::getConstantPool(MF));
 }
 
 /// Lower a global address for Darwin (MachO). The address is wrapped with
 /// WrapperPIC or Wrapper depending on position independence, and loaded
 /// through an extra indirection when the subtarget reports the global as an
 /// indirect symbol.
 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                     SelectionDAG &DAG) const {
   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
          "ROPI/RWPI not currently supported for Darwin");
   SDLoc dl(Op);
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   MachineFunction &MF = DAG.getMachineFunction();
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
 
   // Statistics only; the generated nodes do not depend on this check.
   if (Subtarget->useMovt(MF))
     ++NumMovwMovt;
 
   // FIXME: Once remat is capable of dealing with instructions with register
   // operands, expand this into multiple nodes
   const unsigned WrapperOpc =
       isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
   SDValue Addr = DAG.getNode(
       WrapperOpc, dl, PtrVT,
       DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY));
 
   if (!Subtarget->isGVIndirectSymbol(GV))
     return Addr;
   return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Addr,
                      MachinePointerInfo::getGOT(MF));
 }
 
 /// Lower a global address for Windows (COFF). The address is always wrapped
 /// in ARMISD::Wrapper; dllimport globals additionally get the MO_DLLIMPORT
 /// flag and an extra load.
 SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
                                                      SelectionDAG &DAG) const {
   assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
   assert(Subtarget->useMovt(DAG.getMachineFunction()) &&
          "Windows on ARM expects to use movw/movt");
   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
          "ROPI/RWPI not currently supported for Windows");
 
   SDLoc DL(Op);
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   const bool IsDLLImport = GV->hasDLLImportStorageClass();
   const ARMII::TOF TargetFlags =
       IsDLLImport ? ARMII::MO_DLLIMPORT : ARMII::MO_NO_FLAG;
 
   ++NumMovwMovt;
 
   // FIXME: Once remat is capable of dealing with instructions with register
   // operands, expand this into two nodes.
   SDValue Addr = DAG.getNode(
       ARMISD::Wrapper, DL, PtrVT,
       DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*Offset=*/0, TargetFlags));
 
   if (!IsDLLImport)
     return Addr;
   return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Addr,
                      MachinePointerInfo::getGOT(DAG.getMachineFunction()));
 }
 
 /// Rewrite the node as ARMISD::EH_SJLJ_SETJMP, forwarding the first two
 /// operands and appending a constant-zero i32 operand. The new node produces
 /// an i32 value and a chain.
 SDValue
 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
   SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
   SDValue First = Op.getOperand(0);
   SDValue Second = Op.getOperand(1);
   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, ResultTys, First, Second,
                      Zero);
 }
 
 /// Rewrite the node as ARMISD::EH_SJLJ_LONGJMP, forwarding the first two
 /// operands and appending a constant-zero i32 operand. Only a chain is
 /// produced.
 SDValue
 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   SDValue First = Op.getOperand(0);
   SDValue Second = Op.getOperand(1);
   SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, First, Second,
                      Zero);
 }
 
 /// Rewrite the node as ARMISD::EH_SJLJ_SETUP_DISPATCH, keeping only its first
 /// operand. Only a chain is produced.
 SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
   SDLoc dl(Op);
   SDValue First = Op.getOperand(0);
   return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other, First);
 }
 
 /// Custom-lower the chainless intrinsics listed in the switch below. Operand 0
 /// carries the intrinsic ID; an empty SDValue is returned for any intrinsic
 /// that is not custom lowered.
 SDValue
 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *Subtarget) const {
   const unsigned IntNo =
       cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   SDLoc dl(Op);
   const EVT VT = Op.getValueType();
 
   // Most cases map to a two-operand node with the same result type.
   auto LowerBinary = [&](unsigned Opc) {
     return DAG.getNode(Opc, dl, VT, Op.getOperand(1), Op.getOperand(2));
   };
 
   switch (IntNo) {
   default:
     // Don't custom lower most intrinsics.
     return SDValue();
 
   case Intrinsic::thread_pointer:
     return DAG.getNode(ARMISD::THREAD_POINTER, dl,
                        getPointerTy(DAG.getDataLayout()));
 
   case Intrinsic::eh_sjlj_lsda: {
     MachineFunction &MF = DAG.getMachineFunction();
     EVT PtrVT = getPointerTy(DAG.getDataLayout());
     const unsigned ARMPCLabelIndex =
         MF.getInfo<ARMFunctionInfo>()->createPICLabelUId();
     const bool IsPositionIndependent = isPositionIndependent();
 
     // The PC adjustment is only applied in position-independent code.
     unsigned PCAdj = 0;
     if (IsPositionIndependent)
       PCAdj = Subtarget->isThumb() ? 4 : 8;
 
     ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
         &MF.getFunction(), ARMPCLabelIndex, ARMCP::CPLSDA, PCAdj);
     SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
     CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
     SDValue LSDA = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
                                MachinePointerInfo::getConstantPool(MF));
     if (!IsPositionIndependent)
       return LSDA;
 
     SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
     return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, LSDA, PICLabel);
   }
 
   case Intrinsic::arm_neon_vabs:
     return DAG.getNode(ISD::ABS, dl, VT, Op.getOperand(1));
 
   case Intrinsic::arm_neon_vmulls:
     return LowerBinary(ARMISD::VMULLs);
   case Intrinsic::arm_neon_vmullu:
     return LowerBinary(ARMISD::VMULLu);
 
   case Intrinsic::arm_neon_vminnm:
     return LowerBinary(ISD::FMINNUM);
   case Intrinsic::arm_neon_vmaxnm:
     return LowerBinary(ISD::FMAXNUM);
 
   case Intrinsic::arm_neon_vminu:
   case Intrinsic::arm_neon_vmaxu:
     // Floating-point result types are not custom lowered here.
     if (VT.isFloatingPoint())
       return SDValue();
     return LowerBinary(IntNo == Intrinsic::arm_neon_vminu ? ISD::UMIN
                                                           : ISD::UMAX);
 
   case Intrinsic::arm_neon_vmins:
   case Intrinsic::arm_neon_vmaxs: {
     // v{min,max}s is overloaded between signed integers and floats.
     const bool IsMin = IntNo == Intrinsic::arm_neon_vmins;
     if (VT.isFloatingPoint())
       return LowerBinary(IsMin ? ISD::FMINNAN : ISD::FMAXNAN);
     return LowerBinary(IsMin ? ISD::SMIN : ISD::SMAX);
   }
 
   case Intrinsic::arm_neon_vtbl1:
     return LowerBinary(ARMISD::VTBL1);
   case Intrinsic::arm_neon_vtbl2:
     return DAG.getNode(ARMISD::VTBL2, dl, VT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
   }
 }
 
 /// Lower ISD::ATOMIC_FENCE. Single-thread fences are returned unchanged;
 /// subtargets without a data barrier get ARMISD::MEMBARRIER_MCR; everything
 /// else becomes an arm_dmb intrinsic whose domain depends on the subtarget
 /// and the requested ordering.
 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
   SDLoc dl(Op);
   SDValue Chain = Op.getOperand(0);
 
   const auto SSID = static_cast<SyncScope::ID>(
       cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
   if (SSID == SyncScope::SingleThread)
     return Op;
 
   if (!Subtarget->hasDataBarrier()) {
     // Some ARMv6 cpus can support data barriers with an mcr instruction.
     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
     // here.
     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
            "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Chain,
                        DAG.getConstant(0, dl, MVT::i32));
   }
 
   const auto Ord = static_cast<AtomicOrdering>(
       cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
 
   ARM_MB::MemBOpt Domain;
   if (Subtarget->isMClass()) {
     // Only a full system barrier exists in the M-class architectures.
     Domain = ARM_MB::SY;
   } else if (Subtarget->preferISHSTBarriers() &&
              Ord == AtomicOrdering::Release) {
     // Swift happens to implement ISHST barriers in a way that's compatible with
     // Release semantics but weaker than ISH so we'd be fools not to use
     // it. Beware: other processors probably don't!
     Domain = ARM_MB::ISHST;
   } else {
     Domain = ARM_MB::ISH;
   }
 
   return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Chain,
                      DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
                      DAG.getConstant(Domain, dl, MVT::i32));
 }
 
 /// Lower ISD::PREFETCH to ARMISD::PRELOAD. When the subtarget cannot express
 /// the requested prefetch, the node is dropped and only its chain (operand 0)
 /// is returned.
 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
                              const ARMSubtarget *Subtarget) {
   SDValue Chain = Op.getOperand(0);
 
   // ARM pre v5TE and Thumb1 does not have preload instructions.
   if (!Subtarget->isThumb2() &&
       (Subtarget->isThumb1Only() || !Subtarget->hasV5TEOps()))
     // Just preserve the chain.
     return Chain;
 
   SDLoc dl(Op);
   // Operand 2 has bit 0 set for a write prefetch; IsRead is its inverse.
   unsigned IsRead =
       ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
 
   // ARMv7 with MP extension has PLDW.
   const bool HasPLDW = Subtarget->hasV7Ops() && Subtarget->hasMPExtension();
   if (!IsRead && !HasPLDW)
     return Chain;
 
   unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
   if (Subtarget->isThumb()) {
     // Invert the bits.
     IsRead = ~IsRead & 1;
     IsData = ~IsData & 1;
   }
 
   return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Chain,
                      Op.getOperand(1), DAG.getConstant(IsRead, dl, MVT::i32),
                      DAG.getConstant(IsData, dl, MVT::i32));
 }
 
 /// Lower ISD::VASTART: store the address of the function's VarArgsFrameIndex
 /// slot to the memory location given by operand 1.
 static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
   SDLoc dl(Op);
   const int VarArgsFI = DAG.getMachineFunction()
                             .getInfo<ARMFunctionInfo>()
                             ->getVarArgsFrameIndex();
 
   // vastart just stores the address of the VarArgsFrameIndex slot into the
   // memory location argument.
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
   SDValue SlotAddr = DAG.getFrameIndex(VarArgsFI, PtrVT);
 
   // Operand 2 carries the IR value used for the store's pointer info.
   const Value *DestValue = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   return DAG.getStore(Op.getOperand(0), dl, SlotAddr, Op.getOperand(1),
                       MachinePointerInfo(DestValue));
 }
 
 /// Reassemble an f64 formal argument from two i32 halves. The first half
 /// always arrives in the register described by \p VA; the second comes from
 /// \p NextVA, which is either a register or a 4-byte fixed stack slot. The
 /// halves are swapped on big-endian subtargets before being combined with
 /// ARMISD::VMOVDRR.
 SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
                                                 CCValAssign &NextVA,
                                                 SDValue &Root,
                                                 SelectionDAG &DAG,
                                                 const SDLoc &dl) const {
   MachineFunction &MF = DAG.getMachineFunction();
   const TargetRegisterClass *RC =
       MF.getInfo<ARMFunctionInfo>()->isThumb1OnlyFunction()
           ? &ARM::tGPRRegClass
           : &ARM::GPRRegClass;
 
   // Transform the arguments stored in physical registers into virtual ones.
   unsigned FirstVReg = MF.addLiveIn(VA.getLocReg(), RC);
   SDValue FirstHalf = DAG.getCopyFromReg(Root, dl, FirstVReg, MVT::i32);
 
   SDValue SecondHalf;
   if (!NextVA.isMemLoc()) {
     unsigned SecondVReg = MF.addLiveIn(NextVA.getLocReg(), RC);
     SecondHalf = DAG.getCopyFromReg(Root, dl, SecondVReg, MVT::i32);
   } else {
     MachineFrameInfo &MFI = MF.getFrameInfo();
     int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
 
     // Create load node to retrieve arguments from the stack.
     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
     SecondHalf = DAG.getLoad(MVT::i32, dl, Root, FIN,
                              MachinePointerInfo::getFixedStack(MF, FI));
   }
 
   if (!Subtarget->isLittle())
     std::swap(FirstHalf, SecondHalf);
   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, FirstHalf, SecondHalf);
 }
 
 // The remaining GPRs hold either the beginning of variable-argument
 // data, or the beginning of an aggregate passed by value (usually
 // byval).  Either way, we allocate stack slots adjacent to the data
 // provided by our caller, and store the unallocated registers there.
 // If this is a variadic function, the va_list pointer will begin with
 // these values; otherwise, this reassembles a (byval) structure that
 // was split between registers and memory.
 //
 // CCInfo               - calling-convention state queried for the register
 //                        range to store.
 // Chain                - in/out: replaced by a TokenFactor of the emitted
 //                        stores when at least one register is stored.
 // OrigArg              - IR value used for the stores' pointer info.
 // InRegsParamRecordIdx - index of the in-regs parameter record to use; if it
 //                        is not a valid record index, all argument GPRs from
 //                        the first unallocated one up to R4 are stored.
 // ArgOffset            - offset of the fixed stack object; overridden when
 //                        there are registers to store.
 // ArgSize              - size of the fixed stack object to create.
 //
 // Return: The frame index the registers were stored into.
 int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
                                       const SDLoc &dl, SDValue &Chain,
                                       const Value *OrigArg,
                                       unsigned InRegsParamRecordIdx,
                                       int ArgOffset, unsigned ArgSize) const {
   // Currently, two use-cases are possible:
   // Case #1. Non-var-args function, and we meet the first byval parameter.
   //          Set up the first unallocated register as the first byval
   //          register; eat all remaining registers
   //          (these two actions are performed by the HandleByVal method).
   //          Then, here, we initialize the stack frame with
   //          "store-reg" instructions.
   // Case #2. Var-args function that doesn't contain byval parameters.
   //          The same: eat all remaining unallocated registers,
   //          initialize the stack frame.
 
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   // Half-open register range [RBegin, REnd) that will be spilled.
   unsigned RBegin, REnd;
   if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
     CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
   } else {
     // No record for this index: use every argument GPR that is still
     // unallocated. An index of 4 yields RBegin == REnd == R4, i.e. an empty
     // range (presumably all of GPRArgRegs are already allocated).
     unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
     RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
     REnd = ARM::R4;
   }
 
   // With registers to store, place the object at a negative offset of
   // 4 bytes per register in [RBegin, R4), ignoring the caller's ArgOffset.
   if (REnd != RBegin)
     ArgOffset = -4 * (ARM::R4 - RBegin);
 
   auto PtrVT = getPointerTy(DAG.getDataLayout());
   int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
   SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
 
   SmallVector<SDValue, 4> MemOps;
   const TargetRegisterClass *RC =
       AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
 
   // Store each register to consecutive 4-byte slots starting at FIN. Every
   // store is chained to its own CopyFromReg; the stores are joined below.
   for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
     unsigned VReg = MF.addLiveIn(Reg, RC);
     SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
     SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                  MachinePointerInfo(OrigArg, 4 * i));
     MemOps.push_back(Store);
     FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
   }
 
   // Leave Chain untouched when nothing was stored.
   if (!MemOps.empty())
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
   return FrameIndex;
 }
 
 // Set up the stack frame that the va_list pointer will start from, and
 // record its frame index in the function info. The ArgOffset,
 // TotalArgRegsSaveSize and ForceMutable parameters are not read by this
 // implementation.
 void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                                              const SDLoc &dl, SDValue &Chain,
                                              unsigned ArgOffset,
                                              unsigned TotalArgRegsSaveSize,
                                              bool ForceMutable) const {
   ARMFunctionInfo *FuncInfo =
       DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
 
   // Try to store any remaining integer argument regs
   // to their spots on the stack so that they may be loaded by dereferencing
   // the result of va_next.
   // If there are no regs to be stored, just point to the address after the
   // last argument passed via the stack.
   const int VarArgsFI = StoreByValRegs(
       CCInfo, DAG, dl, Chain, /*OrigArg=*/nullptr,
       /*InRegsParamRecordIdx=*/CCInfo.getInRegsParamsCount(),
       /*ArgOffset=*/CCInfo.getNextStackOffset(), /*ArgSize=*/4);
   FuncInfo->setVarArgsFrameIndex(VarArgsFI);
 }
 
 /// Lower the incoming (formal) arguments of the current function.
 ///
 /// Locations for the entries of \p Ins are computed with
 /// CCInfo.AnalyzeFormalArguments, using the assign function returned by
 /// CCAssignFnForCall(CallConv, isVarArg). The resulting values are appended
 /// to \p InVals:
 ///  - register locations are copied out of a newly added live-in register
 ///    (custom f64 / v2f64 locations go through GetF64FormalArgument);
 ///  - byval stack locations become the frame index returned by
 ///    StoreByValRegs;
 ///  - all other stack locations become a load from a fixed frame object.
 ///
 /// Side effects visible here: the register save-area size and the argument
 /// stack size are recorded in ARMFunctionInfo, and VarArgStyleRegisters is
 /// called for variadic functions for which MFI.hasVAStart() is true.
 ///
 /// \returns the local \p Chain value.
 /// NOTE(review): the helpers called below receive Chain; whether they can
 /// update it depends on their signatures, which are not visible here.
 SDValue ARMTargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo &MFI = MF.getFrameInfo();
 
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
                  *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
 
   SmallVector<SDValue, 16> ArgValues;
   SDValue ArgValue;
   // CurOrigArg / CurArgIdx track the IR-level argument that corresponds to
   // the location currently being processed (advanced in the main loop).
   Function::const_arg_iterator CurOrigArg = MF.getFunction().arg_begin();
   unsigned CurArgIdx = 0;
 
   // Initially ArgRegsSaveSize is zero.
   // Then we increase this value each time we meet byval parameter.
   // We also increase this value in case of varargs function.
   AFI->setArgRegsSaveSize(0);
 
   // Calculate the amount of stack space that we need to allocate to store
   // byval and variadic arguments that are passed in registers.
   // We need to know this before we allocate the first byval or variadic
   // argument, as they will be allocated a stack slot below the CFA (Canonical
   // Frame Address, the stack pointer at entry to the function).
   //
   // ArgRegBegin ends up as the numerically smallest register that has to be
   // saved; ARM::R4 means "nothing to save" (see TotalArgRegsSaveSize below).
   unsigned ArgRegBegin = ARM::R4;
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     // Stop as soon as every in-register byval parameter has been visited.
     if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
       break;
 
     CCValAssign &VA = ArgLocs[i];
     unsigned Index = VA.getValNo();
     ISD::ArgFlagsTy Flags = Ins[Index].Flags;
     if (!Flags.isByVal())
       continue;
 
     assert(VA.isMemLoc() && "unexpected byval pointer in reg");
     unsigned RBegin, REnd;
     CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
     ArgRegBegin = std::min(ArgRegBegin, RBegin);
 
     CCInfo.nextInRegsParam();
   }
   // The main loop below walks the in-register byval parameters again via
   // getInRegsParamsProcessed()/nextInRegsParam(); presumably this call resets
   // that cursor -- confirm against CCState.
   CCInfo.rewindByValRegsInfo();
 
   // ValNo of the most recently handled stack argument; used below so that an
   // Ins[] entry split over several ArgLocs[] entries is processed only once.
   int lastInsIndex = -1;
   if (isVarArg && MFI.hasVAStart()) {
     // For va_start users, also account for the first unallocated GPR argument
     // register (if there is one).
     unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
     if (RegIdx != array_lengthof(GPRArgRegs))
       ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
   }
 
   // 4 bytes per register in [ArgRegBegin, R4).
   // NOTE(review): relies on the argument registers being numbered
   // consecutively up to ARM::R4 in the register enum -- confirm.
   unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
   AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
     // Keep CurOrigArg in step with the original argument this location
     // belongs to (only for locations that come from an original argument).
     if (Ins[VA.getValNo()].isOrigArg()) {
       std::advance(CurOrigArg,
                    Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
       CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
     }
     // Arguments stored in registers.
     if (VA.isRegLoc()) {
       EVT RegVT = VA.getLocVT();
 
       if (VA.needsCustom()) {
         // f64 and vector types are split up into multiple registers or
         // combinations of registers and stack slots.
         if (VA.getLocVT() == MVT::v2f64) {
           // First f64 half: this location plus the next one (consumed by ++i).
           SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
                                                    Chain, DAG, dl);
           // NOTE: VA is a reference into ArgLocs, so this copy-assigns the
           // next location over the entry VA refers to rather than rebinding
           // VA. From here on VA reads as the second half's first location.
           VA = ArgLocs[++i]; // skip ahead to next loc
           SDValue ArgValue2;
           if (VA.isMemLoc()) {
             // Second half lives on the stack: load 8 bytes from a fixed
             // object at the location's offset.
             int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
             ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
                                     MachinePointerInfo::getFixedStack(
                                         DAG.getMachineFunction(), FI));
           } else {
             ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
                                              Chain, DAG, dl);
           }
           // Assemble the v2f64 value from the two f64 halves.
           ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                  ArgValue, ArgValue1,
                                  DAG.getIntPtrConstant(0, dl));
           ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
                                  ArgValue, ArgValue2,
                                  DAG.getIntPtrConstant(1, dl));
         } else
           ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
       } else {
         const TargetRegisterClass *RC;
 
 
         // Pick the register class that matches the location type.
         if (RegVT == MVT::f16)
           RC = &ARM::HPRRegClass;
         else if (RegVT == MVT::f32)
           RC = &ARM::SPRRegClass;
         else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
           RC = &ARM::DPRRegClass;
         else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
           RC = &ARM::QPRRegClass;
         else if (RegVT == MVT::i32)
           RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
                                            : &ARM::GPRRegClass;
         else
           llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
 
         // Transform the arguments in physical registers into virtual ones.
         unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
         ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
       }
 
       // If this is an 8 or 16-bit value, it is really passed promoted
       // to 32 bits.  Insert an assert[sz]ext to capture this, then
       // truncate to the right size.
       switch (VA.getLocInfo()) {
       default: llvm_unreachable("Unknown loc info!");
       case CCValAssign::Full: break;
       case CCValAssign::BCvt:
         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
         break;
       case CCValAssign::SExt:
         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                                DAG.getValueType(VA.getValVT()));
         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
         break;
       case CCValAssign::ZExt:
         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                                DAG.getValueType(VA.getValVT()));
         ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
         break;
       }
 
       InVals.push_back(ArgValue);
     } else { // VA.isRegLoc()
       // sanity check
       assert(VA.isMemLoc());
       assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
 
       int index = VA.getValNo();
 
       // Some Ins[] entries become multiple ArgLoc[] entries.
       // Process them only once.
       if (index != lastInsIndex)
         {
           ISD::ArgFlagsTy Flags = Ins[index].Flags;
           // FIXME: For now, all byval parameter objects are marked mutable.
           // This can be changed with more analysis.
           // In case of tail call optimization mark all arguments mutable.
           // Since they could be overwritten by lowering of arguments in case of
           // a tail call.
           if (Flags.isByVal()) {
             assert(Ins[index].isOrigArg() &&
                    "Byval arguments cannot be implicit");
             unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
 
             // The byval argument is represented by the frame index that
             // StoreByValRegs returns.
             int FrameIndex = StoreByValRegs(
                 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
                 VA.getLocMemOffset(), Flags.getByValSize());
             InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
             CCInfo.nextInRegsParam();
           } else {
             // Ordinary stack argument: fixed object sized from the location
             // type (in bytes) at the location's offset.
             unsigned FIOffset = VA.getLocMemOffset();
             int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
                                            FIOffset, true);
 
             // Create load nodes to retrieve arguments from the stack.
             SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
             InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
                                          MachinePointerInfo::getFixedStack(
                                              DAG.getMachineFunction(), FI)));
           }
           lastInsIndex = index;
         }
     }
   }
 
   // varargs
   if (isVarArg && MFI.hasVAStart())
     VarArgStyleRegisters(CCInfo, DAG, dl, Chain,
                          CCInfo.getNextStackOffset(),
                          TotalArgRegsSaveSize);
 
   AFI->setArgumentStackSize(CCInfo.getNextStackOffset());
 
   return Chain;
 }
 
 /// isFloatingPointZero - Return true if this is +0.0.
 ///
 /// Three forms of \p Op are recognized:
 ///  - a ConstantFPSDNode whose value is positive zero;
 ///  - an (extending or non-extending) load whose operand 1 is an
 ///    ARMISD::Wrapper around a constant-pool entry that holds a positive-zero
 ///    ConstantFP;
 ///  - an f64 BITCAST of an ARMISD::VMOVIMM whose operand is the null constant.
 /// Anything else yields false.
 static bool isFloatingPointZero(SDValue Op) {
   if (auto *FPImm = dyn_cast<ConstantFPSDNode>(Op))
     return FPImm->getValueAPF().isPosZero();
 
   SDNode *N = Op.getNode();
   if (ISD::isEXTLoad(N) || ISD::isNON_EXTLoad(N)) {
     // Maybe this has already been legalized into the constant pool?
     SDValue Addr = Op.getOperand(1);
     if (Addr.getOpcode() != ARMISD::Wrapper)
       return false;
 
     SDValue Wrapped = Addr.getOperand(0);
     auto *PoolEntry = dyn_cast<ConstantPoolSDNode>(Wrapped);
     if (!PoolEntry)
       return false;
 
     const auto *PoolFP = dyn_cast<ConstantFP>(PoolEntry->getConstVal());
     return PoolFP && PoolFP->getValueAPF().isPosZero();
   }
 
   // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
   // created by LowerConstantFP().
   if (Op->getOpcode() != ISD::BITCAST || !(Op->getValueType(0) == MVT::f64))
     return false;
 
   SDValue Source = Op->getOperand(0);
   return Source->getOpcode() == ARMISD::VMOVIMM &&
          isNullConstant(Source->getOperand(0));
 }
 
 /// Returns appropriate ARM CMP (cmp) and corresponding condition code for
 /// the given operands.
 ///
 /// When \p RHS is a constant that is not a legal compare immediate, the
 /// constant is nudged by one (with the condition code adjusted to match) if
 /// that makes it legal. Otherwise, when only \p LHS is a shift node, the
 /// operands are swapped. The resulting ARM condition code is stored in
 /// \p ARMcc as an i32 constant; the returned node is ARMISD::CMPZ for EQ/NE
 /// and ARMISD::CMP for everything else.
 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                                      SDValue &ARMcc, SelectionDAG &DAG,
                                      const SDLoc &dl) const {
   if (auto *ConstRHS = dyn_cast<ConstantSDNode>(RHS.getNode())) {
     const unsigned Imm = ConstRHS->getZExtValue();
     if (!isLegalICmpImmediate((int32_t)Imm)) {
       // Replaces the comparison with an equivalent one against NewImm.
       auto Retarget = [&](unsigned NewImm, ISD::CondCode NewCC) {
         CC = NewCC;
         RHS = DAG.getConstant(NewImm, dl, MVT::i32);
       };
 
       // Constant does not fit, try adjusting it by one. Each case excludes
       // the value for which the +/-1 adjustment would wrap around.
       switch (CC) {
       case ISD::SETLT:
       case ISD::SETGE:
         if (Imm != 0x80000000 && isLegalICmpImmediate(Imm - 1))
           Retarget(Imm - 1, CC == ISD::SETLT ? ISD::SETLE : ISD::SETGT);
         break;
       case ISD::SETULT:
       case ISD::SETUGE:
         if (Imm != 0 && isLegalICmpImmediate(Imm - 1))
           Retarget(Imm - 1, CC == ISD::SETULT ? ISD::SETULE : ISD::SETUGT);
         break;
       case ISD::SETLE:
       case ISD::SETGT:
         if (Imm != 0x7fffffff && isLegalICmpImmediate(Imm + 1))
           Retarget(Imm + 1, CC == ISD::SETLE ? ISD::SETLT : ISD::SETGE);
         break;
       case ISD::SETULE:
       case ISD::SETUGT:
         if (Imm != 0xffffffff && isLegalICmpImmediate(Imm + 1))
           Retarget(Imm + 1, CC == ISD::SETULE ? ISD::SETULT : ISD::SETUGE);
         break;
       default:
         break;
       }
     }
   } else {
     const bool LHSIsShift =
         ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift;
     const bool RHSIsShift =
         ARM_AM::getShiftOpcForNode(RHS.getOpcode()) != ARM_AM::no_shift;
     if (LHSIsShift && !RHSIsShift) {
       // In ARM and Thumb-2, the compare instructions can shift their second
       // operand.
       CC = ISD::getSetCCSwappedOperands(CC);
       std::swap(LHS, RHS);
     }
   }
 
   const ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
   // EQ and NE use only the Z flag, so they get the CMPZ node.
   const bool OnlyNeedsZ = CondCode == ARMCC::EQ || CondCode == ARMCC::NE;
   const ARMISD::NodeType CompareType = OnlyNeedsZ ? ARMISD::CMPZ : ARMISD::CMP;
 
   ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
   return DAG.getNode(CompareType, dl, MVT::Glue, LHS, RHS);
 }
 
 /// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
 ///
 /// A comparison against a value recognized by isFloatingPointZero uses
 /// ARMISD::CMPFPw0 (which takes no right-hand operand); every other
 /// comparison uses ARMISD::CMPFP. \p InvalidOnQNaN is forwarded to the
 /// compare node as an i32 constant operand, and the compare is wrapped in an
 /// ARMISD::FMSTAT node, which is returned.
 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
                                      SelectionDAG &DAG, const SDLoc &dl,
                                      bool InvalidOnQNaN) const {
   assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
   SDValue QNaNFlag = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
   SDValue FPCompare =
       isFloatingPointZero(RHS)
           ? DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, QNaNFlag)
           : DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, QNaNFlag);
   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, FPCompare);
 }
 
 /// duplicateCmp - Glue values can have only one use, so this function
 /// duplicates a comparison node.
 ///
 /// \p Cmp must be an ARMISD::CMP or ARMISD::CMPZ node, or an ARMISD::FMSTAT
 /// whose operand is an ARMISD::CMPFP or ARMISD::CMPFPw0 node. A new node of
 /// the same shape, built from the same operands, is returned.
 SDValue
 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
   const SDLoc DL(Cmp);
   unsigned Opc = Cmp.getOpcode();
 
   // Integer compares are cloned directly from their two operands.
   if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ) {
     SDValue CmpLHS = Cmp.getOperand(0);
     SDValue CmpRHS = Cmp.getOperand(1);
     return DAG.getNode(Opc, DL, MVT::Glue, CmpLHS, CmpRHS);
   }
 
   // Otherwise this must be FMSTAT(fp-compare): clone the inner compare
   // first, then wrap it in a fresh FMSTAT.
   assert(Opc == ARMISD::FMSTAT && "unexpected comparison operation");
   SDValue Inner = Cmp.getOperand(0);
   Opc = Inner.getOpcode();
 
   SDValue InnerCopy;
   if (Opc == ARMISD::CMPFP) {
     InnerCopy = DAG.getNode(Opc, DL, MVT::Glue, Inner.getOperand(0),
                             Inner.getOperand(1), Inner.getOperand(2));
   } else {
     assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
     InnerCopy = DAG.getNode(Opc, DL, MVT::Glue, Inner.getOperand(0),
                             Inner.getOperand(1));
   }
   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, InnerCopy);
 }
 
 // This function returns three things: the arithmetic computation itself
 // (Value), a comparison (OverflowCmp), and a condition code (ARMcc).  The
 // comparison and the condition code define the case in which the arithmetic
 // computation *does not* overflow.
 //
 // Value and OverflowCmp are returned as a pair (in that order); ARMcc is
 // written through the reference parameter. Only i32 operations are accepted.
 std::pair<SDValue, SDValue>
 ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
                                  SDValue &ARMcc) const {
   assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
 
   const SDLoc dl(Op);
   const EVT VT = Op.getValueType();
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   SDValue Value, OverflowCmp;
 
   // Stores the "did not overflow" condition code into ARMcc.
   auto SetNoOverflowCC = [&](ARMCC::CondCodes Code) {
     ARMcc = DAG.getConstant(Code, dl, MVT::i32);
   };
   // Builds the glue-producing ARMISD::CMP that accompanies the condition.
   auto EmitCmp = [&](SDValue A, SDValue B) {
     return DAG.getNode(ARMISD::CMP, dl, MVT::Glue, A, B);
   };
 
   // FIXME: We are currently always generating CMPs because we don't support
   // generating CMN through the backend. This is not as good as the natural
   // CMP case because it causes a register dependency and cannot be folded
   // later.
 
   const unsigned Opcode = Op.getOpcode();
   switch (Opcode) {
   case ISD::SADDO:
     SetNoOverflowCC(ARMCC::VC);
     Value = DAG.getNode(ISD::ADD, dl, VT, LHS, RHS);
     OverflowCmp = EmitCmp(Value, LHS);
     break;
   case ISD::UADDO: {
     SetNoOverflowCC(ARMCC::HS);
     // We use ADDC here to correspond to its use in LowerUnsignedALUO.
     // We do not use it in the USUBO case as Value may not be used.
     SDValue AddWithCarry =
         DAG.getNode(ARMISD::ADDC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
     Value = AddWithCarry.getValue(0);
     OverflowCmp = EmitCmp(Value, LHS);
     break;
   }
   case ISD::SSUBO:
   case ISD::USUBO:
     // Signed and unsigned subtraction differ only in the condition code.
     SetNoOverflowCC(Opcode == ISD::SSUBO ? ARMCC::VC : ARMCC::HS);
     Value = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
     OverflowCmp = EmitCmp(LHS, RHS);
     break;
   case ISD::UMULO: {
     // We generate a UMUL_LOHI and then check if the high word is 0.
     SetNoOverflowCC(ARMCC::EQ);
     SDValue Product =
         DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), LHS, RHS);
     SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
     OverflowCmp = EmitCmp(Product.getValue(1), Zero);
     Value = Product.getValue(0); // We only want the low 32 bits for the result.
     break;
   }
   case ISD::SMULO: {
     // We generate a SMUL_LOHI and then check if all the bits of the high word
     // are the same as the sign bit of the low word.
     SetNoOverflowCC(ARMCC::EQ);
     SDValue Product =
         DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), LHS, RHS);
     SDValue SignOfLow = DAG.getNode(ISD::SRA, dl, VT, Product.getValue(0),
                                     DAG.getConstant(31, dl, MVT::i32));
     OverflowCmp = EmitCmp(Product.getValue(1), SignOfLow);
     Value = Product.getValue(0); // We only want the low 32 bits for the result.
     break;
   }
   default:
     llvm_unreachable("Unknown overflow instruction!");
   } // switch (...)
 
   return std::make_pair(Value, OverflowCmp);
 }
 
 /// Lower an overflow-reporting ALU node by way of getARMXALUOOp.
 ///
 /// Returns a MERGE_VALUES of the arithmetic result and an overflow value
 /// chosen by an ARMISD::CMOV between the constants 1 and 0. Returns an empty
 /// SDValue when the result type is not legal yet, so that legalization can
 /// expand the node instead.
 SDValue
 ARMTargetLowering::LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const {
   const EVT VT = Op.getValueType();
 
   // Let legalize expand this if it isn't a legal type yet.
   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return SDValue();
 
   SDValue ARMcc;
   const std::pair<SDValue, SDValue> Lowered = getARMXALUOOp(Op, DAG, ARMcc);
   const SDValue &Value = Lowered.first;
   const SDValue &OverflowCmp = Lowered.second;
 
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   SDLoc dl(Op);
 
   // We use 0 and 1 as false and true values. ARMcc describes the
   // no-overflow case, hence the (1, 0) operand order.
   SDValue One = DAG.getConstant(1, dl, MVT::i32);
   SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
   SDValue Overflow =
       DAG.getNode(ARMISD::CMOV, dl, VT, One, Zero, ARMcc, CCR, OverflowCmp);
 
   return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, MVT::i32), Value,
                      Overflow);
 }
 
 /// Turn a boolean carry value into a carry flag.
 ///
 /// This is done by emitting ARMISD::SUBC BoolCarry, 1 and returning the
 /// node's second result (result number 1).
 static SDValue ConvertBooleanCarryToCarryFlag(SDValue BoolCarry,
                                               SelectionDAG &DAG) {
   const SDLoc DL(BoolCarry);
   const EVT CarryVT = BoolCarry.getValueType();
 
   SDVTList ResultTys = DAG.getVTList(CarryVT, MVT::i32);
   SDValue One = DAG.getConstant(1, DL, CarryVT);
   SDValue Sub = DAG.getNode(ARMISD::SUBC, DL, ResultTys, BoolCarry, One);
   return Sub.getValue(1);
 }
 
 /// Turn a carry flag into a boolean carry value of type \p VT.
 ///
 /// This is done by emitting ARMISD::ADDE 0, 0, Flags; the node's first
 /// result is returned.
 static SDValue ConvertCarryFlagToBooleanCarry(SDValue Flags, EVT VT,
                                               SelectionDAG &DAG) {
   const SDLoc DL(Flags);
   SDVTList ResultTys = DAG.getVTList(VT, MVT::i32);
   SDValue ZeroLHS = DAG.getConstant(0, DL, MVT::i32);
   SDValue ZeroRHS = DAG.getConstant(0, DL, MVT::i32);
   return DAG.getNode(ARMISD::ADDE, DL, ResultTys, ZeroLHS, ZeroRHS, Flags);
 }
 
 /// Lower UADDO / USUBO to ARMISD::ADDC / ARMISD::SUBC.
 ///
 /// Returns a MERGE_VALUES of the arithmetic result and a boolean overflow
 /// value derived from the carry result of the ADDC/SUBC node. Returns an
 /// empty SDValue when the result type is not legal yet, so that legalization
 /// can expand the node instead.
 SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
                                              SelectionDAG &DAG) const {
   const EVT VT = Op.getValueType();
 
   // Let legalize expand this if it isn't a legal type yet.
   if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return SDValue();
 
   const unsigned Opcode = Op.getOpcode();
   if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
     llvm_unreachable("Unknown overflow instruction!");
   const bool IsAdd = Opcode == ISD::UADDO;
 
   const SDLoc dl(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   SDVTList VTs = DAG.getVTList(VT, MVT::i32);
 
   SDValue Value =
       DAG.getNode(IsAdd ? ARMISD::ADDC : ARMISD::SUBC, dl, VTs, LHS, RHS);
 
   // Convert the carry flag into a boolean value.
   SDValue Overflow = ConvertCarryFlagToBooleanCarry(Value.getValue(1), VT, DAG);
 
   if (!IsAdd) {
     // ARMISD::SUBC returns 0 when we have to borrow, so make it an overflow
     // value. So compute 1 - C.
     SDValue One = DAG.getConstant(1, dl, MVT::i32);
     Overflow = DAG.getNode(ISD::SUB, dl, MVT::i32, One, Overflow);
   }
 
   return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
 }
 
 /// Lower a SELECT node.
 ///
 /// Three strategies are tried in order:
 ///  1. the condition is the overflow result (result number 1) of an
 ///     add/sub-with-overflow node: select directly on the flags from
 ///     getARMXALUOOp;
 ///  2. the condition is a single-use ARMISD::CMOV of the constants 1/0 (or
 ///     0/1): fold the select into a CMOV on a duplicate of that comparison;
 ///  3. otherwise mask the condition to its low bit and emit a SELECT_CC
 ///     against zero.
 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   SDValue Cond = Op.getOperand(0);
   SDValue SelectTrue = Op.getOperand(1);
   SDValue SelectFalse = Op.getOperand(2);
   const unsigned CondOpc = Cond.getOpcode();
 
   const bool IsAddSubOverflow = CondOpc == ISD::SADDO ||
                                 CondOpc == ISD::UADDO ||
                                 CondOpc == ISD::SSUBO || CondOpc == ISD::USUBO;
   if (Cond.getResNo() == 1 && IsAddSubOverflow) {
     if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
       return SDValue();
 
     SDValue ARMcc;
     const std::pair<SDValue, SDValue> Lowered =
         getARMXALUOOp(Cond, DAG, ARMcc);
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
 
     return getCMOV(dl, Op.getValueType(), SelectTrue, SelectFalse, ARMcc, CCR,
                    Lowered.second, DAG);
   }
 
   // Convert:
   //
   //   (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
   //   (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
   //
   if (CondOpc == ARMISD::CMOV && Cond.hasOneUse()) {
     const auto *CMOVTrue = dyn_cast<ConstantSDNode>(Cond.getOperand(0));
     const auto *CMOVFalse = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
 
     if (CMOVTrue && CMOVFalse) {
       const unsigned TrueBits = CMOVTrue->getZExtValue();
       const unsigned FalseBits = CMOVFalse->getZExtValue();
       const bool SameSense = TrueBits == 1 && FalseBits == 0;
       const bool OppositeSense = TrueBits == 0 && FalseBits == 1;
 
       // Both stay null when the constants are neither 1/0 nor 0/1.
       SDValue True;
       SDValue False;
       if (SameSense || OppositeSense) {
         True = SameSense ? SelectTrue : SelectFalse;
         False = SameSense ? SelectFalse : SelectTrue;
       }
 
       if (True.getNode() && False.getNode()) {
         EVT VT = Op.getValueType();
         SDValue ARMcc = Cond.getOperand(2);
         SDValue CCR = Cond.getOperand(3);
         // The comparison's glue result can have only one use, so clone it.
         SDValue Cmp = duplicateCmp(Cond.getOperand(4), DAG);
         assert(True.getValueType() == VT);
         return getCMOV(dl, VT, True, False, ARMcc, CCR, Cmp, DAG);
       }
     }
   }
 
   // ARM's BooleanContents value is UndefinedBooleanContent. Mask out the
   // undefined bits before doing a full-word comparison with zero.
   const EVT CondVT = Cond.getValueType();
   SDValue LowBit = DAG.getNode(ISD::AND, dl, CondVT, Cond,
                                DAG.getConstant(1, dl, CondVT));
 
   return DAG.getSelectCC(dl, LowBit, DAG.getConstant(0, dl, CondVT),
                          SelectTrue, SelectFalse, ISD::SETNE);
 }
 
 /// Map a floating-point ISD condition code onto the condition codes this
 /// lowering uses for VSEL (GE, GT, VS, EQ).
 ///
 /// \p CondCode receives the chosen ARM condition code. \p swpCmpOps and
 /// \p swpVselOps are updated to say whether the compare operands and/or the
 /// VSEL operands have to be swapped. All three are in/out parameters and are
 /// left untouched for condition codes that are not handled here.
 static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
                                  bool &swpCmpOps, bool &swpVselOps) {
   // Opcodes that return true for 'equality'.
   const bool TrueOnEqual = CC == ISD::SETUGE || CC == ISD::SETOGE ||
                            CC == ISD::SETOLE || CC == ISD::SETULE;
   // Opcodes that return false for 'equality'.
   const bool FalseOnEqual = CC == ISD::SETUGT || CC == ISD::SETOGT ||
                             CC == ISD::SETOLT || CC == ISD::SETULT;
   // Opcodes that contain 'less'.
   const bool IsLess = CC == ISD::SETOLE || CC == ISD::SETULE ||
                       CC == ISD::SETOLT || CC == ISD::SETULT;
   // Opcodes that return true for 'unordered'.
   const bool IsUnordered = CC == ISD::SETULE || CC == ISD::SETULT ||
                            CC == ISD::SETUGE || CC == ISD::SETUGT;
 
   // Start by selecting GE or GT according to the 'equality' behaviour.
   if (TrueOnEqual)
     CondCode = ARMCC::GE;
   else if (FalseOnEqual)
     CondCode = ARMCC::GT;
 
   // Since we are constrained to GE/GT, if the opcode contains 'less', we need
   // to swap the compare operands.
   if (IsLess)
     swpCmpOps = true;
 
   // Both GT and GE are ordered comparisons, and return false for 'unordered'.
   // If we have an unordered opcode, we need to swap the operands to the VSEL
   // instruction (effectively negating the condition).
   //
   // This also has the effect of swapping which one of 'less' or 'greater'
   // returns true, so we also swap the compare operands. It also switches
   // whether we return true for 'equality', so we compensate by picking the
   // opposite condition code to our original choice.
   if (IsUnordered) {
     swpCmpOps = !swpCmpOps;
     swpVselOps = !swpVselOps;
     if (CondCode == ARMCC::GT)
       CondCode = ARMCC::GE;
     else
       CondCode = ARMCC::GT;
   }
 
   // 'ordered' is 'anything but unordered', so use the VS condition code and
   // swap the VSEL operands.
   if (CC == ISD::SETO) {
     CondCode = ARMCC::VS;
     swpVselOps = true;
   }
 
   // 'unordered or not equal' is 'anything but equal', so use the EQ condition
   // code and swap the VSEL operands.
   if (CC == ISD::SETUNE) {
     CondCode = ARMCC::EQ;
     swpVselOps = true;
   }
 }
 
 /// Build an ARMISD::CMOV selecting between \p FalseVal and \p TrueVal.
 ///
 /// For f64 values on subtargets where isFPOnlySP() is true, both inputs are
 /// split into two i32 halves with ARMISD::VMOVRRD, each half gets its own
 /// CMOV (the second one on a duplicate of \p Cmp), and the halves are
 /// rejoined with ARMISD::VMOVDRR. In every other case a single CMOV of type
 /// \p VT is emitted.
 SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
                                    SDValue TrueVal, SDValue ARMcc, SDValue CCR,
                                    SDValue Cmp, SelectionDAG &DAG) const {
   // Common case: one conditional move on the value as it is.
   if (!Subtarget->isFPOnlySP() || !(VT == MVT::f64))
     return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
                        Cmp);
 
   SDValue FalseHalves = DAG.getNode(
       ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
   SDValue TrueHalves = DAG.getNode(
       ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
 
   SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32,
                             FalseHalves.getValue(0), TrueHalves.getValue(0),
                             ARMcc, CCR, Cmp);
   // The second CMOV needs its own copy of the comparison.
   SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32,
                              FalseHalves.getValue(1), TrueHalves.getValue(1),
                              ARMcc, CCR, duplicateCmp(Cmp, DAG));
 
   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
 }
 
 /// Return true if \p CC is ISD::SETGT or ISD::SETGE.
 static bool isGTorGE(ISD::CondCode CC) {
   switch (CC) {
   case ISD::SETGT:
   case ISD::SETGE:
     return true;
   default:
     return false;
   }
 }
 
 /// Return true if \p CC is ISD::SETLT or ISD::SETLE.
 static bool isLTorLE(ISD::CondCode CC) {
   switch (CC) {
   case ISD::SETLT:
   case ISD::SETLE:
     return true;
   default:
     return false;
   }
 }
 
 // See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating
 // with respect to the bound K, i.e. it yields K whenever the other operand
 // is below K. All of these conditions (and their <= and >= counterparts)
 // will do:
 //          x < k ? k : x
 //          x > k ? x : k
 //          k < x ? x : k
 //          k > x ? k : x
 static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
                             const SDValue TrueVal, const SDValue FalseVal,
                             const ISD::CondCode CC, const SDValue K) {
   // "k > x ? k : x" and "x > k ? x : k".
   if (isGTorGE(CC))
     return (K == LHS && K == TrueVal) || (K == RHS && K == FalseVal);
 
   // "x < k ? k : x" and "k < x ? x : k".
   if (isLTorLE(CC))
     return (K == RHS && K == TrueVal) || (K == LHS && K == FalseVal);
 
   return false;
 }
 
 // Similar to isLowerSaturate(), but checks for upper-saturating conditions,
 // i.e. conditionals that yield K whenever the other operand is above K:
 //          x > k ? k : x
 //          x < k ? x : k
 //          k > x ? x : k
 //          k < x ? k : x
 // (and their <= and >= counterparts).
 static bool isUpperSaturate(const SDValue LHS, const SDValue RHS,
                             const SDValue TrueVal, const SDValue FalseVal,
                             const ISD::CondCode CC, const SDValue K) {
   // "x > k ? k : x" and "k > x ? x : k".
   if (isGTorGE(CC))
     return (K == RHS && K == TrueVal) || (K == LHS && K == FalseVal);
 
   // "k < x ? k : x" and "x < k ? x : k".
   if (isLTorLE(CC))
     return (K == LHS && K == TrueVal) || (K == RHS && K == FalseVal);
 
   return false;
 }
 
 // Check if two chained conditionals could be converted into SSAT or USAT.
 //
 // SSAT can replace a set of two conditional selectors that bound a number to an
 // interval of type [~k, k] when k + 1 is a power of 2. Here are some examples:
 //
 //     x < ~k ? ~k : (x > k ? k : x)
 //     x < ~k ? ~k : (x < k ? x : k)
 //     x > ~k ? (x > k ? k : x) : ~k
 //     x < k ? (x < ~k ? ~k : x) : k
 //     etc.
 //
 // USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1 is
 // a power of 2.
 //
 // Op is expected to be a SELECT_CC-shaped node: operands 0/1 are the compared
 // values, operands 2/3 the true/false results and operand 4 the condition
 // code.
 //
 // It returns true if the conversion can be done, false otherwise.
 // Additionally, the variable is returned in parameter V, the constant in K and
 // usat is set to true if the conditional represents an unsigned saturation.
 // V, K and usat are only written when the function returns true, except that
 // usat may already have been written when the function returns false.
 static bool isSaturatingConditional(const SDValue &Op, SDValue &V,
                                     uint64_t &K, bool &usat) {
   SDValue LHS1 = Op.getOperand(0);
   SDValue RHS1 = Op.getOperand(1);
   SDValue TrueVal1 = Op.getOperand(2);
   SDValue FalseVal1 = Op.getOperand(3);
   ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
 
   // The inner conditional is whichever result of the outer one is not a
   // constant (the false value if the true value is constant, else the true
   // value); it must itself be a SELECT_CC.
   const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
   if (Op2.getOpcode() != ISD::SELECT_CC)
     return false;
 
   SDValue LHS2 = Op2.getOperand(0);
   SDValue RHS2 = Op2.getOperand(1);
   SDValue TrueVal2 = Op2.getOperand(2);
   SDValue FalseVal2 = Op2.getOperand(3);
   ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
 
   // Find out which are the constants and which are the variables
   // in each conditional
   // K1/K2 point at the constant comparison operand of the outer/inner
   // conditional (LHS preferred), or are null if neither operand is constant.
   SDValue *K1 = isa<ConstantSDNode>(LHS1) ? &LHS1 : isa<ConstantSDNode>(RHS1)
                                                         ? &RHS1
                                                         : nullptr;
   SDValue *K2 = isa<ConstantSDNode>(LHS2) ? &LHS2 : isa<ConstantSDNode>(RHS2)
                                                         ? &RHS2
                                                         : nullptr;
   // K2Tmp: the constant among the inner results; V1Tmp/V2Tmp: the
   // non-constant comparison operands; V2: the non-constant inner result.
   SDValue K2Tmp = isa<ConstantSDNode>(TrueVal2) ? TrueVal2 : FalseVal2;
   SDValue V1Tmp = (K1 && *K1 == LHS1) ? RHS1 : LHS1;
   SDValue V2Tmp = (K2 && *K2 == LHS2) ? RHS2 : LHS2;
   SDValue V2 = (K2Tmp == TrueVal2) ? FalseVal2 : TrueVal2;
 
   // We must detect cases where the original operations worked with 16- or
   // 8-bit values. In such case, V2Tmp != V2 because the comparison operations
   // must work with sign-extended values but the select operations return
   // the original non-extended value.
   SDValue V2TmpReg = V2Tmp;
   if (V2Tmp->getOpcode() == ISD::SIGN_EXTEND_INREG)
     V2TmpReg = V2Tmp->getOperand(0);
 
   // Check that the registers and the constants have the correct values
   // in both conditionals
   if (!K1 || !K2 || *K1 == Op2 || *K2 != K2Tmp || V1Tmp != V2Tmp ||
       V2TmpReg != V2)
     return false;
 
   // Figure out which conditional is saturating the lower/upper bound.
   // The results are the addresses of Op / Op2, so the comparisons against
   // &Op and &Op2 below are pointer-identity checks.
   const SDValue *LowerCheckOp =
       isLowerSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
           ? &Op
           : isLowerSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                 ? &Op2
                 : nullptr;
   const SDValue *UpperCheckOp =
       isUpperSaturate(LHS1, RHS1, TrueVal1, FalseVal1, CC1, *K1)
           ? &Op
           : isUpperSaturate(LHS2, RHS2, TrueVal2, FalseVal2, CC2, *K2)
                 ? &Op2
                 : nullptr;
 
   // One conditional must be the lower check and the other the upper check.
   if (!UpperCheckOp || !LowerCheckOp || LowerCheckOp == UpperCheckOp)
     return false;
 
   // Check that the constant in the lower-bound check is
   // the opposite of the constant in the upper-bound check
   // in 1's complement.
   int64_t Val1 = cast<ConstantSDNode>(*K1)->getSExtValue();
   int64_t Val2 = cast<ConstantSDNode>(*K2)->getSExtValue();
   int64_t PosVal = std::max(Val1, Val2);
   int64_t NegVal = std::min(Val1, Val2);
 
   // The larger constant must belong to the upper-bound check, and it must be
   // of the form 2^n - 1.
   if (((Val1 > Val2 && UpperCheckOp == &Op) ||
        (Val1 < Val2 && UpperCheckOp == &Op2)) &&
       isPowerOf2_64(PosVal + 1)) {
 
     // Handle the difference between USAT (unsigned) and SSAT (signed) saturation
     if (Val1 == ~Val2)
       usat = false;
     else if (NegVal == 0)
       usat = true;
     else
       return false;
 
     V = V2;
     K = (uint64_t)PosVal; // At this point, PosVal is guaranteed to be positive
 
     return true;
   }
 
   return false;
 }
 
 // Check if a condition of the type x < k ? k : x can be converted into a
 // bit operation instead of conditional moves.
 // Currently this is allowed given:
 // - The conditions and values match up
 // - k is 0 or -1 (all ones)
 // This function will not check the last condition, that's up to the caller.
 // It returns true if the transformation can be made, and in such case
 // returns x in V, and k in SatK.
 // Note that V may be overwritten even when the function returns false.
 static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V,
                                          SDValue &SatK)
 {
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   SDValue TrueVal = Op.getOperand(2);
   SDValue FalseVal = Op.getOperand(3);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
 
   // Locate the constant comparison operand, preferring the left-hand side.
   SDValue *K = nullptr;
   if (isa<ConstantSDNode>(LHS))
     K = &LHS;
   else if (isa<ConstantSDNode>(RHS))
     K = &RHS;
 
   // No constant operation in comparison, early out
   if (!K)
     return false;
 
   // KTmp is the constant among the two results; V the other result; VTmp the
   // comparison operand that is not *K.
   SDValue KTmp = FalseVal;
   if (isa<ConstantSDNode>(TrueVal))
     KTmp = TrueVal;
   V = (KTmp == TrueVal) ? FalseVal : TrueVal;
   SDValue VTmp = (*K == LHS) ? RHS : LHS;
 
   // If the constant on left and right side, or variable on left and right,
   // does not match, early out
   if (*K != KTmp || V != VTmp)
     return false;
 
   if (!isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K))
     return false;
 
   SatK = *K;
   return true;
 }
 
 /// Custom lowering for a SELECT_CC node whose operands are read as
 /// (lhs, rhs, true-value, false-value, condition-code).
 ///
 /// The lowering is attempted in this order:
 ///  1. a pair of saturating selects folded into one ARMISD::SSAT/USAT,
 ///  2. an i32 "lower saturate" against 0 or -1 turned into shift/bit ops,
 ///  3. an integer compare (getARMCmp) feeding a conditional move,
 ///  4. a VFP compare (getVFPCmp) feeding one or two conditional moves.
 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDLoc dl(Op);
 
   // Try to convert two saturating conditional selects into a single SSAT
   SDValue SatValue;
   uint64_t SatConstant;
   bool SatUSat;
   // Only attempted for non-Thumb targets with v6 ops, or for Thumb2.
   if (((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2()) &&
       isSaturatingConditional(Op, SatValue, SatConstant, SatUSat)) {
     // The second operand of the saturate node is the number of trailing one
     // bits in the saturation constant.
     if (SatUSat)
       return DAG.getNode(ARMISD::USAT, dl, VT, SatValue,
                          DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
     else
       return DAG.getNode(ARMISD::SSAT, dl, VT, SatValue,
                          DAG.getConstant(countTrailingOnes(SatConstant), dl, VT));
   }
 
   // Try to convert expressions of the form x < k ? k : x (and similar forms)
   // into more efficient bit operations, which is possible when k is 0 or -1
   // On ARM and Thumb-2 which have flexible operand 2 this will result in
   // single instructions. On Thumb the shift and the bit operation will be two
   // instructions.
   // Only allow this transformation on full-width (32-bit) operations
   SDValue LowerSatConstant;
   if (VT == MVT::i32 &&
       isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
     // ShiftV is x arithmetically shifted right by 31, i.e. all copies of the
     // sign bit of x.
     SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
                                  DAG.getConstant(31, dl, VT));
     if (isNullConstant(LowerSatConstant)) {
       // k == 0: x & ~(x >> 31).
       SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
                                       DAG.getAllOnesConstant(dl, VT));
       return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
     } else if (isAllOnesConstant(LowerSatConstant))
       // k == -1: x | (x >> 31).
       return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
     // Any other constant falls through to the generic lowering below.
   }
 
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
   SDValue TrueVal = Op.getOperand(2);
   SDValue FalseVal = Op.getOperand(3);
 
   // With single-precision-only FP, an f64 comparison is softened first;
   // LHS, RHS and CC are updated in place by softenSetCCOperands.
   if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
     DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
                                                     dl);
 
     // If softenSetCCOperands only returned one value, we should compare it to
     // zero.
     if (!RHS.getNode()) {
       RHS = DAG.getConstant(0, dl, LHS.getValueType());
       CC = ISD::SETNE;
     }
   }
 
   // Integer comparison (this also covers a softened f64 comparison whose
   // operands are now i32).
   if (LHS.getValueType() == MVT::i32) {
     // Try to generate VSEL on ARMv8.
     // The VSEL instruction can't use all the usual ARM condition
     // codes: it only has two bits to select the condition code, so it's
     // constrained to use only GE, GT, VS and EQ.
     //
     // To implement all the various ISD::SETXXX opcodes, we sometimes need to
     // swap the operands of the previous compare instruction (effectively
     // inverting the compare condition, swapping 'less' and 'greater') and
     // sometimes need to swap the operands to the VSEL (which inverts the
     // condition in the sense of firing whenever the previous condition didn't)
     if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
                                     TrueVal.getValueType() == MVT::f64)) {
       ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
       // For LT/LE/VC/NE, invert the condition and swap the selected values so
       // that the select itself is unchanged.
       if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
           CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
         CC = ISD::getSetCCInverse(CC, true);
         std::swap(TrueVal, FalseVal);
       }
     }
 
     SDValue ARMcc;
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
     return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
   }
 
   // Floating-point comparison.  FPCCToARMCC fills in up to two ARM condition
   // codes; CondCode2 is compared against ARMCC::AL below to decide whether a
   // second conditional move is needed.
   ARMCC::CondCodes CondCode, CondCode2;
   bool InvalidOnQNaN;
   FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
 
   // Normalize the fp compare. If RHS is zero we keep it there so we match
   // CMPFPw0 instead of CMPFP.
   if (Subtarget->hasFPARMv8() && !isFloatingPointZero(RHS) &&
      (TrueVal.getValueType() == MVT::f16 ||
       TrueVal.getValueType() == MVT::f32 ||
       TrueVal.getValueType() == MVT::f64)) {
     bool swpCmpOps = false;
     bool swpVselOps = false;
     checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
 
     // The swaps requested by checkVSELConstraints are applied only when the
     // resulting condition is one of GT, GE, VS or EQ.
     if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
         CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
       if (swpCmpOps)
         std::swap(LHS, RHS);
       if (swpVselOps)
         std::swap(TrueVal, FalseVal);
     }
   }
 
   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
   if (CondCode2 != ARMCC::AL) {
     // Second condition: chain another conditional move on top of the first
     // result, again selecting TrueVal when the condition holds.
     SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
     // FIXME: Needs another CMP because flag can have but one use.
     SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
     Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
   }
   return Result;
 }
 
 /// canChangeToInt - Decide whether the fp compare operand \p Op can be
 /// rewritten as part of an integer compare sequence.  \p SeenZero is set to
 /// true when the accepted operand is a floating-point zero; it is left
 /// untouched otherwise.
 static bool canChangeToInt(SDValue Op, bool &SeenZero,
                            const ARMSubtarget *Subtarget) {
   SDNode *Node = Op.getNode();
 
   // With more than one use the value would have to be moved from fp to
   // integer registers; a node producing no values cannot be used either.
   if (!Node->hasOneUse() || Node->getNumValues() == 0)
     return false;
 
   // f32 case is generally profitable. f64 case only makes sense when vcmpe +
   // vmrs are very slow, e.g. cortex-a8.
   const bool IsF32 = Op.getValueType() == MVT::f32;
   if (!IsF32 && !Subtarget->isFPBrccSlow())
     return false;
 
   // Anything that is not a zero must be a normal load to qualify.
   if (!isFloatingPointZero(Op))
     return ISD::isNormalLoad(Node);
 
   SeenZero = true;
   return true;
 }
 
 /// Produce an i32 value for the f32 compare operand \p Op: the constant 0
 /// for a floating-point zero, or an i32 load from the same address for a
 /// load.  Any other operand is unreachable.
 static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
   if (!isFloatingPointZero(Op)) {
     LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
     if (!Load)
       llvm_unreachable("Unknown VFP cmp argument!");
 
     // Re-issue the load as i32 with the original chain, pointer info,
     // alignment and memory-operand flags.
     return DAG.getLoad(MVT::i32, SDLoc(Op), Load->getChain(),
                        Load->getBasePtr(), Load->getPointerInfo(),
                        Load->getAlignment(),
                        Load->getMemOperand()->getFlags());
   }
 
   return DAG.getConstant(0, SDLoc(Op), MVT::i32);
 }
 
 /// Split the f64 compare operand \p Op into two i32 values.  \p RetVal1
 /// receives the word at the operand's address and \p RetVal2 the word four
 /// bytes above it; a floating-point zero yields the constant 0 for both.
 /// Any operand that is neither a zero nor a load is unreachable.
 static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
                            SDValue &RetVal1, SDValue &RetVal2) {
   SDLoc dl(Op);
 
   if (isFloatingPointZero(Op)) {
     RetVal1 = DAG.getConstant(0, dl, MVT::i32);
     RetVal2 = DAG.getConstant(0, dl, MVT::i32);
     return;
   }
 
   LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
   if (!Load)
     llvm_unreachable("Unknown VFP cmp argument!");
 
   // First word: same address, alignment and flags as the original load.
   SDValue BasePtr = Load->getBasePtr();
   RetVal1 = DAG.getLoad(MVT::i32, dl, Load->getChain(), BasePtr,
                         Load->getPointerInfo(), Load->getAlignment(),
                         Load->getMemOperand()->getFlags());
 
   // Second word: base + 4, with the alignment reduced accordingly.
   EVT PtrVT = BasePtr.getValueType();
   unsigned SecondAlign = MinAlign(Load->getAlignment(), 4);
   SDValue FourBytes = DAG.getConstant(4, dl, PtrVT);
   SDValue SecondPtr = DAG.getNode(ISD::ADD, dl, PtrVT, BasePtr, FourBytes);
   RetVal2 = DAG.getLoad(MVT::i32, dl, Load->getChain(), SecondPtr,
                         Load->getPointerInfo().getWithOffset(4), SecondAlign,
                         Load->getMemOperand()->getFlags());
 }
 
 /// OptimizeVFPBrcond - With -enable-unsafe-fp-math, it's legal to optimize some
 /// f32 and even f64 comparisons to integer ones.  Returns an empty SDValue
 /// when the operands do not qualify.
 SDValue
 ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
   // Operand layout read here: (chain, condition code, lhs, rhs, destination).
   SDValue Chain = Op.getOperand(0);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
   SDValue LHS = Op.getOperand(2);
   SDValue RHS = Op.getOperand(3);
   SDValue Dest = Op.getOperand(4);
   SDLoc dl(Op);
 
   // Both operands must be convertible and at least one must be a zero.
   bool LHSSeenZero = false;
   bool RHSSeenZero = false;
   const bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
   const bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
   if (!LHSOk || !RHSOk || !(LHSSeenZero || RHSSeenZero))
     return SDValue();
 
   // If unsafe fp math optimization is enabled and there are no other uses of
   // the CMP operands, and the condition code is EQ or NE, we can optimize it
   // to an integer comparison.
   switch (CC) {
   case ISD::SETOEQ:
     CC = ISD::SETEQ;
     break;
   case ISD::SETUNE:
     CC = ISD::SETNE;
     break;
   default:
     break;
   }
 
   // Mask that clears the top bit of a 32-bit word.
   SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
   SDValue ARMcc;
 
   if (LHS.getValueType() != MVT::f32) {
     // Non-f32 operands are split into two words each; only the second word
     // of each operand is masked.
     SDValue LHS1, LHS2;
     SDValue RHS1, RHS2;
     expandf64Toi32(LHS, DAG, LHS1, LHS2);
     expandf64Toi32(RHS, DAG, RHS1, RHS2);
     LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
     RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
     ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
     ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
     SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
     return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
   }
 
   // f32: compare the masked 32-bit integer images of both operands.
   SDValue LHSBits = bitcastf32Toi32(LHS, DAG);
   LHS = DAG.getNode(ISD::AND, dl, MVT::i32, LHSBits, Mask);
   SDValue RHSBits = bitcastf32Toi32(RHS, DAG);
   RHS = DAG.getNode(ISD::AND, dl, MVT::i32, RHSBits, Mask);
   SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                      Chain, Dest, ARMcc, CCR, Cmp);
 }
 
 /// Custom lowering for BRCOND, reading operands as (chain, condition,
 /// destination).  Only a condition that is the overflow result of a legal
 /// {s|u}{add|sub|mul}.with.overflow node is handled; every other case
 /// returns an empty SDValue.
 SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   SDValue Cond = Op.getOperand(1);
   SDValue Dest = Op.getOperand(2);
   SDLoc dl(Op);
 
   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
   // instruction.  The multiply forms are excluded on Thumb1-only targets.
   bool IsFoldableXALUO;
   switch (Cond.getOpcode()) {
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO:
   case ISD::USUBO:
     IsFoldableXALUO = true;
     break;
   case ISD::SMULO:
   case ISD::UMULO:
     IsFoldableXALUO = !Subtarget->isThumb1Only();
     break;
   default:
     IsFoldableXALUO = false;
     break;
   }
 
   // The branch must test result number 1 of that node.
   if (!IsFoldableXALUO || Cond.getResNo() != 1)
     return SDValue();
 
   // Only lower legal XALUO ops.
   if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
     return SDValue();
 
   // The actual operation with overflow check.
   SDValue Value, OverflowCmp;
   SDValue ARMcc;
   std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
 
   // Reverse the condition code.
   const ARMCC::CondCodes ReturnedCC =
       (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
   const ARMCC::CondCodes BranchCC = ARMCC::getOppositeCondition(ReturnedCC);
   ARMcc = DAG.getConstant(BranchCC, SDLoc(ARMcc), MVT::i32);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
 
   return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                      OverflowCmp);
 }
 
 /// Custom lowering for BR_CC, reading operands as (chain, condition code,
 /// lhs, rhs, destination).
 ///
 /// The lowering is attempted in this order:
 ///  1. an overflow result of a legal {s|u}{add|sub|mul}.with.overflow node
 ///     compared for (in)equality with 0 or 1 -> ARMISD::BRCOND on the
 ///     overflow compare,
 ///  2. an i32 comparison -> getARMCmp + ARMISD::BRCOND,
 ///  3. with UnsafeFPMath and an (in)equality condition, OptimizeVFPBrcond,
 ///  4. a VFP compare followed by one or two ARMISD::BRCOND nodes.
 SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
   SDValue LHS = Op.getOperand(2);
   SDValue RHS = Op.getOperand(3);
   SDValue Dest = Op.getOperand(4);
   SDLoc dl(Op);
 
   // With single-precision-only FP, an f64 comparison is softened first;
   // LHS, RHS and CC are updated in place by softenSetCCOperands.
   if (Subtarget->isFPOnlySP() && LHS.getValueType() == MVT::f64) {
     DAG.getTargetLoweringInfo().softenSetCCOperands(DAG, MVT::f64, LHS, RHS, CC,
                                                     dl);
 
     // If softenSetCCOperands only returned one value, we should compare it to
     // zero.
     if (!RHS.getNode()) {
       RHS = DAG.getConstant(0, dl, LHS.getValueType());
       CC = ISD::SETNE;
     }
   }
 
   // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
   // instruction.
   unsigned Opc = LHS.getOpcode();
   // The multiply forms are excluded on Thumb1-only targets.
   bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
                       !Subtarget->isThumb1Only();
   if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
       (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
        Opc == ISD::USUBO || OptimizeMul) &&
       (CC == ISD::SETEQ || CC == ISD::SETNE)) {
     // Only lower legal XALUO ops.
     if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
       return SDValue();
 
     // The actual operation with overflow check.
     SDValue Value, OverflowCmp;
     SDValue ARMcc;
     std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
 
     // The condition is reversed for "overflow != 0" and "overflow == 1" and
     // kept as returned for "overflow != 1" and "overflow == 0".
     if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
       // Reverse the condition code.
       ARMCC::CondCodes CondCode =
           (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
       CondCode = ARMCC::getOppositeCondition(CondCode);
       ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
     }
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
 
     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, CCR,
                        OverflowCmp);
   }
 
   // Plain integer comparison.
   if (LHS.getValueType() == MVT::i32) {
     SDValue ARMcc;
     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
     return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
                        Chain, Dest, ARMcc, CCR, Cmp);
   }
 
   // Under unsafe FP math, (in)equality comparisons may be turned into
   // integer comparisons; fall through if OptimizeVFPBrcond declines.
   if (getTargetMachine().Options.UnsafeFPMath &&
       (CC == ISD::SETEQ || CC == ISD::SETOEQ ||
        CC == ISD::SETNE || CC == ISD::SETUNE)) {
     if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
       return Result;
   }
 
   // Generic floating-point path.  FPCCToARMCC fills in up to two ARM
   // condition codes; CondCode2 is compared against ARMCC::AL below to decide
   // whether a second branch is needed.
   ARMCC::CondCodes CondCode, CondCode2;
   bool InvalidOnQNaN;
   FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
 
   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
   SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
   if (CondCode2 != ARMCC::AL) {
     // The second branch is chained on the first one and consumes its glue
     // result (value 1) in place of a new compare.
     ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
     SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
     Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
   }
   return Res;
 }
 
 /// Custom lowering for BR_JT, reading operands as (chain, jump table, index).
 /// Emits ARMISD::BR2_JT for Thumb2 / ARMv8-M Baseline Thumb and
 /// ARMISD::BR_JT otherwise.
 SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   SDValue Table = Op.getOperand(1);
   SDValue Index = Op.getOperand(2);
   SDLoc dl(Op);
 
   EVT PTy = getPointerTy(DAG.getDataLayout());
   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
   SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
   Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
 
   // Address of the selected entry: table + index * 4.
   Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
   SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
 
   const bool UseTwoLevelJump =
       Subtarget->isThumb2() ||
       (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb());
   if (UseTwoLevelJump) {
     // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
     // which does another jump to the destination. This also makes it easier
     // to translate it to TBB / TBH later (Thumb2 only).
     // FIXME: This might not work if the function is extremely large.
     return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
                        Addr, Op.getOperand(2), JTI);
   }
 
   // Otherwise load the entry.  For position-independent or ROPI code the
   // loaded value is added to the table address before branching.
   const bool AddTableToEntry = isPositionIndependent() || Subtarget->isROPI();
   EVT EntryVT = AddTableToEntry ? (EVT)MVT::i32 : PTy;
   SDValue Target =
       DAG.getLoad(EntryVT, dl, Chain, Addr,
                   MachinePointerInfo::getJumpTable(DAG.getMachineFunction()));
   Chain = Target.getValue(1);
   if (AddTableToEntry)
     Target = DAG.getNode(ISD::ADD, dl, PTy, Table, Target);
   return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Target, JTI);
 }
 
 /// Lower a vector FP-to-integer conversion.  Conversions to i32 elements are
 /// kept as-is when the source elements are f32 and unrolled otherwise.  The
 /// remaining case expects a v4f32 source: v4f32 -> v4i16 is done as a
 /// conversion to v4i32 followed by a truncate, anything else is unrolled.
 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
   EVT ResVT = Op.getValueType();
   EVT SrcVT = Op.getOperand(0).getValueType();
   SDLoc dl(Op);
 
   if (ResVT.getVectorElementType() == MVT::i32)
     return SrcVT.getVectorElementType() == MVT::f32
                ? Op
                : DAG.UnrollVectorOp(Op.getNode());
 
   assert(Op.getOperand(0).getValueType() == MVT::v4f32 &&
          "Invalid type for custom lowering!");
   if (ResVT != MVT::v4i16)
     return DAG.UnrollVectorOp(Op.getNode());
 
   SDValue Wide = DAG.getNode(Op.getOpcode(), dl, MVT::v4i32, Op.getOperand(0));
   return DAG.getNode(ISD::TRUNCATE, dl, ResVT, Wide);
 }
 
 /// Custom lowering for FP_TO_SINT / FP_TO_UINT.  Vector results go to
 /// LowerVectorFP_TO_INT; an f64 source on a single-precision-only FPU
 /// becomes a runtime library call; everything else is returned unchanged.
 SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
   const EVT ResVT = Op.getValueType();
   if (ResVT.isVector())
     return LowerVectorFP_TO_INT(Op, DAG);
 
   SDValue Src = Op.getOperand(0);
   const EVT SrcVT = Src.getValueType();
   if (!Subtarget->isFPOnlySP() || SrcVT != MVT::f64)
     return Op;
 
   const RTLIB::Libcall LC = Op.getOpcode() == ISD::FP_TO_SINT
                                 ? RTLIB::getFPTOSINT(SrcVT, ResVT)
                                 : RTLIB::getFPTOUINT(SrcVT, ResVT);
   return makeLibCall(DAG, LC, ResVT, Src,
                      /*isSigned*/ false, SDLoc(Op)).first;
 }
 
 /// Lower a vector integer-to-FP conversion.  Conversions from i32 elements
 /// are kept as-is when the result elements are f32 and unrolled otherwise.
 /// The remaining case expects a v4i16 source: v4i16 -> v4f32 is done by
 /// extending to v4i32 first, anything else is unrolled.
 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
   EVT ResVT = Op.getValueType();
   SDLoc dl(Op);
 
   if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32)
     return ResVT.getVectorElementType() == MVT::f32
                ? Op
                : DAG.UnrollVectorOp(Op.getNode());
 
   assert(Op.getOperand(0).getValueType() == MVT::v4i16 &&
          "Invalid type for custom lowering!");
   if (ResVT != MVT::v4f32)
     return DAG.UnrollVectorOp(Op.getNode());
 
   // A signed conversion needs a sign extension, an unsigned one a zero
   // extension; the conversion opcode itself is reused unchanged.
   const unsigned ConvOpc = Op.getOpcode();
   unsigned ExtOpc;
   if (ConvOpc == ISD::SINT_TO_FP)
     ExtOpc = ISD::SIGN_EXTEND;
   else if (ConvOpc == ISD::UINT_TO_FP)
     ExtOpc = ISD::ZERO_EXTEND;
   else
     llvm_unreachable("Invalid opcode!");
 
   SDValue Wide = DAG.getNode(ExtOpc, dl, MVT::v4i32, Op.getOperand(0));
   return DAG.getNode(ConvOpc, dl, ResVT, Wide);
 }
 
 /// Custom lowering for SINT_TO_FP / UINT_TO_FP.  Vector results go to
 /// LowerVectorINT_TO_FP; an f64 result on a single-precision-only FPU
 /// becomes a runtime library call; everything else is returned unchanged.
 SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
   const EVT ResVT = Op.getValueType();
   if (ResVT.isVector())
     return LowerVectorINT_TO_FP(Op, DAG);
 
   if (!Subtarget->isFPOnlySP() || ResVT != MVT::f64)
     return Op;
 
   SDValue Src = Op.getOperand(0);
   const EVT SrcVT = Src.getValueType();
   const RTLIB::Libcall LC = Op.getOpcode() == ISD::SINT_TO_FP
                                 ? RTLIB::getSINTTOFP(SrcVT, ResVT)
                                 : RTLIB::getUINTTOFP(SrcVT, ResVT);
   return makeLibCall(DAG, LC, ResVT, Src,
                      /*isSigned*/ false, SDLoc(Op)).first;
 }
 
 /// Custom lowering for FCOPYSIGN: the result has the magnitude of operand 0
 /// and the sign of operand 1.  The result type (VT) and the type of the sign
 /// operand (SrcVT) are handled independently, each as f32 or f64.
 ///
 /// Two strategies are used: a NEON mask/select sequence, or integer AND/OR
 /// on the word that holds the sign bit.
 SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   // Implement fcopysign with a fabs and a conditional fneg.
   SDValue Tmp0 = Op.getOperand(0);
   SDValue Tmp1 = Op.getOperand(1);
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
   EVT SrcVT = Tmp1.getValueType();
   // NOTE(review): a BITCAST or VMOVDRR producer is taken as a sign that
   // operand 0 currently lives in core registers -- confirm.
   bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
     Tmp0.getOpcode() == ARMISD::VMOVDRR;
   bool UseNEON = !InGPR && Subtarget->hasNEON();
 
   if (UseNEON) {
     // Use VBSL to copy the sign bit.
     // NOTE(review): the encoded immediate is presumably the per-lane sign-bit
     // mask 0x80000000 -- confirm against createNEONModImm.
     unsigned EncodedVal = ARM_AM::createNEONModImm(0x6, 0x80);
     SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
                                DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
     // All bit operations below are done in OpVT: v2i32 for an f32 result,
     // v1i64 for an f64 result.
     EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
     if (VT == MVT::f64)
       // f64 result: shift the mask left by 32 within the 64-bit element.
       Mask = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                          DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
                          DAG.getConstant(32, dl, MVT::i32));
     else /*if (VT == MVT::f32)*/
       Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
     if (SrcVT == MVT::f32) {
       Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
       // f32 sign source, f64 result: shift the sign operand left by 32 so it
       // lines up with the shifted mask.
       if (VT == MVT::f64)
         Tmp1 = DAG.getNode(ARMISD::VSHL, dl, OpVT,
                            DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
                            DAG.getConstant(32, dl, MVT::i32));
     } else if (VT == MVT::f32)
       // f64 sign source, f32 result: shift the sign operand right by 32.
       Tmp1 = DAG.getNode(ARMISD::VSHRu, dl, MVT::v1i64,
                          DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
                          DAG.getConstant(32, dl, MVT::i32));
     Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
     Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
 
     // MaskNot = Mask ^ AllOnes, i.e. the complement of Mask.
     SDValue AllOnes = DAG.getTargetConstant(ARM_AM::createNEONModImm(0xe, 0xff),
                                             dl, MVT::i32);
     AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
     SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
                                   DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
 
     // Result = (sign & Mask) | (magnitude & ~Mask).
     SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
                               DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
                               DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
     if (VT == MVT::f32) {
       // Extract element 0 of the v2f32 view as the scalar result.
       Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
       Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
                         DAG.getConstant(0, dl, MVT::i32));
     } else {
       Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
     }
 
     return Res;
   }
 
   // Bitcast operand 1 to i32.
   // For an f64 sign source, result 1 of VMOVRRD is used as the word that
   // carries the sign.
   if (SrcVT == MVT::f64)
     Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                        Tmp1).getValue(1);
   Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
 
   // Or in the signbit with integer operations.
   // Mask1 keeps only the sign bit, Mask2 keeps everything but the sign bit.
   SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
   SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
   Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
   if (VT == MVT::f32) {
     Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
                        DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
     return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
                        DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
   }
 
   // f64: Or the high part with signbit and then combine two parts.
   Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
                      Tmp0);
   SDValue Lo = Tmp0.getValue(0);
   SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
   Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
 }
 
 /// Custom lowering for RETURNADDR.  Depth 0 yields a copy from LR (added as
 /// a function live-in); a non-zero depth loads from the frame address
 /// produced by LowerFRAMEADDR plus 4.
 SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
   MachineFunction &MF = DAG.getMachineFunction();
   MF.getFrameInfo().setReturnAddressIsTaken(true);
 
   if (verifyReturnAddressArgumentIsConstant(Op, DAG))
     return SDValue();
 
   EVT VT = Op.getValueType();
   SDLoc dl(Op);
   const unsigned Depth =
       cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 
   if (Depth == 0) {
     // Return LR, which contains the return address. Mark it an implicit
     // live-in.
     unsigned LRReg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
     return DAG.getCopyFromReg(DAG.getEntryNode(), dl, LRReg, VT);
   }
 
   SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
   SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
   SDValue SlotAddr = DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset);
   return DAG.getLoad(VT, dl, DAG.getEntryNode(), SlotAddr,
                      MachinePointerInfo());
 }
 
 /// Custom lowering for FRAMEADDR.  Starts from a copy of the frame register
 /// and performs one load through the current address for every level of the
 /// requested depth.
 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   const ARMBaseRegisterInfo &ARI =
     *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
   MachineFunction &MF = DAG.getMachineFunction();
   MF.getFrameInfo().setFrameAddressIsTaken(true);
 
   EVT VT = Op.getValueType();
   SDLoc dl(Op);  // FIXME probably not meaningful
   const unsigned Depth =
       cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   const unsigned FrameReg = ARI.getFrameRegister(MF);
 
   SDValue Addr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
   for (unsigned Level = 0; Level != Depth; ++Level)
     Addr = DAG.getLoad(VT, dl, DAG.getEntryNode(), Addr,
                        MachinePointerInfo());
   return Addr;
 }
 
 // Map a register name to its register number.  Only "sp" is recognised;
 // any other name is a fatal error.
 // FIXME? Maybe this could be a TableGen attribute on some registers and
 // this table could be generated automatically from RegInfo.
 unsigned ARMTargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                               SelectionDAG &DAG) const {
   const unsigned Found = StringSwitch<unsigned>(RegName)
                              .Case("sp", ARM::SP)
                              .Default(0);
   if (!Found)
     report_fatal_error(Twine("Invalid register name \""
                               + StringRef(RegName)  + "\"."));
   return Found;
 }
 
 // Result is 64 bit value so split into two 32 bit values and return as a
 // pair of values.
 //
 // N is a READ_REGISTER node with an i64 result.  It is re-created with two
 // i32 results plus a chain, and two replacement values are appended to
 // Results: the i64 BUILD_PAIR of the two halves, and the chain.
 static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                 SelectionDAG &DAG) {
   SDLoc DL(N);
 
   // This function is only supposed to be called for i64 type destination.
   assert(N->getValueType(0) == MVT::i64
           && "ExpandREAD_REGISTER called for non-i64 type result.");
 
   // Same operands as the original node (N's operands 0 and 1); only the
   // result list changes.
   SDValue Read = DAG.getNode(ISD::READ_REGISTER, DL,
                              DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
                              N->getOperand(0),
                              N->getOperand(1));
 
   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
                     Read.getValue(1)));
   // Hand back the chain produced by the new read (its third result), not the
   // chain it consumed: users of the old node's chain must stay ordered after
   // the register read instead of bypassing it.
   Results.push_back(Read.getValue(2));
 }
 
 /// \p BC is a bitcast that is about to be turned into a VMOVDRR.
 /// When the destination type of \p BC is a vector and its source is a
 /// single-use EXTRACT_VECTOR_ELT with a constant index, the pair can be
 /// rewritten as a bitcast of the whole source vector followed by an
 /// EXTRACT_SUBVECTOR, so that everything stays on the vector register bank.
 /// \returns the node that would replace \p BC if the combine is possible,
 /// and an empty SDValue otherwise.
 static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
                                                 SelectionDAG &DAG) {
   SDValue Src = BC->getOperand(0);
   EVT DstVT = BC->getValueType(0);
 
   // If the destination type is not a vector, there is not much point in
   // forcing everything on the vector bank.
   if (!DstVT.isVector())
     return SDValue();
 
   // The only vector instruction that can produce a scalar (remember, since
   // the bitcast was about to be turned into VMOVDRR, the source type is i64)
   // from a vector is EXTRACT_VECTOR_ELT, and it must have a single use.
   if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !Src.hasOneUse())
     return SDValue();
 
   // A non-constant index would introduce an additional multiply that will
   // stick, so give up in that case.
   ConstantSDNode *EltIndex = dyn_cast<ConstantSDNode>(Src.getOperand(1));
   if (!EltIndex)
     return SDValue();
 
   // New index = old index * number of destination elements, computed at the
   // bit width of the original index constant.
   const unsigned DstNumElt = DstVT.getVectorNumElements();
   const APInt &OldIndex = EltIndex->getAPIntValue();
   APInt ScaledIndex(OldIndex.getBitWidth(), DstNumElt);
   ScaledIndex *= OldIndex;
   // Check if the new constant index fits into i32.
   if (ScaledIndex.getBitWidth() > 32)
     return SDValue();
 
   // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
   // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
   SDLoc dl(Src);
   SDValue WholeVec = Src.getOperand(0);
   const unsigned WideNumElt =
       WholeVec.getValueType().getVectorNumElements() * DstNumElt;
   EVT WideVT =
       EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), WideNumElt);
   SDValue WideCast = DAG.getNode(ISD::BITCAST, dl, WideVT, WholeVec);
   SDValue SubIdx = DAG.getConstant(ScaledIndex.getZExtValue(), dl, MVT::i32);
   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, WideCast, SubIdx);
 }
 
 /// ExpandBITCAST - If the target supports VFP, this function is called to
 /// expand a bit convert where either the source or destination type is i64 to
 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
 /// operand type is illegal (e.g., v2f32 for a target that doesn't support
 /// vectors), since the legalizer won't know what to do with that.
 ///
 /// Besides the i64 cases, three half-precision related patterns are handled
 /// first: f32->i32 feeding a VMOVhr, i16->f16 of a truncated i32, and
 /// f16->i16 that is zero-extended into a returned register.  An empty
 /// SDValue is returned whenever no rewrite applies.
 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
                              const ARMSubtarget *Subtarget) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc dl(N);
   SDValue Op = N->getOperand(0);
 
   // This function is only supposed to be called for i64 types, either as the
   // source or destination of the bit convert.
   EVT SrcVT = Op.getValueType();
   EVT DstVT = N->getValueType(0);
   const bool HasFullFP16 = Subtarget->hasFullFP16();
 
   if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
      // FullFP16: half values are passed in S-registers, and we don't
      // need any of the bitcast and moves:
      //
      // t2: f32,ch = CopyFromReg t0, Register:f32 %0
      //   t5: i32 = bitcast t2
      // t18: f16 = ARMISD::VMOVhr t5
      if (Op.getOpcode() != ISD::CopyFromReg ||
          Op.getValueType() != MVT::f32)
        return SDValue();
 
      // NOTE(review): only the first use is inspected, and use_begin() is
      // dereferenced without checking that N has any use -- confirm callers
      // guarantee at least one.  HasFullFP16 is not tested on this path.
      auto Move = N->use_begin();
      if (Move->getOpcode() != ARMISD::VMOVhr)
        return SDValue();
 
      // Re-create the CopyFromReg with an f16 result and redirect the users
      // of the VMOVhr to it.
      SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
      SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
      DAG.ReplaceAllUsesWith(*Move, &Copy);
      return Copy;
   }
 
   if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
     if (!HasFullFP16)
       return SDValue();
     // SoftFP: read half-precision arguments:
     //
     // t2: i32,ch = ...
     //        t7: i16 = truncate t2 <~~~~ Op
     //      t8: f16 = bitcast t7    <~~~~ N
     //
     // NOTE(review): Op's opcode is not checked here; only the type of its
     // first operand is -- confirm Op is always a truncate on this path.
     if (Op.getOperand(0).getValueType() == MVT::i32)
       return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
                          MVT::f16, Op.getOperand(0));
 
     return SDValue();
   }
 
   // Half-precision return values
   if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
     if (!HasFullFP16)
       return SDValue();
     //
     //          t11: f16 = fadd t8, t10
     //        t12: i16 = bitcast t11       <~~~ SDNode N
     //      t13: i32 = zero_extend t12
     //    t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
     //  t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
     //
     // transform this into:
     //
     //    t20: i32 = ARMISD::VMOVrh t11
     //  t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
     //
     // N must have exactly one use, and that use must be a zero_extend to i32.
     auto ZeroExtend = N->use_begin();
     if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
         ZeroExtend->getValueType(0) != MVT::i32)
       return SDValue();
 
     // NOTE(review): the first use of the zero_extend and the first use of
     // the CopyToReg are dereferenced without an emptiness check -- confirm
     // both always have at least one use here.
     auto Copy = ZeroExtend->use_begin();
     if (Copy->getOpcode() == ISD::CopyToReg &&
         Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
       SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
       DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
       return Cvt;
     }
     return SDValue();
   }
 
   // From here on, one side of the bitcast must be i64.
   if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
     return SDValue();
 
   // Turn i64->f64 into VMOVDRR.
   if (SrcVT == MVT::i64 && TLI.isTypeLegal(DstVT)) {
     // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
     // if we can combine the bitcast with its source.
     if (SDValue Val = CombineVMOVDRRCandidateWithVecOp(N, DAG))
       return Val;
 
     // Split the i64 into elements 0 and 1, rebuild an f64 with VMOVDRR and
     // bitcast that to the requested destination type.
     SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                              DAG.getConstant(0, dl, MVT::i32));
     SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
                              DAG.getConstant(1, dl, MVT::i32));
     return DAG.getNode(ISD::BITCAST, dl, DstVT,
                        DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
   }
 
   // Turn f64->i64 into VMOVRRD.
   if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
     SDValue Cvt;
     // On big-endian targets a multi-element vector source goes through
     // VREV64 before being split.
     if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
         SrcVT.getVectorNumElements() > 1)
       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                         DAG.getVTList(MVT::i32, MVT::i32),
                         DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
     else
       Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
                         DAG.getVTList(MVT::i32, MVT::i32), Op);
     // Merge the pieces into a single i64 value.
     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
   }
 
   return SDValue();
 }
 
 /// getZeroVector - Build an all-zero vector of type \p VT.
 /// Zero vectors are used to represent vector negation and in those cases
 /// will be implemented with the NEON VNEG instruction.  However, VNEG does
 /// not support i64 elements, so sometimes the zero vectors will need to be
 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
 /// zero vector: a VMOVIMM of v4i32 (128-bit \p VT) or v2i32 (otherwise),
 /// bitcast to \p VT.
 static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
   assert(VT.isVector() && "Expected a vector type");
 
   EVT VmovVT = MVT::v2i32;
   if (VT.is128BitVector())
     VmovVT = MVT::v4i32;
 
   // The canonical modified immediate encoding of a zero vector is....0!
   SDValue ZeroImm = DAG.getTargetConstant(0, dl, MVT::i32);
   return DAG.getNode(ISD::BITCAST, dl, VT,
                      DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, ZeroImm));
 }
 
 /// LowerShiftRightParts - Lower SRA_PARTS / SRL_PARTS.  The node takes a
 /// double-width value as two parts (low, high) plus a shift amount and
 /// returns the two parts of the shifted value, merged as { Lo, Hi }.
 ///
 /// Both results are selected with a CMOV on (ShAmt - VTBits) >= 0, i.e. on
 /// whether the shift crosses the boundary between the two parts.
 SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   EVT VT = Op.getValueType();
   unsigned VTBits = VT.getSizeInBits();
   SDLoc dl(Op);
   SDValue InLo = Op.getOperand(0);
   SDValue InHi = Op.getOperand(1);
   SDValue Amt = Op.getOperand(2);
   SDValue ARMcc;
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
 
   // Arithmetic for SRA_PARTS, logical for SRL_PARTS.
   unsigned ShiftOpc = ISD::SRL;
   if (Op.getOpcode() == ISD::SRA_PARTS)
     ShiftOpc = ISD::SRA;
 
   assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
 
   // Low result.  "Small" shift: (Lo >>u Amt) | (Hi << (VTBits - Amt)).
   // "Big" shift: Hi >> (Amt - VTBits).
   SDValue Width = DAG.getConstant(VTBits, dl, MVT::i32);
   SDValue AmtFromTop = DAG.getNode(ISD::SUB, dl, MVT::i32, Width, Amt);
   SDValue LoShifted = DAG.getNode(ISD::SRL, dl, VT, InLo, Amt);
   SDValue AmtPastWidth = DAG.getNode(ISD::SUB, dl, MVT::i32, Amt, Width);
   SDValue HiSpill = DAG.getNode(ISD::SHL, dl, VT, InHi, AmtFromTop);
   SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, LoShifted, HiSpill);
   SDValue LoBigShift = DAG.getNode(ShiftOpc, dl, VT, InHi, AmtPastWidth);
   SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
   SDValue CmpLo = getARMCmp(AmtPastWidth, Zero, ISD::SETGE, ARMcc, DAG, dl);
   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift,
                            ARMcc, CCR, CmpLo);
 
   // High result.  "Small" shift: Hi >> Amt.  "Big" shift: the sign fill
   // (Hi >>s (VTBits - 1)) for SRA, or zero for SRL.
   SDValue HiSmallShift = DAG.getNode(ShiftOpc, dl, VT, InHi, Amt);
   SDValue HiBigShift;
   if (ShiftOpc == ISD::SRA)
     HiBigShift = DAG.getNode(ShiftOpc, dl, VT, InHi,
                              DAG.getConstant(VTBits - 1, dl, VT));
   else
     HiBigShift = DAG.getConstant(0, dl, VT);
 
   // A second compare is emitted for the second CMOV rather than sharing
   // CmpLo.
   SDValue CmpHi = getARMCmp(AmtPastWidth, DAG.getConstant(0, dl, MVT::i32),
                             ISD::SETGE, ARMcc, DAG, dl);
   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                            ARMcc, CCR, CmpHi);
 
   SDValue Parts[2] = { Lo, Hi };
   return DAG.getMergeValues(Parts, dl);
 }
 
 /// LowerShiftLeftParts - Lower SHL_PARTS.  The node takes a double-width
 /// value as two parts (low, high) plus a shift amount and returns the two
 /// parts of the shifted value, merged as { Lo, Hi }.
 ///
 /// Both results are selected with a CMOV on (ShAmt - VTBits) >= 0, i.e. on
 /// whether the shift crosses the boundary between the two parts.
 SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
                                                SelectionDAG &DAG) const {
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   EVT VT = Op.getValueType();
   unsigned VTBits = VT.getSizeInBits();
   SDLoc dl(Op);
   SDValue InLo = Op.getOperand(0);
   SDValue InHi = Op.getOperand(1);
   SDValue Amt = Op.getOperand(2);
   SDValue ARMcc;
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
 
   assert(Op.getOpcode() == ISD::SHL_PARTS);
 
   // High result, "small" shift: (Lo >>u (VTBits - Amt)) | (Hi << Amt).
   SDValue Width = DAG.getConstant(VTBits, dl, MVT::i32);
   SDValue AmtFromTop = DAG.getNode(ISD::SUB, dl, MVT::i32, Width, Amt);
   SDValue LoSpill = DAG.getNode(ISD::SRL, dl, VT, InLo, AmtFromTop);
   SDValue HiShifted = DAG.getNode(ISD::SHL, dl, VT, InHi, Amt);
   SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, LoSpill, HiShifted);
 
   // High result, "big" shift: Lo << (Amt - VTBits).
   SDValue AmtPastWidth = DAG.getNode(ISD::SUB, dl, MVT::i32, Amt, Width);
   SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, InLo, AmtPastWidth);
   SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
   SDValue CmpHi = getARMCmp(AmtPastWidth, Zero, ISD::SETGE, ARMcc, DAG, dl);
   SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift,
                            ARMcc, CCR, CmpHi);
 
   // Low result: Lo << Amt for a "small" shift, zero for a "big" one.  A
   // second compare is emitted for the second CMOV rather than sharing CmpHi.
   SDValue CmpLo = getARMCmp(AmtPastWidth, DAG.getConstant(0, dl, MVT::i32),
                             ISD::SETGE, ARMcc, DAG, dl);
   SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, InLo, Amt);
   SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
                            DAG.getConstant(0, dl, VT), ARMcc, CCR, CmpLo);
 
   SDValue Parts[2] = { Lo, Hi };
   return DAG.getMergeValues(Parts, dl);
 }
 
 /// LowerFLT_ROUNDS_ - Lower FLT_ROUNDS_ by reading the FPSCR through the
 /// arm_get_fpscr intrinsic and remapping its rounding-mode field.
 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                             SelectionDAG &DAG) const {
   // The rounding mode is in bits 23:22 of the FPSCR.
   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
   // The formula we use to implement this is ((FPSCR + (1 << 22)) >> 22) & 3,
   // so that the shift and the AND get folded into a bitfield extract.
   SDLoc dl(Op);
   SDValue ReadOps[] = { DAG.getEntryNode(),
                         DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
                                         MVT::i32) };
   SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, ReadOps);
 
   // Adding one at bit 22 advances the 2-bit mode field by one; any carry out
   // of the field is discarded by the final mask.
   SDValue OneAtBit22 = DAG.getConstant(1U << 22, dl, MVT::i32);
   SDValue Bumped = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR, OneAtBit22);
 
   SDValue FieldShift = DAG.getConstant(22, dl, MVT::i32);
   SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i32, Bumped, FieldShift);
 
   SDValue FieldMask = DAG.getConstant(3, dl, MVT::i32);
   return DAG.getNode(ISD::AND, dl, MVT::i32, Shifted, FieldMask);
 }
 
 /// LowerCTTZ - Custom lowering for CTTZ / CTTZ_ZERO_UNDEF.
 ///
 /// Scalars are lowered to ctlz(bitreverse(x)) when the subtarget reports
 /// hasV6T2Ops(); otherwise an empty SDValue is returned.  Vectors (NEON is
 /// asserted) first isolate the least significant set bit and then use one of:
 ///   * cttz(x) = ctpop(lsb - 1), or
 ///   * cttz(x) = (width - 1) - ctlz(lsb) for i16/i32 elements when a zero
 ///     input is undefined.
 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
 
   if (!VT.isVector()) {
     if (!ST->hasV6T2Ops())
       return SDValue();
 
     SDValue Reversed = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
     return DAG.getNode(ISD::CTLZ, dl, VT, Reversed);
   }
 
   assert(ST->hasNEON());
 
   // Builds a VMOVIMM of the result type from the given target constant.
   auto MakeVMOV = [&](uint64_t ImmVal, EVT ImmTy) -> SDValue {
     return DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                        DAG.getTargetConstant(ImmVal, dl, ImmTy));
   };
   // Builds an unsigned pairwise-add-long (vpaddl) producing type WideVT.
   auto PairwiseAddLong = [&](EVT WideVT, SDValue In) -> SDValue {
     return DAG.getNode(
         ISD::INTRINSIC_WO_CHAIN, dl, WideVT,
         DAG.getTargetConstant(Intrinsic::arm_neon_vpaddlu, dl, MVT::i32), In);
   };
 
   // Compute the least significant set bit: LSB = X & -X
   SDValue Src = N->getOperand(0);
   SDValue Negated =
       DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), Src);
   SDValue LSB = DAG.getNode(ISD::AND, dl, VT, Src, Negated);
 
   EVT ElemTy = VT.getVectorElementType();
 
   if (ElemTy == MVT::i8) {
     // Compute with: cttz(x) = ctpop(lsb - 1)
     SDValue One = MakeVMOV(1, ElemTy);
     SDValue BelowLSB = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
     return DAG.getNode(ISD::CTPOP, dl, VT, BelowLSB);
   }
 
   const bool ZeroIsUndef = N->getOpcode() == ISD::CTTZ_ZERO_UNDEF;
   if (ZeroIsUndef && (ElemTy == MVT::i16 || ElemTy == MVT::i32)) {
     // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
     unsigned NumBits = ElemTy.getSizeInBits();
     SDValue WidthMinus1 = MakeVMOV(NumBits - 1, ElemTy);
     SDValue LeadingZeros = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
     return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, LeadingZeros);
   }
 
   // Compute with: cttz(x) = ctpop(lsb - 1)
 
   // Since we can only compute the number of bits in a byte with vcnt.8, we
   // have to gather the result with pairwise addition (vpaddl) for i16, i32,
   // and i64.
 
   // Compute LSB - 1.
   SDValue BelowLSB;
   if (ElemTy == MVT::i64) {
     // Load constant 0xffff'ffff'ffff'ffff to register and add it, which is
     // the same as subtracting one.
     SDValue AllOnes = MakeVMOV(0x1eff, MVT::i32);
     BelowLSB = DAG.getNode(ISD::ADD, dl, VT, LSB, AllOnes);
   } else {
     SDValue One = MakeVMOV(1, ElemTy);
     BelowLSB = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
   }
 
   const bool Is64Bit = VT.is64BitVector();
 
   // Count #bits with vcnt.8.
   EVT VT8Bit = Is64Bit ? MVT::v8i8 : MVT::v16i8;
   SDValue AsBytes = DAG.getNode(ISD::BITCAST, dl, VT8Bit, BelowLSB);
   SDValue Cnt8 = DAG.getNode(ISD::CTPOP, dl, VT8Bit, AsBytes);
 
   // Gather the #bits with vpaddl (pairwise add.)
   EVT VT16Bit = Is64Bit ? MVT::v4i16 : MVT::v8i16;
   SDValue Cnt16 = PairwiseAddLong(VT16Bit, Cnt8);
   if (ElemTy == MVT::i16)
     return Cnt16;
 
   EVT VT32Bit = Is64Bit ? MVT::v2i32 : MVT::v4i32;
   SDValue Cnt32 = PairwiseAddLong(VT32Bit, Cnt16);
   if (ElemTy == MVT::i32)
     return Cnt32;
 
   assert(ElemTy == MVT::i64);
   return PairwiseAddLong(VT, Cnt32);
 }
 
 /// getCTPOP16BitCounts - Returns a v8i8/v16i8 vector containing the bit-count
 /// for each 16-bit element from operand, repeated.  The basic idea is to
 /// leverage vcnt to get the 8-bit counts, gather and add the results.
 ///
 /// Trace for v4i16:
 /// input         = [v0    v1    v2    v3   ] (vi 16-bit element)
 /// cast: Bytes   = [w0 w1 w2 w3 w4 w5 w6 w7] (v0 = [w0 w1], wi 8-bit element)
 /// vcnt: Counts  = [b0 b1 b2 b3 b4 b5 b6 b7] (bi = bit-count of 8-bit wi)
 /// vrev: Swapped = [b1 b0 b3 b2 b5 b4 b7 b6]
 /// add:  Sums    = [k0 k0 k1 k1 k2 k2 k3 k3] (k0 = b0+b1 = bit-count of
 ///                                            16-bit v0, each ki is 8-bits)
 /// vuzp:         = [k0 k1 k2 k3 k0 k1 k2 k3]
 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   // Work on bytes, in the byte vector type as wide as the result type.
   EVT ByteVT = MVT::v16i8;
   if (VT.is64BitVector())
     ByteVT = MVT::v8i8;
 
   SDValue Bytes = DAG.getNode(ISD::BITCAST, DL, ByteVT, N->getOperand(0));
   SDValue Counts = DAG.getNode(ISD::CTPOP, DL, ByteVT, Bytes);
   // Swap the two bytes of every 16-bit element and add, so that both bytes
   // hold the count for the whole element.
   SDValue Swapped = DAG.getNode(ARMISD::VREV16, DL, ByteVT, Counts);
   SDValue Sums = DAG.getNode(ISD::ADD, DL, ByteVT, Counts, Swapped);
   return DAG.getNode(ARMISD::VUZP, DL, ByteVT, Sums, Sums);
 }
 
 /// lowerCTPOP16BitElements - Returns a v4i16/v8i16 vector containing the
 /// bit-count for each 16-bit element from the operand.  We need slightly
 /// different sequencing for v4i16 and v8i16 to stay within NEON's available
 /// 64/128-bit registers.
 ///
 /// Trace for v4i16:
 /// input           = [v0    v1    v2    v3    ] (vi 16-bit element)
 /// v8i8: BitCounts = [k0 k1 k2 k3 k0 k1 k2 k3 ] (ki is the bit-count of vi)
 /// v8i16:Extended  = [k0    k1    k2    k3    k0    k1    k2    k3    ]
 /// v4i16:Extracted = [k0    k1    k2    k3    ]
 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
 
   if (!VT.is64BitVector()) {
     // 128-bit case: take the low v8i8 half first, then widen it to v8i16.
     SDValue LowHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8,
                                   BitCounts, DAG.getIntPtrConstant(0, DL));
     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, LowHalf);
   }
 
   // 64-bit case: widen the v8i8 counts to v8i16, then keep the low v4i16.
   SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, BitCounts);
   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Widened,
                      DAG.getIntPtrConstant(0, DL));
 }
 
 /// lowerCTPOP32BitElements - Returns a v2i32/v4i32 vector containing the
 /// bit-count for each 32-bit element from the operand.  The idea here is
 /// to split the vector into 16-bit elements, leverage the 16-bit count
 /// routine, and then combine the results.
 ///
 /// Trace for v2i32 (v4i32 similar with the extend/extract steps exchanged):
 /// input          = [v0    v1    ] (vi: 32-bit elements)
 /// Halves         = [w0 w1 w2 w3 ] (wi: 16-bit elements, v0 = [w0 w1])
 /// Counts16       = [k0 k1 k2 k3 ] (ki: 16-bit elements, bit-count of wi)
 /// vrev: Swapped  = [k1 k0 k3 k2 ]
 /// add:  Sums     = [s0 s0 s1 s1 ] (s0 = k0+k1 = bit-count of 32-bit v0)
 /// vuzp: Unzipped = [s0 s1 s0 s1 ]
 /// v4i32: Widened = [s0    s1    s0    s1    ]
 /// v2i32: result  = [s0    s1    ]
 ///
 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
   const bool Is64Bit = VT.is64BitVector();
 
   EVT HalfVT = Is64Bit ? MVT::v4i16 : MVT::v8i16;
 
   // Count bits per 16-bit half, then add the two halves of every 32-bit
   // element together.
   SDValue Halves = DAG.getNode(ISD::BITCAST, DL, HalfVT, N->getOperand(0));
   SDValue Counts16 = lowerCTPOP16BitElements(Halves.getNode(), DAG);
   SDValue Swapped = DAG.getNode(ARMISD::VREV32, DL, HalfVT, Counts16);
   SDValue Sums = DAG.getNode(ISD::ADD, DL, HalfVT, Counts16, Swapped);
   SDValue Unzipped = DAG.getNode(ARMISD::VUZP, DL, HalfVT, Sums, Sums);
 
   if (!Is64Bit) {
     // 128-bit case: take the low v4i16 half first, then widen it to v4i32.
     SDValue LowHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16,
                                   Unzipped, DAG.getIntPtrConstant(0, DL));
     return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, LowHalf);
   }
 
   // 64-bit case: widen to v4i32, then keep the low v2i32.
   SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i32, Unzipped);
   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Widened,
                      DAG.getIntPtrConstant(0, DL));
 }
 
 /// LowerCTPOP - Custom lowering of vector CTPOP for v2i32, v4i32, v4i16 and
 /// v8i16 (asserted), dispatching on the element width.  NEON is required.
 static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
                           const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
 
   assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
   assert((VT == MVT::v2i32 || VT == MVT::v4i32 ||
           VT == MVT::v4i16 || VT == MVT::v8i16) &&
          "Unexpected type for custom ctpop lowering");
 
   const bool Has32BitElts = VT.getVectorElementType() == MVT::i32;
   return Has32BitElts ? lowerCTPOP32BitElements(N, DAG)
                       : lowerCTPOP16BitElements(N, DAG);
 }
 
 /// LowerShift - Lower a vector SHL/SRA/SRL to the NEON vshifts/vshiftu
 /// intrinsics.  Non-vector shifts are not handled here and yield an empty
 /// SDValue.
 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                           const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
 
   if (!VT.isVector())
     return SDValue();
 
   // Lower vector shifts on NEON to use VSHL.
   assert(ST->hasNEON() && "unexpected vector shift");
 
   const unsigned Opcode = N->getOpcode();
 
   // Left shifts translate directly to the vshiftu intrinsic.
   if (Opcode == ISD::SHL) {
     SDValue IntID =
         DAG.getConstant(Intrinsic::arm_neon_vshiftu, dl, MVT::i32);
     return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, IntID,
                        N->getOperand(0), N->getOperand(1));
   }
 
   assert((N->getOpcode() == ISD::SRA ||
           N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
 
   // NEON uses the same intrinsics for both left and right shifts.  For
   // right shifts, the shift amounts are negative, so negate the vector of
   // shift amounts.
   SDValue Amounts = N->getOperand(1);
   EVT AmountVT = Amounts.getValueType();
   SDValue NegAmounts = DAG.getNode(ISD::SUB, dl, AmountVT,
                                    getZeroVector(AmountVT, DAG, dl), Amounts);
 
   // Arithmetic right shifts use the signed variant, logical the unsigned one.
   Intrinsic::ID ShiftIntrinsic = Intrinsic::arm_neon_vshiftu;
   if (Opcode == ISD::SRA)
     ShiftIntrinsic = Intrinsic::arm_neon_vshifts;
 
   SDValue IntID = DAG.getConstant(ShiftIntrinsic, dl, MVT::i32);
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, IntID, N->getOperand(0),
                      NegAmounts);
 }
 
 /// Expand64BitShift - Lower an i64 SRA/SRL by exactly one bit to a
 /// flag-setting shift of the high word followed by an RRX of the low word.
 /// Returns an empty SDValue for every other case (non-i64 result, shift
 /// amount other than the constant 1, or a Thumb1-only subtarget).
 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
                                 const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
 
   // We can get here for a node like i32 = ISD::SHL i32, i64
   if (VT != MVT::i64)
     return SDValue();
 
   assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
          "Unknown shift to lower!");
 
   // We only lower SRA, SRL of 1 here, all others use generic lowering.
   if (!isOneConstant(N->getOperand(1)))
     return SDValue();
 
   // If we are in thumb mode, we don't have RRX.
   if (ST->isThumb1Only())
     return SDValue();
 
   // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
   SDValue Src = N->getOperand(0);
   SDValue SrcLo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
                               DAG.getConstant(0, dl, MVT::i32));
   SDValue SrcHi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
                               DAG.getConstant(1, dl, MVT::i32));
 
   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
   // captures the result into a carry flag.
   const bool IsLogical = N->getOpcode() == ISD::SRL;
   unsigned FlagShiftOpc = IsLogical ? ARMISD::SRL_FLAG : ARMISD::SRA_FLAG;
   SDValue NewHi =
       DAG.getNode(FlagShiftOpc, dl, DAG.getVTList(MVT::i32, MVT::Glue), SrcHi);
 
   // The low part is an ARMISD::RRX operand, which shifts the carry in.
   SDValue NewLo =
       DAG.getNode(ARMISD::RRX, dl, MVT::i32, SrcLo, NewHi.getValue(1));
 
   // Merge the pieces into a single i64 value.
   return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, NewLo, NewHi);
 }
 
 /// LowerVSETCC - Lower a vector SETCC (operands: LHS, RHS, condition code) to
 /// ARMISD vector compare nodes.
 ///
 /// The condition code is mapped to a compare opcode plus two modifiers:
 ///   * Swap   - the two operands are exchanged before the compare is built;
 ///   * Invert - the compare result is bitwise inverted at the end.
 /// The compare is built in the integer vector type matching the operands
 /// (CmpVT) and then sign-extended or truncated to the result type.  Returns
 /// an empty SDValue for i64-element compares other than integer EQ/NE.
 static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
   SDValue TmpOp0, TmpOp1;
   bool Invert = false;
   bool Swap = false;
   unsigned Opc = 0;
 
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDValue CC = Op.getOperand(2);
   // Type the compare itself is built in: same shape as the operands, with
   // integer elements.
   EVT CmpVT = Op0.getValueType().changeVectorElementTypeToInteger();
   EVT VT = Op.getValueType();
   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
   SDLoc dl(Op);
 
   if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
       (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
     // Special-case integer 64-bit equality comparisons. They aren't legal,
     // but they can be lowered with a few vector instructions.
     // Compare as twice as many i32 lanes, then AND every lane with the other
     // lane of its 64-bit pair (VREV64) so that an i64 element is "equal" only
     // if both of its halves are.
     unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
     EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
     SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
     SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
     SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
                               DAG.getCondCode(ISD::SETEQ));
     SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
     SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
     Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
     // SETNE is the bitwise inverse of the equality mask.
     if (SetCCOpcode == ISD::SETNE)
       Merged = DAG.getNOT(dl, Merged, CmpVT);
     Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
     return Merged;
   }
 
   if (CmpVT.getVectorElementType() == MVT::i64)
     // 64-bit comparisons are not legal in general.
     return SDValue();
 
   if (Op1.getValueType().isFloatingPoint()) {
     // Floating-point comparisons.  Only VCEQ/VCGT/VCGE are used; "less than"
     // forms set Swap and fall through to the matching "greater than" form,
     // and the unordered forms (SETU*) are built as the inverse of the
     // opposite ordered compare.
     switch (SetCCOpcode) {
     default: llvm_unreachable("Illegal FP comparison");
     case ISD::SETUNE:
     case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
     case ISD::SETOEQ:
     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
     case ISD::SETOLT:
     case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
     case ISD::SETOGT:
     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
     case ISD::SETOLE:
     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
     case ISD::SETOGE:
     case ISD::SETGE: Opc = ARMISD::VCGE; break;
     case ISD::SETUGE: Swap = true; LLVM_FALLTHROUGH;
     case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
     case ISD::SETUGT: Swap = true; LLVM_FALLTHROUGH;
     case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
     case ISD::SETUEQ: Invert = true; LLVM_FALLTHROUGH;
     case ISD::SETONE:
       // Expand this to (OLT | OGT).
       // Op0/Op1 are replaced by the two partial compares; the final node
       // built from Opc is then the OR of them.
       TmpOp0 = Op0;
       TmpOp1 = Op1;
       Opc = ISD::OR;
       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
       Op1 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp0, TmpOp1);
       break;
     case ISD::SETUO:
       Invert = true;
       LLVM_FALLTHROUGH;
     case ISD::SETO:
       // Expand this to (OLT | OGE).
       // Same scheme as above: Op0/Op1 become the two partial compares.
       TmpOp0 = Op0;
       TmpOp1 = Op1;
       Opc = ISD::OR;
       Op0 = DAG.getNode(ARMISD::VCGT, dl, CmpVT, TmpOp1, TmpOp0);
       Op1 = DAG.getNode(ARMISD::VCGE, dl, CmpVT, TmpOp0, TmpOp1);
       break;
     }
   } else {
     // Integer comparisons.
     // "Less than" forms set Swap and fall through to the matching "greater
     // than" form; SETNE is an inverted VCEQ.
     switch (SetCCOpcode) {
     default: llvm_unreachable("Illegal integer comparison");
     case ISD::SETNE:  Invert = true; LLVM_FALLTHROUGH;
     case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
     case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
     case ISD::SETGT:  Opc = ARMISD::VCGT; break;
     case ISD::SETLE:  Swap = true; LLVM_FALLTHROUGH;
     case ISD::SETGE:  Opc = ARMISD::VCGE; break;
     case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
     case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
     case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
     case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
     }
 
     // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
     if (Opc == ARMISD::VCEQ) {
       // Find the operand that is compared against an all-zeros build vector.
       SDValue AndOp;
       if (ISD::isBuildVectorAllZeros(Op1.getNode()))
         AndOp = Op0;
       else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
         AndOp = Op1;
 
       // Ignore bitconvert.
       if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
         AndOp = AndOp.getOperand(0);
 
       if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
         Opc = ARMISD::VTST;
         Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
         Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
         // VTST is the "ne" form, so the sense of the VCEQ-based mapping above
         // flips.
         Invert = !Invert;
       }
     }
   }
 
   if (Swap)
     std::swap(Op0, Op1);
 
   // If one of the operands is a constant vector zero, attempt to fold the
   // comparison to a specialized compare-against-zero form.
   SDValue SingleOp;
   if (ISD::isBuildVectorAllZeros(Op1.getNode()))
     SingleOp = Op0;
   else if (ISD::isBuildVectorAllZeros(Op0.getNode())) {
     // Zero is on the left: (0 >= x) becomes (x <= 0) and (0 > x) becomes
     // (x < 0).
     if (Opc == ARMISD::VCGE)
       Opc = ARMISD::VCLEZ;
     else if (Opc == ARMISD::VCGT)
       Opc = ARMISD::VCLTZ;
     SingleOp = Op1;
   }
 
   SDValue Result;
   if (SingleOp.getNode()) {
     switch (Opc) {
     case ARMISD::VCEQ:
       Result = DAG.getNode(ARMISD::VCEQZ, dl, CmpVT, SingleOp); break;
     case ARMISD::VCGE:
       Result = DAG.getNode(ARMISD::VCGEZ, dl, CmpVT, SingleOp); break;
     case ARMISD::VCLEZ:
       Result = DAG.getNode(ARMISD::VCLEZ, dl, CmpVT, SingleOp); break;
     case ARMISD::VCGT:
       Result = DAG.getNode(ARMISD::VCGTZ, dl, CmpVT, SingleOp); break;
     case ARMISD::VCLTZ:
       Result = DAG.getNode(ARMISD::VCLTZ, dl, CmpVT, SingleOp); break;
     default:
       // No compare-against-zero form is used for this opcode; emit the
       // ordinary two-operand node.
       Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
     }
   } else {
      Result = DAG.getNode(Opc, dl, CmpVT, Op0, Op1);
   }
 
   // Convert from the compare type to the requested result type.
   Result = DAG.getSExtOrTrunc(Result, dl, VT);
 
   if (Invert)
     Result = DAG.getNOT(dl, Result, VT);
 
   return Result;
 }
 
 /// LowerSETCCCARRY - Lower SETCCCARRY (operands: LHS, RHS, incoming carry,
 /// condition code) to an ARMISD::SUBE whose flags result feeds a CMOV that
 /// selects between the constants 0 and 1.
 static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   SDValue BorrowIn = Op.getOperand(2);
   SDValue Cond = Op.getOperand(3);
 
   assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
 
   // ARMISD::SUBE expects a carry not a borrow like ISD::SUBCARRY so we
   // have to invert the carry first.
   SDValue One = DAG.getConstant(1, DL, MVT::i32);
   SDValue CarryBool = DAG.getNode(ISD::SUB, DL, MVT::i32, One, BorrowIn);
   // This converts the boolean value carry into the carry flag.
   SDValue CarryFlag = ConvertBooleanCarryToCarryFlag(CarryBool, DAG);
 
   // Subtract with carry; result 1 of the node carries the flags.
   SDVTList SubVTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
   SDValue Sub = DAG.getNode(ARMISD::SUBE, DL, SubVTs, LHS, RHS, CarryFlag);
 
   SDValue FalseVal = DAG.getConstant(0, DL, MVT::i32);
   SDValue TrueVal = DAG.getConstant(1, DL, MVT::i32);
   ISD::CondCode CC = cast<CondCodeSDNode>(Cond)->get();
   SDValue ARMcc = DAG.getConstant(IntCCToARMCC(CC), DL, MVT::i32);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
 
   // Copy the flags into CPSR and feed the glue of that copy to the CMOV.
   SDValue CopyToCPSR = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM::CPSR,
                                         Sub.getValue(1), SDValue());
   return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FalseVal, TrueVal,
                      ARMcc, CCR, CopyToCPSR.getValue(1));
 }
 
 /// isNEONModifiedImm - Check if the specified splat value corresponds to a
 /// valid vector constant for a NEON instruction with a "modified immediate"
 /// operand (e.g., VMOV).  If so, return the encoded value.
 ///
 /// \param SplatBits     The splatted element value.
 /// \param SplatUndef    Mask of bits of the splat value that are undefined;
 ///                      such bits may be treated as set where that helps the
 ///                      0xff-filled encodings match.
 /// \param SplatBitSize  Width of the splat element in bits: 8, 16, 32 or 64.
 ///                      Any other value is unreachable.
 /// \param VT            [out] Vector type the immediate is to be created in.
 ///                      Note that for 16- and 32-bit splats it is written even
 ///                      when no encoding is found.
 /// \param is128Bits     Selects the 128-bit (true) or 64-bit (false) vector
 ///                      type for \p VT.
 /// \param type          Kind of instruction the immediate is for.  8-bit and
 ///                      64-bit splats are only accepted for VMOVModImm, and
 ///                      the 0xnnff / 0xnnffff forms are rejected for
 ///                      OtherModImm.
 /// \returns An i32 target constant holding the encoding produced by
 ///          ARM_AM::createNEONModImm, or an empty SDValue if the splat cannot
 ///          be encoded.
 static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
                                  unsigned SplatBitSize, SelectionDAG &DAG,
                                  const SDLoc &dl, EVT &VT, bool is128Bits,
                                  NEONModImmType type) {
   unsigned OpCmode, Imm;
 
   // SplatBitSize is set to the smallest size that splats the vector, so a
   // zero vector will always have SplatBitSize == 8.  However, NEON modified
   // immediate instructions others than VMOV do not support the 8-bit encoding
   // of a zero vector, and the default encoding of zero is supposed to be the
   // 32-bit version.
   if (SplatBits == 0)
     SplatBitSize = 32;
 
   switch (SplatBitSize) {
   case 8:
     if (type != VMOVModImm)
       return SDValue();
     // Any 1-byte value is OK.  Op=0, Cmode=1110.
     assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
     OpCmode = 0xe;
     Imm = SplatBits;
     VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
     break;
 
   case 16:
     // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
     VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
     if ((SplatBits & ~0xff) == 0) {
       // Value = 0x00nn: Op=x, Cmode=100x.
       OpCmode = 0x8;
       Imm = SplatBits;
       break;
     }
     if ((SplatBits & ~0xff00) == 0) {
       // Value = 0xnn00: Op=x, Cmode=101x.
       OpCmode = 0xa;
       Imm = SplatBits >> 8;
       break;
     }
     return SDValue();
 
   case 32:
     // NEON's 32-bit VMOV supports splat values where:
     // * only one byte is nonzero, or
     // * the least significant byte is 0xff and the second byte is nonzero, or
     // * the least significant 2 bytes are 0xff and the third is nonzero.
     VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
     if ((SplatBits & ~0xff) == 0) {
       // Value = 0x000000nn: Op=x, Cmode=000x.
       OpCmode = 0;
       Imm = SplatBits;
       break;
     }
     if ((SplatBits & ~0xff00) == 0) {
       // Value = 0x0000nn00: Op=x, Cmode=001x.
       OpCmode = 0x2;
       Imm = SplatBits >> 8;
       break;
     }
     if ((SplatBits & ~0xff0000) == 0) {
       // Value = 0x00nn0000: Op=x, Cmode=010x.
       OpCmode = 0x4;
       Imm = SplatBits >> 16;
       break;
     }
     if ((SplatBits & ~0xff000000) == 0) {
       // Value = 0xnn000000: Op=x, Cmode=011x.
       OpCmode = 0x6;
       Imm = SplatBits >> 24;
       break;
     }
 
     // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
     if (type == OtherModImm) return SDValue();
 
     // For the two 0xff-filled forms, undefined low bits count as set.
     if ((SplatBits & ~0xffff) == 0 &&
         ((SplatBits | SplatUndef) & 0xff) == 0xff) {
       // Value = 0x0000nnff: Op=x, Cmode=1100.
       OpCmode = 0xc;
       Imm = SplatBits >> 8;
       break;
     }
 
     if ((SplatBits & ~0xffffff) == 0 &&
         ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
       // Value = 0x00nnffff: Op=x, Cmode=1101.
       OpCmode = 0xd;
       Imm = SplatBits >> 16;
       break;
     }
 
     // Note: there are a few 32-bit splat values (specifically: 00ffff00,
     // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
     // VMOV.I32.  A (very) minor optimization would be to replicate the value
     // and fall through here to test for a valid 64-bit splat.  But, then the
     // caller would also need to check and handle the change in size.
     return SDValue();
 
   case 64: {
     if (type != VMOVModImm)
       return SDValue();
     // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
     // Walk the eight bytes from least to most significant, setting one bit of
     // Imm per byte that is (or, thanks to undef bits, may be taken as) 0xff.
     uint64_t BitMask = 0xff;
     unsigned ImmMask = 1;
     Imm = 0;
     for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
       if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
         Imm |= ImmMask;
       } else if ((SplatBits & BitMask) != 0) {
         // A byte that is neither all-ones nor all-zeros cannot be encoded.
         return SDValue();
       }
       BitMask <<= 8;
       ImmMask <<= 1;
     }
 
     if (DAG.getDataLayout().isBigEndian())
       // swap higher and lower 32 bit word
       Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
 
     // Op=1, Cmode=1110.
     OpCmode = 0x1e;
     VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
     break;
   }
 
   default:
     llvm_unreachable("unexpected size for isNEONModifiedImm");
   }
 
   unsigned EncodedVal = ARM_AM::createNEONModImm(OpCmode, Imm);
   return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
 }
 
 /// LowerConstantFP - Custom lowering of an f32/f64 ConstantFP node.
 ///
 /// In execute-only mode the constant is either left alone (when it is a legal
 /// FP immediate) or built from integer constants and moved into an FP
 /// register.  Otherwise the constant is kept as is when it has a VFP
 /// immediate encoding, or materialised through a NEON VMOV.i32 / VMVN.i32
 /// vector immediate when one matches.  An empty SDValue is returned when none
 /// of these apply.
 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                                            const ARMSubtarget *ST) const {
   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
   const APFloat &FPVal = CFP->getValueAPF();
   EVT VT = Op.getValueType();
   const bool IsDouble = VT == MVT::f64;
 
   // Prevent floating-point constants from using literal loads
   // when execute-only is enabled.
   if (ST->genExecuteOnly()) {
     // If we can represent the constant as an immediate, don't lower it
     if (isFPImmLegal(FPVal, VT))
       return Op;
     // Otherwise, construct as integer, and move to float register
     APInt Bits = FPVal.bitcastToAPInt();
     SDLoc DL(CFP);
     switch (VT.getSimpleVT().SimpleTy) {
     default:
       llvm_unreachable("Unknown floating point type!");
       break;
     case MVT::f64: {
       // Split into two 32-bit words; their order depends on endianness.
       SDValue Lo = DAG.getConstant(Bits.trunc(32), DL, MVT::i32);
       SDValue Hi = DAG.getConstant(Bits.lshr(32).trunc(32), DL, MVT::i32);
       if (!ST->isLittle())
         std::swap(Lo, Hi);
       return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
     }
     case MVT::f32:
       return DAG.getNode(ARMISD::VMOVSR, DL, VT,
                          DAG.getConstant(Bits, DL, MVT::i32));
     }
   }
 
   if (!ST->hasVFP3())
     return SDValue();
 
   // Use the default (constant pool) lowering for double constants when we have
   // an SP-only FPU
   if (IsDouble && Subtarget->isFPOnlySP())
     return SDValue();
 
   // Try splatting with a VMOV.f32...
   int ImmVal;
   if (IsDouble)
     ImmVal = ARM_AM::getFP64Imm(FPVal);
   else
     ImmVal = ARM_AM::getFP32Imm(FPVal);
 
   if (ImmVal != -1) {
     // We have code in place to select a valid ConstantFP already, no need to
     // do any mangling.
     if (IsDouble || !ST->useNEONForSinglePrecisionFP())
       return Op;
 
     // It's a float and we are trying to use NEON operations where
     // possible. Lower it to a splat followed by an extract.
     SDLoc DL(Op);
     SDValue EncodedImm = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
     SDValue Splat =
         DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32, EncodedImm);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Splat,
                        DAG.getConstant(0, DL, MVT::i32));
   }
 
   // The rest of our options are NEON only, make sure that's allowed before
   // proceeding..
   if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
     return SDValue();
 
   EVT VMovVT;
   uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
 
   // It wouldn't really be worth bothering for doubles except for one very
   // important value, which does happen to match: 0.0. So make sure we don't do
   // anything stupid.
   if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
     return SDValue();
 
   // Turns an encoded vector immediate into the scalar FP result: build the
   // vector constant with VecOpc in VMovVT, then either bitcast the whole
   // vector to f64 or bitcast to v2f32 and extract element 0.
   auto ScalarFromVecImm = [&](unsigned VecOpc, SDValue EncodedImm) -> SDValue {
     SDLoc DL(Op);
     SDValue VecConstant = DAG.getNode(VecOpc, DL, VMovVT, EncodedImm);
     if (IsDouble)
       return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
 
     // It's a float: cast and extract a vector element.
     SDValue VecFConstant =
         DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, VecConstant);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
                        DAG.getConstant(0, DL, MVT::i32));
   };
 
   // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
   if (SDValue MovImm = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG,
                                          SDLoc(Op), VMovVT, false, VMOVModImm))
     return ScalarFromVecImm(ARMISD::VMOVIMM, MovImm);
 
   // Finally, try a VMVN.i32
   if (SDValue MvnImm = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG,
                                          SDLoc(Op), VMovVT, false, VMVNModImm))
     return ScalarFromVecImm(ARMISD::VMVNIMM, MvnImm);
 
   return SDValue();
 }
 
 // Check whether a VEXT instruction can implement the shuffle mask when both
 // vector sources of the shuffle are the same vector.  Imm receives the first
 // mask index whenever that index is defined.
 static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
   const unsigned LaneCount = VT.getVectorNumElements();
 
   // The rotation amount is read from lane 0, so an UNDEF there is a failure.
   const int FirstIdx = M[0];
   if (FirstIdx < 0)
     return false;
 
   Imm = FirstIdx;
 
   // Every later defined lane must continue the sequence started by lane 0,
   // restarting from index zero once the end of the vector is reached.
   unsigned Want = Imm;
   for (unsigned Lane = 1; Lane < LaneCount; ++Lane) {
     Want = (Want + 1 == LaneCount) ? 0 : Want + 1;
 
     const int Idx = M[Lane];
     if (Idx < 0)
       continue; // UNDEF lanes match anything.
     if (static_cast<unsigned>(Idx) != Want)
       return false;
   }
 
   return true;
 }
 
 // Check whether a two-source VEXT can implement the shuffle mask.  Imm
 // receives the index of the first extracted element.  ReverseVEXT is set when
 // the expected index sequence wraps past the end of the two concatenated
 // sources; in that case a successful match rebases Imm by the vector length.
 static bool isVEXTMask(ArrayRef<int> M, EVT VT,
                        bool &ReverseVEXT, unsigned &Imm) {
   const unsigned LaneCount = VT.getVectorNumElements();
   ReverseVEXT = false;
 
   // The extraction index is read from lane 0, so an UNDEF there is a failure.
   const int FirstIdx = M[0];
   if (FirstIdx < 0)
     return false;
 
   Imm = FirstIdx;
 
   // Each later defined lane must hold the successor of the previous expected
   // index.  Wrapping around is still a VEXT, but with the sources swapped.
   unsigned Want = Imm;
   for (unsigned Lane = 1; Lane < LaneCount; ++Lane) {
     if (++Want == LaneCount * 2) {
       Want = 0;
       ReverseVEXT = true;
     }
 
     const int Idx = M[Lane];
     if (Idx >= 0 && static_cast<unsigned>(Idx) != Want)
       return false;
   }
 
   // With swapped sources the index is relative to the other operand.
   if (ReverseVEXT)
     Imm -= LaneCount;
 
   return true;
 }
 
 /// isVREVMask - Return true when the shuffle mask describes a VREV with the
 /// given block size in bits, i.e. the elements inside every BlockSize-bit
 /// block of the vector appear in reversed order.
 static bool isVREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
   assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
          "Only possible block sizes for VREV are: 16, 32, 64");
 
   const unsigned EltBits = VT.getScalarSizeInBits();
   if (EltBits == 64)
     return false;
 
   const unsigned LaneCount = VT.getVectorNumElements();
 
   // The first index names the last element of the first block, which fixes
   // the block length.  If it is UNDEF, optimistically assume the requested
   // block size.
   const unsigned EltsPerBlock = (M[0] < 0) ? BlockSize / EltBits : M[0] + 1;
 
   if (BlockSize <= EltBits || BlockSize != EltsPerBlock * EltBits)
     return false;
 
   for (unsigned Lane = 0; Lane < LaneCount; ++Lane) {
     const int Idx = M[Lane];
     if (Idx < 0)
       continue; // UNDEF lanes match anything.
 
     // Mirror the lane's position inside its block.
     const unsigned Offset = Lane % EltsPerBlock;
     const unsigned BlockStart = Lane - Offset;
     const unsigned Mirrored = BlockStart + (EltsPerBlock - 1 - Offset);
     if (static_cast<unsigned>(Idx) != Mirrored)
       return false;
   }
 
   return true;
 }
 
 // Return true when the shuffle can be done with VTBL.  Only <8 x i8> shuffles
 // are handled.  An out-of-range mask index makes the instruction write 0 to
 // that lane, so any 8-element mask is acceptable.
 static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
   if (VT != MVT::v8i8)
     return false;
   return M.size() == 8;
 }
 
 // Pick which of the two results of a paired shuffle a mask segment refers to.
 // For a double-length mask the segment position decides (Index / Elements);
 // otherwise the result is 0 when Mask[Index] is 0 and 1 for anything else.
 static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
                                unsigned Index) {
   const bool IsDoubleLength = Mask.size() == Elements * 2;
   if (!IsDoubleLength)
     return Mask[Index] == 0 ? 0 : 1;
   return Index / Elements;
 }
 
 // Checks whether the shuffle mask represents a vector transpose (VTRN): each
 // pair of mask entries must name the same lane in the first and in the second
 // source vector, and the lane advances by 2 from one pair to the next.
 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
 //  v2={e,f,g,h}
 // WhichResult is the offset added to every expected index, depending on which
 // of the two transpose results the mask describes.
 //
 // Two mask layouts are accepted:
 //   result1 = shufflevector v1, v2, result1_shuffle_mask
 //   result2 = shufflevector v1, v2, result2_shuffle_mask
 // where the masks are as long as v1/v2 and WhichResult reports which result
 // matched, or
 //   results = shufflevector v1, v2, shuffle_mask
 // where both results live in one vector and the mask is twice as long as
 // v1/v2.  In that case the low and high halves of the mask are checked as if
 // they were the two separate masks, and WhichResult is always 0 on success.
 static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   if (VT.getScalarSizeInBits() == 64)
     return false;
 
   const unsigned LaneCount = VT.getVectorNumElements();
   const bool IsDoubleLength = M.size() == LaneCount * 2;
   if (M.size() != LaneCount && !IsDoubleLength)
     return false;
 
   // Walk the mask one source-length segment at a time, each with its own
   // value of WhichResult.
   // FIXME: A mask with only even values will be rejected in case the first
   // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
   // M[0] is used to determine WhichResult
   for (unsigned Base = 0; Base < M.size(); Base += LaneCount) {
     WhichResult = SelectPairHalf(LaneCount, M, Base);
     for (unsigned Pair = 0; Pair < LaneCount; Pair += 2) {
       const int FromFirst = M[Base + Pair];
       if (FromFirst >= 0 &&
           static_cast<unsigned>(FromFirst) != Pair + WhichResult)
         return false;
 
       const int FromSecond = M[Base + Pair + 1];
       if (FromSecond >= 0 &&
           static_cast<unsigned>(FromSecond) != Pair + LaneCount + WhichResult)
         return false;
     }
   }
 
   if (IsDoubleLength)
     WhichResult = 0;
 
   return true;
 }
 
 /// isVTRN_v_undef_Mask - Variant of isVTRNMask for the canonical form of
 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef", where both entries
 /// of a pair refer to the same source.
 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
 static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
   if (VT.getScalarSizeInBits() == 64)
     return false;
 
   const unsigned LaneCount = VT.getVectorNumElements();
   const bool IsDoubleLength = M.size() == LaneCount * 2;
   if (M.size() != LaneCount && !IsDoubleLength)
     return false;
 
   for (unsigned Base = 0; Base < M.size(); Base += LaneCount) {
     WhichResult = SelectPairHalf(LaneCount, M, Base);
     for (unsigned Pair = 0; Pair < LaneCount; Pair += 2) {
       // Both entries of the pair must name the same lane (or be UNDEF).
       const unsigned Want = Pair + WhichResult;
 
       const int First = M[Base + Pair];
       if (First >= 0 && static_cast<unsigned>(First) != Want)
         return false;
 
       const int Second = M[Base + Pair + 1];
       if (Second >= 0 && static_cast<unsigned>(Second) != Want)
         return false;
     }
   }
 
   if (IsDoubleLength)
     WhichResult = 0;
 
   return true;
 }
 
 // Checks whether the shuffle mask represents a vector unzip (VUZP): the mask
 // entries must be either all even or all odd, advancing in steps of 2.
 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
 //  v2={e,f,g,h}
 // The single-length and double-length mask layouts are handled in the same
 // way as in isVTRNMask.
 static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   const unsigned EltBits = VT.getScalarSizeInBits();
   if (EltBits == 64)
     return false;
 
   const unsigned LaneCount = VT.getVectorNumElements();
   const bool IsDoubleLength = M.size() == LaneCount * 2;
   if (M.size() != LaneCount && !IsDoubleLength)
     return false;
 
   for (unsigned Base = 0; Base < M.size(); Base += LaneCount) {
     WhichResult = SelectPairHalf(LaneCount, M, Base);
     for (unsigned Lane = 0; Lane < LaneCount; ++Lane) {
       const int Idx = M[Base + Lane];
       if (Idx < 0)
         continue; // UNDEF lanes match anything.
       if (static_cast<unsigned>(Idx) != 2 * Lane + WhichResult)
         return false;
     }
   }
 
   if (IsDoubleLength)
     WhichResult = 0;
 
   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
   return !IsVTRNAlias;
 }
 
 /// isVUZP_v_undef_Mask - Variant of isVUZPMask for the canonical form of
 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".  Each half of a
 /// mask segment must repeat the same even (or odd) index sequence.
 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
 static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
   const unsigned EltBits = VT.getScalarSizeInBits();
   if (EltBits == 64)
     return false;
 
   const unsigned LaneCount = VT.getVectorNumElements();
   const bool IsDoubleLength = M.size() == LaneCount * 2;
   if (M.size() != LaneCount && !IsDoubleLength)
     return false;
 
   const unsigned HalfLanes = LaneCount / 2;
   for (unsigned Base = 0; Base < M.size(); Base += LaneCount) {
     WhichResult = SelectPairHalf(LaneCount, M, Base);
     for (unsigned HalfBase = 0; HalfBase < LaneCount; HalfBase += HalfLanes) {
       // Within each half the expected index restarts at WhichResult and then
       // advances by 2 per lane.
       unsigned Want = WhichResult;
       for (unsigned Lane = 0; Lane < HalfLanes; ++Lane, Want += 2) {
         const int Idx = M[Base + HalfBase + Lane];
         if (Idx >= 0 && static_cast<unsigned>(Idx) != Want)
           return false;
       }
     }
   }
 
   if (IsDoubleLength)
     WhichResult = 0;
 
   // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
   return !IsVTRNAlias;
 }
 
 // Checks whether the shuffle mask represents a vector zip (VZIP): each pair
 // of mask entries must name the same lane in the first and in the second
 // source vector, with the lane advancing by 1 from one pair to the next.
 // e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
 //  v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
 //  v2={e,f,g,h}
 // The single-length and double-length mask layouts are handled in the same
 // way as in isVTRNMask.
 static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
   const unsigned EltBits = VT.getScalarSizeInBits();
   if (EltBits == 64)
     return false;
 
   const unsigned LaneCount = VT.getVectorNumElements();
   const bool IsDoubleLength = M.size() == LaneCount * 2;
   if (M.size() != LaneCount && !IsDoubleLength)
     return false;
 
   for (unsigned Base = 0; Base < M.size(); Base += LaneCount) {
     WhichResult = SelectPairHalf(LaneCount, M, Base);
     // The second result starts zipping from the middle of the sources.
     unsigned Want = WhichResult * LaneCount / 2;
     for (unsigned Pair = 0; Pair < LaneCount; Pair += 2, ++Want) {
       const int FromFirst = M[Base + Pair];
       if (FromFirst >= 0 && static_cast<unsigned>(FromFirst) != Want)
         return false;
 
       const int FromSecond = M[Base + Pair + 1];
       if (FromSecond >= 0 &&
           static_cast<unsigned>(FromSecond) != Want + LaneCount)
         return false;
     }
   }
 
   if (IsDoubleLength)
     WhichResult = 0;
 
   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
   return !IsVTRNAlias;
 }
 
 /// isVZIP_v_undef_Mask - Variant of isVZIPMask for the canonical form of
 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef", where both entries
 /// of a pair refer to the same source.
 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
 static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
   const unsigned EltBits = VT.getScalarSizeInBits();
   if (EltBits == 64)
     return false;
 
   const unsigned LaneCount = VT.getVectorNumElements();
   const bool IsDoubleLength = M.size() == LaneCount * 2;
   if (M.size() != LaneCount && !IsDoubleLength)
     return false;
 
   for (unsigned Base = 0; Base < M.size(); Base += LaneCount) {
     WhichResult = SelectPairHalf(LaneCount, M, Base);
     // The second result starts zipping from the middle of the source.
     unsigned Want = WhichResult * LaneCount / 2;
     for (unsigned Pair = 0; Pair < LaneCount; Pair += 2, ++Want) {
       const int First = M[Base + Pair];
       if (First >= 0 && static_cast<unsigned>(First) != Want)
         return false;
 
       const int Second = M[Base + Pair + 1];
       if (Second >= 0 && static_cast<unsigned>(Second) != Want)
         return false;
     }
   }
 
   if (IsDoubleLength)
     WhichResult = 0;
 
   // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
   const bool IsVTRNAlias = VT.is64BitVector() && EltBits == 32;
   return !IsVTRNAlias;
 }
 
 /// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
 /// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
 /// The two-source forms are tried first, in the order VTRN, VUZP, VZIP, with
 /// \p isV_UNDEF set to false; then the "v, undef" forms are tried in the same
 /// order with \p isV_UNDEF set to true.  \p WhichResult is filled in by the
 /// individual mask checks.
 static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
                                            unsigned &WhichResult,
                                            bool &isV_UNDEF) {
   unsigned Opcode = 0;
 
   // Regular two-source forms.
   isV_UNDEF = false;
   if (isVTRNMask(ShuffleMask, VT, WhichResult))
     Opcode = ARMISD::VTRN;
   else if (isVUZPMask(ShuffleMask, VT, WhichResult))
     Opcode = ARMISD::VUZP;
   else if (isVZIPMask(ShuffleMask, VT, WhichResult))
     Opcode = ARMISD::VZIP;
   if (Opcode != 0)
     return Opcode;
 
   // Canonical single-source ("v, undef") forms.
   isV_UNDEF = true;
   if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
     Opcode = ARMISD::VTRN;
   else if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
     Opcode = ARMISD::VUZP;
   else if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
     Opcode = ARMISD::VZIP;
 
   return Opcode;
 }
 
 /// \return true if the mask reverses the elements of a vector.  UNDEF lanes
 /// are accepted in any position.
 static bool isReverseMask(ArrayRef<int> M, EVT VT) {
   const unsigned LaneCount = VT.getVectorNumElements();
 
   // The mask must cover exactly one vector.
   if (M.size() != LaneCount)
     return false;
 
   // Look for <15, ..., 3, -1, 1, 0>.
   for (unsigned Lane = 0; Lane < LaneCount; ++Lane) {
     const int Idx = M[Lane];
     if (Idx < 0)
       continue;
     if (Idx != (int) (LaneCount - 1 - Lane))
       return false;
   }
 
   return true;
 }
 
 // If N is an integer constant that can be moved into a register in one
 // instruction, return an i32 constant SDValue holding it (will become a MOV
 // instruction).  Otherwise return null.
 //
 // For Thumb1-only subtargets the value or its bitwise complement must be at
 // most 255; otherwise the value or its complement must be accepted by
 // ARM_AM::getSOImmVal.
 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
                                      const ARMSubtarget *ST, const SDLoc &dl) {
   if (!isa<ConstantSDNode>(N))
     return SDValue();
 
   const uint64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
 
   bool Encodable;
   if (ST->isThumb1Only())
     Encodable = Imm <= 255 || ~Imm <= 255;
   else
     Encodable = ARM_AM::getSOImmVal(Imm) != -1 ||
                 ARM_AM::getSOImmVal(~Imm) != -1;
 
   if (!Encodable)
     return SDValue();
   return DAG.getConstant(Imm, dl, MVT::i32);
 }
 
 // Custom lowering for a BUILD_VECTOR node.
 //
 // Op is the BUILD_VECTOR to lower, DAG is the graph new nodes are created in,
 // and ST is the subtarget (forwarded to IsSingleInstrConstant and to the
 // recursive calls).  The strategies below are tried in order:
 //   1. constant splat  -> VMOVIMM / VMVNIMM / VMOVFPIMM
 //   2. no defined operand -> UNDEF; only lane 0 defined -> SCALAR_TO_VECTOR
 //   3. a dominant value with <= 32-bit elements -> VDUP / VDUPLANE plus
 //      INSERT_VECTOR_ELT for the lanes that differ
 //   4. ReconstructShuffle for vectors with at least four elements
 //   5. splitting a 128-bit vector into two 64-bit halves
 //   6. ARMISD::BUILD_VECTOR for elements of at least 32 bits
 //   7. a chain of INSERT_VECTOR_ELT
 //
 // If this is a case we can't handle, return null and let the default
 // expansion code take care of it.
 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                              const ARMSubtarget *ST) const {
   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
 
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
   if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
     // Every bit of the splat is undefined: the whole vector is UNDEF.
     if (SplatUndef.isAllOnesValue())
       return DAG.getUNDEF(VT);
 
     if (SplatBitSize <= 64) {
       // Check if an immediate VMOV works.
       EVT VmovVT;
       SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
                                       SplatUndef.getZExtValue(), SplatBitSize,
                                       DAG, dl, VmovVT, VT.is128BitVector(),
                                       VMOVModImm);
       if (Val.getNode()) {
         SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
       }
 
       // Try an immediate VMVN.
       uint64_t NegatedImm = (~SplatBits).getZExtValue();
       Val = isNEONModifiedImm(NegatedImm,
                                       SplatUndef.getZExtValue(), SplatBitSize,
                                       DAG, dl, VmovVT, VT.is128BitVector(),
                                       VMVNModImm);
       if (Val.getNode()) {
         SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
         return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
       }
 
       // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
       if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
         int ImmVal = ARM_AM::getFP32Imm(SplatBits);
         if (ImmVal != -1) {
           SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
           return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
         }
       }
     }
   }
 
   // Scan through the operands to see if only one value is used.
   //
   // As an optimisation, even if more than one value is used it may be more
   // profitable to splat with one value then change some lanes.
   //
   // Heuristically we decide to do this if the vector has a "dominant" value,
   // defined as splatted to more than half of the lanes.
   unsigned NumElts = VT.getVectorNumElements();
   bool isOnlyLowElement = true;
   bool usesOnlyOneValue = true;
   bool hasDominantValue = false;
   bool isConstant = true;
 
   // Map of the number of times a particular SDValue appears in the
   // element list.
   DenseMap<SDValue, unsigned> ValueCounts;
   SDValue Value;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
     // UNDEF operands are ignored by all of the statistics gathered here.
     if (V.isUndef())
       continue;
     if (i > 0)
       isOnlyLowElement = false;
     if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
       isConstant = false;
 
     // insert() leaves an existing entry untouched, so the count only starts
     // at 0 the first time V is seen.
     ValueCounts.insert(std::make_pair(V, 0));
     unsigned &Count = ValueCounts[V];
 
     // Is this value dominant? (takes up more than half of the lanes)
     if (++Count > (NumElts / 2)) {
       hasDominantValue = true;
       Value = V;
     }
   }
   if (ValueCounts.size() != 1)
     usesOnlyOneValue = false;
   // Without a dominant value, use whichever entry the map iterates first as
   // the representative value.
   if (!Value.getNode() && !ValueCounts.empty())
     Value = ValueCounts.begin()->first;
 
   // Every operand was UNDEF.
   if (ValueCounts.empty())
     return DAG.getUNDEF(VT);
 
   // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
   // Keep going if we are hitting this case.
   if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
 
   unsigned EltSize = VT.getScalarSizeInBits();
 
   // Use VDUP for non-constant splats.  For f32 constant splats, reduce to
   // i32 and try again.
   if (hasDominantValue && EltSize <= 32) {
     if (!isConstant) {
       SDValue N;
 
       // If we are VDUPing a value that comes directly from a vector, that will
       // cause an unnecessary move to and from a GPR, where instead we could
       // just use VDUPLANE. We can only do this if the lane being extracted
       // is at a constant index, as the VDUP from lane instructions only have
       // constant-index forms.
       ConstantSDNode *constIndex;
       if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
           (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
         // We need to create a new undef vector to use for the VDUPLANE if the
         // size of the vector from which we get the value is different than the
         // size of the vector that we need to create. We will insert the element
         // such that the register coalescer will remove unnecessary copies.
         if (VT != Value->getOperand(0).getValueType()) {
           // Reduce the source lane number modulo the result's lane count so
           // that it is a valid lane of VT.
           unsigned index = constIndex->getAPIntValue().getLimitedValue() %
                              VT.getVectorNumElements();
           N =  DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
                         Value, DAG.getConstant(index, dl, MVT::i32)),
                            DAG.getConstant(index, dl, MVT::i32));
         } else
           N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
                         Value->getOperand(0), Value->getOperand(1));
       } else
         N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
 
       if (!usesOnlyOneValue) {
         // The dominant value was splatted as 'N', but we now have to insert
         // all differing elements.
         for (unsigned I = 0; I < NumElts; ++I) {
           if (Op.getOperand(I) == Value)
             continue;
           SmallVector<SDValue, 3> Ops;
           Ops.push_back(N);
           Ops.push_back(Op.getOperand(I));
           Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
           N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
         }
       }
       return N;
     }
     // All operands are constants from here on.  For floating-point elements,
     // rebuild the vector with every operand bitcast to i32, lower that
     // recursively, and bitcast the result back to VT if it succeeded.
     if (VT.getVectorElementType().isFloatingPoint()) {
       SmallVector<SDValue, 8> Ops;
       for (unsigned i = 0; i < NumElts; ++i)
         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
                                   Op.getOperand(i)));
       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
       SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
       Val = LowerBUILD_VECTOR(Val, DAG, ST);
       if (Val.getNode())
         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
     }
     // A single constant that fits one move instruction can be VDUPed.
     if (usesOnlyOneValue) {
       SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
       if (isConstant && Val.getNode())
         return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
     }
   }
 
   // If all elements are constants and the case above didn't get hit, fall back
   // to the default expansion, which will generate a load from the constant
   // pool.
   if (isConstant)
     return SDValue();
 
   // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
   if (NumElts >= 4) {
     SDValue shuffle = ReconstructShuffle(Op, DAG);
     if (shuffle != SDValue())
       return shuffle;
   }
 
   if (VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
     // If we haven't found an efficient lowering, try splitting a 128-bit vector
     // into two 64-bit vectors; we might discover a better way to lower it.
     SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
     EVT ExtVT = VT.getVectorElementType();
     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
     SDValue Lower =
         DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElts / 2));
     // Only recurse when getBuildVector actually produced a BUILD_VECTOR node.
     if (Lower.getOpcode() == ISD::BUILD_VECTOR)
       Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
     SDValue Upper = DAG.getBuildVector(
         HVT, dl, makeArrayRef(&Ops[NumElts / 2], NumElts / 2));
     if (Upper.getOpcode() == ISD::BUILD_VECTOR)
       Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
     // Use the split form only if both halves were lowered successfully.
     if (Lower && Upper)
       return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
   }
 
   // Vectors with 32- or 64-bit elements can be built by directly assigning
   // the subregisters.  Lower it to an ARMISD::BUILD_VECTOR so the operands
   // will be legalized.
   if (EltSize >= 32) {
     // Do the expansion with floating-point types, since that is what the VFP
     // registers are defined to use, and since i64 is not legal.
     EVT EltVT = EVT::getFloatingPointVT(EltSize);
     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
     SmallVector<SDValue, 8> Ops;
     for (unsigned i = 0; i < NumElts; ++i)
       Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
   // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
   // know the default expansion would otherwise fall back on something even
   // worse. For a vector with one or two non-undef values, that's
   // scalar_to_vector for the elements followed by a shuffle (provided the
   // shuffle is valid for the target) and materialization element by element
   // on the stack followed by a load for everything else.
   if (!isConstant && !usesOnlyOneValue) {
     SDValue Vec = DAG.getUNDEF(VT);
     for (unsigned i = 0 ; i < NumElts; ++i) {
       SDValue V = Op.getOperand(i);
       // UNDEF lanes need no insert; they stay undefined in Vec.
       if (V.isUndef())
         continue;
       SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
       Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
     }
     return Vec;
   }
 
   return SDValue();
 }
 
 // Gather data to see if the operation can be modelled as a
 // shuffle in combination with VEXTs.
 //
 // Op must be a BUILD_VECTOR.  The rewrite is attempted only when every
 // defined operand is an EXTRACT_VECTOR_ELT with a constant index and at most
 // two distinct source vectors are involved.  A source whose total width is
 // half that of the result is padded with UNDEF; one that is twice as wide is
 // narrowed with EXTRACT_SUBVECTOR or VEXT.  All sources are then bitcast to a
 // common shuffle type built from the smallest element type involved.
 //
 // Returns the shuffle bitcast back to the type of Op, or a null SDValue if the
 // pattern does not apply or the resulting mask is not legal for the target.
 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                               SelectionDAG &DAG) const {
   assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
   unsigned NumElts = VT.getVectorNumElements();
 
   // Book-keeping for one source vector of the would-be shuffle.
   struct ShuffleSourceInfo {
     // The original vector the BUILD_VECTOR operands extract from.
     SDValue Vec;
     // Smallest and largest lane of Vec referenced by any operand.
     unsigned MinElt = std::numeric_limits<unsigned>::max();
     unsigned MaxElt = 0;
 
     // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
     // be compatible with the shuffle we intend to construct. As a result
     // ShuffleVec will be some sliding window into the original Vec.
     SDValue ShuffleVec;
 
     // Code should guarantee that element i in Vec starts at element "WindowBase
     // + i * WindowScale in ShuffleVec".
     int WindowBase = 0;
     int WindowScale = 1;
 
     ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
 
     // Lets llvm::find() locate an entry by its original source vector.
     bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
   };
 
   // First gather all vectors used as an immediate source for this BUILD_VECTOR
   // node.
   SmallVector<ShuffleSourceInfo, 2> Sources;
   for (unsigned i = 0; i < NumElts; ++i) {
     SDValue V = Op.getOperand(i);
     if (V.isUndef())
       continue;
     else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
       // A shuffle can only come from building a vector from various
       // elements of other vectors.
       return SDValue();
     } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
       // Furthermore, shuffles require a constant mask, whereas extractelts
       // accept variable indices.
       return SDValue();
     }
 
     // Add this element source to the list if it's not already there.
     SDValue SourceVec = V.getOperand(0);
     auto Source = llvm::find(Sources, SourceVec);
     if (Source == Sources.end())
       Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
 
     // Update the minimum and maximum lane number seen.
     unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
     Source->MinElt = std::min(Source->MinElt, EltNo);
     Source->MaxElt = std::max(Source->MaxElt, EltNo);
   }
 
   // Currently only do something sane when at most two source vectors
   // are involved.
   if (Sources.size() > 2)
     return SDValue();
 
   // Find out the smallest element size among result and two sources, and use
   // it as element size to build the shuffle_vector.
   EVT SmallestEltTy = VT.getVectorElementType();
   for (auto &Source : Sources) {
     EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
     if (SrcEltTy.bitsLT(SmallestEltTy))
       SmallestEltTy = SrcEltTy;
   }
   // Number of shuffle lanes that make up one element of the result.
   unsigned ResMultiplier =
       VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
   // From here on NumElts is the lane count of the shuffle type, not of VT.
   NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
   EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
 
   // If the source vector is too wide or too narrow, we may nevertheless be able
   // to construct a compatible shuffle either by concatenating it with UNDEF or
   // extracting a suitable range of elements.
   for (auto &Src : Sources) {
     EVT SrcVT = Src.ShuffleVec.getValueType();
 
     // Already the right total width; nothing to adjust.
     if (SrcVT.getSizeInBits() == VT.getSizeInBits())
       continue;
 
     // This stage of the search produces a source with the same element type as
     // the original, but with a total width matching the BUILD_VECTOR output.
     EVT EltVT = SrcVT.getVectorElementType();
     unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
     EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
 
     if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
       // Only a source of exactly half the result width can be padded.
       if (2 * SrcVT.getSizeInBits() != VT.getSizeInBits())
         return SDValue();
       // We can pad out the smaller vector for free, so if it's part of a
       // shuffle...
       Src.ShuffleVec =
           DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
                       DAG.getUNDEF(Src.ShuffleVec.getValueType()));
       continue;
     }
 
     // Only a source of exactly twice the result width can be narrowed.
     if (SrcVT.getSizeInBits() != 2 * VT.getSizeInBits())
       return SDValue();
 
     if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
       // Span too large for a VEXT to cope
       return SDValue();
     }
 
     if (Src.MinElt >= NumSrcElts) {
       // The extraction can just take the second half
       Src.ShuffleVec =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
       // Lane i of Vec is now lane i - NumSrcElts of ShuffleVec.
       Src.WindowBase = -NumSrcElts;
     } else if (Src.MaxElt < NumSrcElts) {
       // The extraction can just take the first half
       Src.ShuffleVec =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                       DAG.getConstant(0, dl, MVT::i32));
     } else {
       // An actual VEXT is needed
       SDValue VEXTSrc1 =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                       DAG.getConstant(0, dl, MVT::i32));
       SDValue VEXTSrc2 =
           DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
                       DAG.getConstant(NumSrcElts, dl, MVT::i32));
 
       Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
                                    VEXTSrc2,
                                    DAG.getConstant(Src.MinElt, dl, MVT::i32));
       // The window now starts at MinElt of the original vector.
       Src.WindowBase = -Src.MinElt;
     }
   }
 
   // Another possible incompatibility occurs from the vector element types. We
   // can fix this by bitcasting the source vectors to the same type we intend
   // for the shuffle.
   for (auto &Src : Sources) {
     EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
     if (SrcEltTy == SmallestEltTy)
       continue;
     assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
     Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
     // Each original element now spans WindowScale shuffle lanes, so the window
     // base has to be rescaled to the new lane size as well.
     Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
     Src.WindowBase *= Src.WindowScale;
   }
 
   // Final sanity check before we try to actually produce a shuffle.
   LLVM_DEBUG(for (auto Src
                   : Sources)
                  assert(Src.ShuffleVec.getValueType() == ShuffleVT););
 
   // The stars all align, our next step is to produce the mask for the shuffle.
   // Lanes that are never written below stay -1 (UNDEF).
   SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
   int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
   for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
     SDValue Entry = Op.getOperand(i);
     if (Entry.isUndef())
       continue;
 
     auto Src = llvm::find(Sources, Entry.getOperand(0));
     int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
 
     // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
     // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
     // segment.
     EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
     int BitsDefined = std::min(OrigEltTy.getSizeInBits(),
                                VT.getScalarSizeInBits());
     int LanesDefined = BitsDefined / BitsPerShuffleLane;
 
     // This source is expected to fill ResMultiplier lanes of the final shuffle,
     // starting at the appropriate offset.
     int *LaneMask = &Mask[i * ResMultiplier];
 
     // Translate the original lane number into a lane of ShuffleVec, then
     // offset it by NumElts for each preceding source, since mask indices of
     // the second shuffle operand start at NumElts.
     int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
     ExtractBase += NumElts * (Src - Sources.begin());
     for (int j = 0; j < LanesDefined; ++j)
       LaneMask[j] = ExtractBase + j;
   }
 
   // Final check before we try to produce nonsense...
   if (!isShuffleMaskLegal(Mask, ShuffleVT))
     return SDValue();
 
   // We can't handle more than two sources. This should have already
   // been checked before this point.
   assert(Sources.size() <= 2 && "Too many sources!");
 
   // Operands without a corresponding source stay UNDEF.
   SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
   for (unsigned i = 0; i < Sources.size(); ++i)
     ShuffleOps[i] = Sources[i].ShuffleVec;
 
   SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
                                          ShuffleOps[1], Mask);
   return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
 }
 
 /// isShuffleMaskLegal - Decide whether the VECTOR_SHUFFLE described by mask
 /// \p M on type \p VT is one this target accepts. A mask is accepted when:
 ///  - \p VT has four elements and is 64 or 128 bits wide, and the
 ///    perfect-shuffle table entry for the mask has a cost of at most 4;
 ///  - the scalar element size is at least 32 bits; or
 ///  - the mask is a splat, or is recognised by isVREVMask (64/32/16),
 ///    isVEXTMask, isVTBLMask, isNEONTwoResultShuffleMask, or (for v8i16 and
 ///    v16i8 only) isReverseMask.
 bool ARMTargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
   const bool IsFourLaneDOrQ = VT.getVectorNumElements() == 4 &&
                               (VT.is128BitVector() || VT.is64BitVector());
   if (IsFourLaneDOrQ) {
     // Fold the four mask entries into a base-9 index into the perfect
     // shuffle table; a negative (undef) entry is encoded as digit 8.
     unsigned PFTableIndex = 0;
     for (unsigned Lane = 0; Lane != 4; ++Lane) {
       const unsigned Digit =
           M[Lane] < 0 ? 8u : static_cast<unsigned>(M[Lane]);
       PFTableIndex = PFTableIndex * 9 + Digit;
     }
 
     // The cost of the expansion lives in the bits from bit 30 upwards.
     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
     unsigned Cost = (PFEntry >> 30);
     if (Cost <= 4)
       return true;
   }
 
   // Out-parameters required by the mask recognisers; their values are not
   // used here, only the yes/no answers are.
   bool ReverseVEXT, isV_UNDEF;
   unsigned Imm, WhichResult;
 
   if (VT.getScalarSizeInBits() >= 32)
     return true;
   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT))
     return true;
   if (isVREVMask(M, VT, 64))
     return true;
   if (isVREVMask(M, VT, 32))
     return true;
   if (isVREVMask(M, VT, 16))
     return true;
   if (isVEXTMask(M, VT, ReverseVEXT, Imm))
     return true;
   if (isVTBLMask(M, VT))
     return true;
   if (isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF))
     return true;
 
   // A full element reversal is only handled for these two types.
   return (VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(M, VT);
 }
 
 /// GeneratePerfectShuffle - Recursively expand the perfect-shuffle table entry
 /// \p PFEntry into the target nodes that implement it, using \p LHS and
 /// \p RHS as the two original shuffle inputs.
 ///
 /// Entry layout as decoded here: bits [29:26] select the operation, bits
 /// [25:13] hold the left operand ID and bits [12:0] the right operand ID.
 /// For every operation except OP_COPY the two IDs index PerfectShuffleTable
 /// and are expanded recursively. For OP_COPY the left ID is the base-9
 /// encoding of the mask and must be either <0,1,2,3> (yielding \p LHS) or
 /// <4,5,6,7> (yielding \p RHS).
 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                       SDValue RHS, SelectionDAG &DAG,
                                       const SDLoc &dl) {
   const unsigned IDMask = (1u << 13) - 1;
   const unsigned OpNum = (PFEntry >> 26) & 0x0F;
   const unsigned LHSID = (PFEntry >> 13) & IDMask;
   const unsigned RHSID = PFEntry & IDMask;
 
   enum {
     OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
     OP_VREV,
     OP_VDUP0,
     OP_VDUP1,
     OP_VDUP2,
     OP_VDUP3,
     OP_VEXT1,
     OP_VEXT2,
     OP_VEXT3,
     OP_VUZPL, // VUZP, left result
     OP_VUZPR, // VUZP, right result
     OP_VZIPL, // VZIP, left result
     OP_VZIPR, // VZIP, right result
     OP_VTRNL, // VTRN, left result
     OP_VTRNR  // VTRN, right result
   };
 
   // Leaf of the recursion: hand back one of the original inputs.
   if (OpNum == OP_COPY) {
     if (LHSID == (1*9+2)*9+3)
       return LHS;
     assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
     return RHS;
   }
 
   // Expand both operands first (left before right), then combine them.
   SDValue OpLHS =
       GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
   SDValue OpRHS =
       GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
   EVT VT = OpLHS.getValueType();
 
   switch (OpNum) {
   case OP_VREV: {
     // VREV divides the vector in half and swaps within the half; which VREV
     // variant is needed depends on the element width:
     //   i32/f32 -> VREV64, i16 -> VREV32, i8 -> VREV16.
     EVT EltVT = VT.getVectorElementType();
     unsigned RevOpc;
     if (EltVT == MVT::i32 || EltVT == MVT::f32) {
       RevOpc = ARMISD::VREV64;
     } else if (EltVT == MVT::i16) {
       RevOpc = ARMISD::VREV32;
     } else {
       assert(VT.getVectorElementType() == MVT::i8);
       RevOpc = ARMISD::VREV16;
     }
     return DAG.getNode(RevOpc, dl, VT, OpLHS);
   }
   case OP_VDUP0:
   case OP_VDUP1:
   case OP_VDUP2:
   case OP_VDUP3: {
     // The lane to duplicate is the distance from OP_VDUP0.
     SDValue LaneIdx = DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32);
     return DAG.getNode(ARMISD::VDUPLANE, dl, VT, OpLHS, LaneIdx);
   }
   case OP_VEXT1:
   case OP_VEXT2:
   case OP_VEXT3: {
     // OP_VEXT<k> extracts starting k elements in.
     SDValue Amount = DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32);
     return DAG.getNode(ARMISD::VEXT, dl, VT, OpLHS, OpRHS, Amount);
   }
   case OP_VUZPL:
   case OP_VUZPR: {
     // Two-result node; the L/R variant picks result 0 or 1.
     SDValue Pair =
         DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
     return Pair.getValue(OpNum-OP_VUZPL);
   }
   case OP_VZIPL:
   case OP_VZIPR: {
     SDValue Pair =
         DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
     return Pair.getValue(OpNum-OP_VZIPL);
   }
   case OP_VTRNL:
   case OP_VTRNR: {
     SDValue Pair =
         DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
     return Pair.getValue(OpNum-OP_VTRNL);
   }
   default:
     llvm_unreachable("Unknown shuffle opcode!");
   }
 }
 
 /// Lower a v8i8 shuffle to a table lookup. Every entry of \p ShuffleMask is
 /// materialised as an i32 constant and the resulting build-vector is used as
 /// the index operand of VTBL1 (when the second shuffle operand is undef) or
 /// VTBL2 (otherwise).
 static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
                                        ArrayRef<int> ShuffleMask,
                                        SelectionDAG &DAG) {
   SDLoc DL(Op);
   SDValue Table0 = Op.getOperand(0);
   SDValue Table1 = Op.getOperand(1);
 
   // Turn the mask into a vector of lookup indices.
   SmallVector<SDValue, 8> Indices;
   for (int MaskElt : ShuffleMask)
     Indices.push_back(DAG.getConstant(MaskElt, DL, MVT::i32));
   SDValue IndexVec = DAG.getBuildVector(MVT::v8i8, DL, Indices);
 
   // Only one table register is needed when the second input is undef.
   if (!Table1.getNode()->isUndef())
     return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, Table0, Table1,
                        IndexVec);
 
   return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, Table0, IndexVec);
 }
 
 /// Lower a shuffle that fully reverses the elements of a v8i16 or v16i8
 /// vector as a VREV64 followed by a VEXT of the result with itself. Only the
 /// first shuffle operand is read.
 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
                                                       SelectionDAG &DAG) {
   SDLoc DL(Op);
   SDValue Src = Op.getOperand(0);
   EVT VT = Src.getValueType();
 
   assert((VT == MVT::v8i16 || VT == MVT::v16i8) &&
          "Expect an v8i16/v16i8 type");
 
   // For a v16i8 type: VREV64 reverses the elements within each double word,
   // so <0, ..., 15> becomes <7, ..., 0, 15, ..., 8>.
   SDValue HalvesReversed = DAG.getNode(ARMISD::VREV64, DL, VT, Src);
 
   // Now move the first 8 bytes into the top double word and the last 8 bytes
   // into the bottom double word by extracting from the vector concatenated
   // with itself, starting half-way in. The v8i16 case is the same with four
   // elements per double word.
   unsigned HalfNumElts = 4;
   if (VT == MVT::v16i8)
     HalfNumElts = 8;
   SDValue Amount = DAG.getConstant(HalfNumElts, DL, MVT::i32);
   return DAG.getNode(ARMISD::VEXT, DL, VT, HalvesReversed, HalvesReversed,
                      Amount);
 }
 
 /// LowerVECTOR_SHUFFLE - Custom-lower a VECTOR_SHUFFLE node.
 ///
 /// The strategies below are tried in order and the first that applies wins:
 ///  1. For elements of at most 32 bits: splat (VDUP / VDUPLANE), VEXT,
 ///     VREV64/32/16, single-input VEXT, the two-result NEON shuffles, and the
 ///     same two-result shuffles seen through a CONCAT_VECTORS first operand.
 ///  2. For 4-element vectors: expansion via the perfect-shuffle table.
 ///  3. For elements of at least 32 bits: an ARMISD::BUILD_VECTOR of
 ///     individually extracted elements.
 ///  4. A full reverse of v8i16 / v16i8.
 ///  5. A table lookup for v8i8.
 ///
 /// \returns the replacement value, or an empty SDValue if none of the
 /// strategies applies.
 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
   SDLoc dl(Op);
   EVT VT = Op.getValueType();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
 
   // Convert shuffles that are directly supported on NEON to target-specific
   // DAG nodes, instead of keeping them as shuffles and matching them again
   // during code selection.  This is more efficient and avoids the possibility
   // of inconsistencies between legalization and selection.
   // FIXME: floating-point vectors should be canonicalized to integer vectors
   // of the same type so that they get CSEd properly.
   ArrayRef<int> ShuffleMask = SVN->getMask();
 
   unsigned EltSize = VT.getScalarSizeInBits();
   if (EltSize <= 32) {
     if (SVN->isSplat()) {
       int Lane = SVN->getSplatIndex();
       // If this is undef splat, generate it via "just" vdup, if possible.
       if (Lane == -1) Lane = 0;
 
       // Test if V1 is a SCALAR_TO_VECTOR.
       if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
         return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
       }
       // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
       // (and probably will turn into a SCALAR_TO_VECTOR once legalization
       // reaches it).
       if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
           !isa<ConstantSDNode>(V1.getOperand(0))) {
         // Equivalent means: every operand after the first is undef.
         bool IsScalarToVector = true;
         for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
           if (!V1.getOperand(i).isUndef()) {
             IsScalarToVector = false;
             break;
           }
         if (IsScalarToVector)
           return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
       }
       // General splat: duplicate the selected lane of V1.
       return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
                          DAG.getConstant(Lane, dl, MVT::i32));
     }
 
     bool ReverseVEXT;
     unsigned Imm;
     if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
       // ReverseVEXT reports that the mask matches with the operands exchanged.
       if (ReverseVEXT)
         std::swap(V1, V2);
       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
                          DAG.getConstant(Imm, dl, MVT::i32));
     }
 
     // Element reversal within 64-, 32- or 16-bit groups.
     if (isVREVMask(ShuffleMask, VT, 64))
       return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
     if (isVREVMask(ShuffleMask, VT, 32))
       return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
     if (isVREVMask(ShuffleMask, VT, 16))
       return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
 
     // With an undef second input, a VEXT of V1 with itself may match.
     if (V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
       return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
                          DAG.getConstant(Imm, dl, MVT::i32));
     }
 
     // Check for Neon shuffles that modify both input vectors in place.
     // If both results are used, i.e., if there are two shuffles with the same
     // source operands and with masks corresponding to both results of one of
     // these operations, DAG memoization will ensure that a single node is
     // used for both shuffles.
     unsigned WhichResult;
     bool isV_UNDEF;
     if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
             ShuffleMask, VT, WhichResult, isV_UNDEF)) {
       // isV_UNDEF: the mask matched the form that uses V1 for both inputs.
       if (isV_UNDEF)
         V2 = V1;
       return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
           .getValue(WhichResult);
     }
 
     // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
     // shuffles that produce a result larger than their operands with:
     //   shuffle(concat(v1, undef), concat(v2, undef))
     // ->
     //   shuffle(concat(v1, v2), undef)
     // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
     //
     // This is useful in the general case, but there are special cases where
     // native shuffles produce larger results: the two-result ops.
     //
     // Look through the concat when lowering them:
     //   shuffle(concat(v1, v2), undef)
     // ->
     //   concat(VZIP(v1, v2):0, :1)
     //
     if (V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
       SDValue SubV1 = V1->getOperand(0);
       SDValue SubV2 = V1->getOperand(1);
       EVT SubVT = SubV1.getValueType();
 
       // We expect these to have been canonicalized to -1.
       assert(llvm::all_of(ShuffleMask, [&](int i) {
         return i < (int)VT.getVectorNumElements();
       }) && "Unexpected shuffle index into UNDEF operand!");
 
       // Note the mask is matched against the sub-vector type, not VT.
       if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
               ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
         if (isV_UNDEF)
           SubV2 = SubV1;
         assert((WhichResult == 0) &&
                "In-place shuffle of concat can only have one result!");
         SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
                                   SubV1, SubV2);
         // Both results of the two-result node form the full-width value.
         return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
                            Res.getValue(1));
       }
     }
   }
 
   // If the shuffle is not directly supported and it has 4 elements, use
   // the PerfectShuffle-generated table to synthesize it from other shuffles.
   unsigned NumElts = VT.getVectorNumElements();
   if (NumElts == 4) {
     // Undef (negative) mask entries are encoded as digit 8.
     unsigned PFIndexes[4];
     for (unsigned i = 0; i != 4; ++i) {
       if (ShuffleMask[i] < 0)
         PFIndexes[i] = 8;
       else
         PFIndexes[i] = ShuffleMask[i];
     }
 
     // Compute the index in the perfect shuffle table.
     unsigned PFTableIndex =
       PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
     unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
     unsigned Cost = (PFEntry >> 30);
 
     if (Cost <= 4)
       return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
   }
 
   // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
   if (EltSize >= 32) {
     // Do the expansion with floating-point types, since that is what the VFP
     // registers are defined to use, and since i64 is not legal.
     EVT EltVT = EVT::getFloatingPointVT(EltSize);
     EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
     V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
     V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
     SmallVector<SDValue, 8> Ops;
     for (unsigned i = 0; i < NumElts; ++i) {
       if (ShuffleMask[i] < 0)
         Ops.push_back(DAG.getUNDEF(EltVT));
       else
         // Mask values below NumElts select from V1, the rest from V2. The
         // lane within the chosen input is taken with a bit-mask, which
         // assumes NumElts is a power of two.
         Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                   ShuffleMask[i] < (int)NumElts ? V1 : V2,
                                   DAG.getConstant(ShuffleMask[i] & (NumElts-1),
                                                   dl, MVT::i32)));
     }
     SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
   if ((VT == MVT::v8i16 || VT == MVT::v16i8) && isReverseMask(ShuffleMask, VT))
     return LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(Op, DAG);
 
   if (VT == MVT::v8i8)
     if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
       return NewOp;
 
   // Nothing matched: return an empty value.
   return SDValue();
 }
 
 /// Custom lowering hook for INSERT_VECTOR_ELT. The node is returned unchanged
 /// when its lane operand (operand 2) is a constant; otherwise an empty
 /// SDValue is returned.
 static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   // INSERT_VECTOR_ELT is legal only for immediate indexes.
   const bool HasImmediateLane = isa<ConstantSDNode>(Op.getOperand(2));
   if (HasImmediateLane)
     return Op;
 
   return SDValue();
 }
 
 /// Custom lowering hook for EXTRACT_VECTOR_ELT.
 ///  - Non-constant lane operand: returns an empty SDValue.
 ///  - i32 result taken from a vector whose scalars are narrower than 32
 ///    bits: returns an ARMISD::VGETLANEu node.
 ///  - Anything else: returns the node unchanged.
 static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
   SDValue LaneIdx = Op.getOperand(1);
   if (!isa<ConstantSDNode>(LaneIdx))
     return SDValue();
 
   SDValue SrcVec = Op.getOperand(0);
   const bool WidensToI32 = Op.getValueType() == MVT::i32 &&
                            SrcVec.getScalarValueSizeInBits() < 32;
   if (!WidensToI32)
     return Op;
 
   SDLoc dl(Op);
   return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, SrcVec, LaneIdx);
 }
 
 /// Lower a two-operand CONCAT_VECTORS producing a 128-bit vector. Each
 /// operand that is not undef is bitcast to f64 and inserted into the
 /// matching element of a v2f64 value; the v2f64 is then bitcast to the
 /// node's result type.
 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   // The only time a CONCAT_VECTORS operation can have legal types is when
   // two 64-bit vectors are concatenated to a 128-bit vector.
   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
          "unexpected CONCAT_VECTORS");
   SDLoc dl(Op);
 
   // Start from an undef pair so undef halves need no insertion at all.
   SDValue Pair = DAG.getUNDEF(MVT::v2f64);
   for (unsigned Half = 0; Half != 2; ++Half) {
     SDValue Part = Op.getOperand(Half);
     if (Part.isUndef())
       continue;
     SDValue PartAsF64 = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Part);
     SDValue Slot = DAG.getIntPtrConstant(Half, dl);
     Pair = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Pair,
                        PartAsF64, Slot);
   }
 
   return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Pair);
 }
 
 /// isExtendedBUILD_VECTOR - Return true if \p N is a BUILD_VECTOR of
 /// constants in which every element fits in half the element width, as a
 /// signed value when \p isSigned is set and as an unsigned value otherwise.
 /// A v2i64 value expressed as a BITCAST of a v4i32 BUILD_VECTOR is also
 /// recognised, by checking the high i32 half of each i64 element.
 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
                                    bool isSigned) {
   EVT VT = N->getValueType(0);
 
   // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
   if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
     SDNode *BVN = N->getOperand(0).getNode();
     if (BVN->getValueType(0) != MVT::v4i32 ||
         BVN->getOpcode() != ISD::BUILD_VECTOR)
       return false;
 
     // Position of the low / high word inside each pair of i32 operands.
     const unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
     const unsigned HiElt = 1 - LoElt;
 
     // Operands {0,1} form the first i64 element and {2,3} the second.
     for (unsigned Base = 0; Base != 4; Base += 2) {
       ConstantSDNode *Lo = dyn_cast<ConstantSDNode>(BVN->getOperand(Base + LoElt));
       ConstantSDNode *Hi = dyn_cast<ConstantSDNode>(BVN->getOperand(Base + HiElt));
       if (!Lo || !Hi)
         return false;
 
       if (isSigned) {
         // The high word must equal the sign-fill of the low word.
         if (Hi->getSExtValue() != Lo->getSExtValue() >> 32)
           return false;
       } else {
         // The high word must be zero.
         if (!Hi->isNullValue())
           return false;
       }
     }
     return true;
   }
 
   if (N->getOpcode() != ISD::BUILD_VECTOR)
     return false;
 
   // Every operand must be a constant representable in half the element size.
   const unsigned HalfSize = VT.getScalarSizeInBits() / 2;
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(i).getNode());
     if (!C)
       return false;
 
     const bool Fits = isSigned ? isIntN(HalfSize, C->getSExtValue())
                                : isUIntN(HalfSize, C->getZExtValue());
     if (!Fits)
       return false;
   }
 
   return true;
 }
 
 /// isSignExtended - Return true if \p N is a SIGN_EXTEND node, a
 /// sign-extending load, or a value accepted by isExtendedBUILD_VECTOR in its
 /// signed mode.
 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
   return N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N) ||
          isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/true);
 }
 
 /// isZeroExtended - Return true if \p N is a ZERO_EXTEND node, a
 /// zero-extending load, or a value accepted by isExtendedBUILD_VECTOR in its
 /// unsigned mode.
 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
   return N->getOpcode() == ISD::ZERO_EXTEND || ISD::isZEXTLoad(N) ||
          isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/false);
 }
 
 /// Return the vector type that \p OrigVT should be widened to so that it is
 /// 64 bits in total. Types already 64 bits or larger are returned as-is;
 /// v2i8 and v2i16 map to v2i32, and v4i8 maps to v4i16. Any other type
 /// narrower than 64 bits is not expected.
 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
   if (OrigVT.getSizeInBits() >= 64)
     return OrigVT;
 
   assert(OrigVT.isSimple() && "Expecting a simple value type");
 
   const MVT::SimpleValueType NarrowTy = OrigVT.getSimpleVT().SimpleTy;
 
   // Two-element vectors need 32-bit elements to reach 64 bits.
   if (NarrowTy == MVT::v2i8 || NarrowTy == MVT::v2i16)
     return MVT::v2i32;
 
   // Four-element vectors need 16-bit elements.
   if (NarrowTy == MVT::v4i8)
     return MVT::v4i16;
 
   llvm_unreachable("Unexpected Vector Type");
 }
 
 /// AddRequiredExtensionForVMULL - Make \p N wide enough to serve as a VMULL
 /// operand, which has to fill a 64-bit D register.
 ///
 /// \p OrigTy is the type of \p N before it was extended to \p ExtTy (which is
 /// expected to be 128 bits wide). If \p OrigTy is already at least 64 bits,
 /// \p N is returned untouched; otherwise a node with opcode \p ExtOpcode is
 /// created that widens \p N to the type chosen by getExtensionTo64Bits.
 static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
                                             const EVT &OrigTy,
                                             const EVT &ExtTy,
                                             unsigned ExtOpcode) {
   assert(ExtTy.is128BitVector() && "Unexpected extension size");
 
   // Narrower than a D register: insert the extra extension.
   if (OrigTy.getSizeInBits() < 64) {
     EVT WidenedVT = getExtensionTo64Bits(OrigTy);
     return DAG.getNode(ExtOpcode, SDLoc(N), WidenedVT, N);
   }
 
   // Already fills a D register.
   return N;
 }
 
 /// SkipLoadExtensionForVMULL - Re-issue the extending load \p LD so that it
 /// produces a value of exactly 64 bits.
 ///
 /// If the memory type is already at least 64 bits wide, a plain load of the
 /// memory type is created. Otherwise an extending load (with the same
 /// extension kind as \p LD) to the type chosen by getExtensionTo64Bits is
 /// created. Chain, base pointer, pointer info, alignment and memory-operand
 /// flags are all taken from \p LD.
 static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
   SDLoc DL(LD);
   EVT MemVT = LD->getMemoryVT();
   EVT WideVT = getExtensionTo64Bits(MemVT);
 
   if (WideVT != MemVT) {
     // We need to create a zextload/sextload. We cannot just create a load
     // followed by a sext/zext node because LowerMUL is also run during normal
     // operation legalization where we can't create illegal types.
     return DAG.getExtLoad(LD->getExtensionType(), DL, WideVT,
                           LD->getChain(), LD->getBasePtr(),
                           LD->getPointerInfo(), MemVT, LD->getAlignment(),
                           LD->getMemOperand()->getFlags());
   }
 
   // The memory type already has the right width: load it without extension.
   return DAG.getLoad(MemVT, DL, LD->getChain(), LD->getBasePtr(),
                      LD->getPointerInfo(), LD->getAlignment(),
                      LD->getMemOperand()->getFlags());
 }
 
 /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
 /// extending load, or BUILD_VECTOR with extended elements, return the
 /// unextended value. The unextended vector should be 64 bits so that it can
 /// be used as an operand to a VMULL instruction. If the original vector size
 /// before extension is less than 64 bits, an extension is added to resize
 /// the vector to 64 bits.
 ///
 /// In the load case the uses of the original load are rewritten: its chain
 /// result is replaced by the new load's chain, and its value result by an
 /// explicit extension of the new load.
 static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
   const unsigned NodeOpc = N->getOpcode();
 
   // Explicit extension node: peel it off, re-widening the source if needed.
   if (NodeOpc == ISD::SIGN_EXTEND || NodeOpc == ISD::ZERO_EXTEND) {
     SDValue Src = N->getOperand(0);
     return AddRequiredExtensionForVMULL(Src, DAG, Src->getValueType(0),
                                         N->getValueType(0), NodeOpc);
   }
 
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
     assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
            "Expected extending load");
 
     // Replace the extending load by a narrower load, and keep other users of
     // the old load working by giving them (ext narrow-load) instead.
     SDValue NarrowLoad = SkipLoadExtensionForVMULL(LD, DAG);
     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NarrowLoad.getValue(1));
     unsigned ExtOpc = ISD::ZERO_EXTEND;
     if (ISD::isSEXTLoad(LD))
       ExtOpc = ISD::SIGN_EXTEND;
     SDValue Reextended = DAG.getNode(ExtOpc, SDLoc(NarrowLoad),
                                      LD->getValueType(0), NarrowLoad);
     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), Reextended);
 
     return NarrowLoad;
   }
 
   // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
   // have been legalized as a BITCAST from v4i32.
   if (NodeOpc == ISD::BITCAST) {
     SDNode *BVN = N->getOperand(0).getNode();
     assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
            BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
     // Keep only the low i32 word of each i64 element.
     const unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
     SDValue LowWords[] = {BVN->getOperand(LowElt),
                           BVN->getOperand(LowElt + 2)};
     return DAG.getBuildVector(MVT::v2i32, SDLoc(N), LowWords);
   }
 
   // Construct a new BUILD_VECTOR with elements truncated to half the size.
   assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
   EVT VT = N->getValueType(0);
   const unsigned NumElts = VT.getVectorNumElements();
   MVT HalfEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits() / 2);
   SDLoc dl(N);
 
   SmallVector<SDValue, 8> Elts;
   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
     const APInt &Value =
         cast<ConstantSDNode>(N->getOperand(Idx))->getAPIntValue();
     // Element types smaller than 32 bits are not legal, so use i32 elements.
     // The values are implicitly truncated so sext vs. zext doesn't matter.
     Elts.push_back(DAG.getConstant(Value.zextOrTrunc(32), dl, MVT::i32));
   }
   return DAG.getBuildVector(MVT::getVectorVT(HalfEltVT, NumElts), dl, Elts);
 }
 
 /// Return true if \p N is an ADD or SUB whose two operands each have a single
 /// use and are both sign-extended according to isSignExtended.
 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
   const unsigned Opc = N->getOpcode();
   if (Opc != ISD::ADD && Opc != ISD::SUB)
     return false;
 
   SDNode *LHS = N->getOperand(0).getNode();
   SDNode *RHS = N->getOperand(1).getNode();
   if (!LHS->hasOneUse() || !RHS->hasOneUse())
     return false;
 
   return isSignExtended(LHS, DAG) && isSignExtended(RHS, DAG);
 }
 
 /// Return true if \p N is an ADD or SUB whose two operands each have a single
 /// use and are both zero-extended according to isZeroExtended.
 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
   const unsigned Opc = N->getOpcode();
   if (Opc != ISD::ADD && Opc != ISD::SUB)
     return false;
 
   SDNode *LHS = N->getOperand(0).getNode();
   SDNode *RHS = N->getOperand(1).getNode();
   if (!LHS->hasOneUse() || !RHS->hasOneUse())
     return false;
 
   return isZeroExtended(LHS, DAG) && isZeroExtended(RHS, DAG);
 }
 
 /// LowerMUL - Custom-lower a 128-bit integer vector multiply, turning it into
 /// VMULL where the operands are recognisably extended.
 ///
 /// Outcomes:
 ///  - Both operands sign-extended (or both zero-extended): a single
 ///    VMULLs / VMULLu on the unextended operands.
 ///  - One operand extended and the other an add/sub of two matching
 ///    extended values: the multiply is distributed, giving an add/sub of two
 ///    VMULL nodes.
 ///  - No pattern matched: an empty SDValue for v2i64, otherwise \p Op
 ///    unchanged.
 static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   // Multiplications are only custom-lowered for 128-bit vectors so that
   // VMULL can be detected.  Otherwise v2i64 multiplications are not legal.
   EVT VT = Op.getValueType();
   assert(VT.is128BitVector() && VT.isInteger() &&
          "unexpected type for custom-lowering ISD::MUL");
   SDNode *N0 = Op.getOperand(0).getNode();
   SDNode *N1 = Op.getOperand(1).getNode();
   // NewOpc stays 0 until a VMULL form has been selected; isMLA marks the
   // distributed (add/sub of two multiplies) form.
   unsigned NewOpc = 0;
   bool isMLA = false;
   bool isN0SExt = isSignExtended(N0, DAG);
   bool isN1SExt = isSignExtended(N1, DAG);
   if (isN0SExt && isN1SExt)
     NewOpc = ARMISD::VMULLs;
   else {
     bool isN0ZExt = isZeroExtended(N0, DAG);
     bool isN1ZExt = isZeroExtended(N1, DAG);
     if (isN0ZExt && isN1ZExt)
       NewOpc = ARMISD::VMULLu;
     else if (isN1SExt || isN1ZExt) {
       // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
       // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
       if (isN1SExt && isAddSubSExt(N0, DAG)) {
         NewOpc = ARMISD::VMULLs;
         isMLA = true;
       } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
         NewOpc = ARMISD::VMULLu;
         isMLA = true;
       } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
         // NOTE(review): this arm needs N1 to be an ADD/SUB, yet the enclosing
         // condition needs N1 to satisfy isSignExtended/isZeroExtended, which
         // as written here never accept an ADD/SUB node - confirm whether this
         // arm is reachable.
         // The swap puts the add/sub operand in N0, as the MLA code expects.
         std::swap(N0, N1);
         NewOpc = ARMISD::VMULLu;
         isMLA = true;
       }
     }
 
     if (!NewOpc) {
       if (VT == MVT::v2i64)
         // Fall through to expand this.  It is not legal.
         return SDValue();
       else
         // Other vector multiplications are legal.
         return Op;
     }
   }
 
   // Legalize to a VMULL instruction.
   SDLoc DL(Op);
   SDValue Op0;
   SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
   if (!isMLA) {
     Op0 = SkipExtensionForVMULL(N0, DAG);
     assert(Op0.getValueType().is64BitVector() &&
            Op1.getValueType().is64BitVector() &&
            "unexpected types for extended operands to VMULL");
     return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
   }
 
   // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
   // isel lowering to take advantage of no-stall back to back vmul + vmla.
   //   vmull q0, d4, d6
   //   vmlal q0, d5, d6
   // is faster than
   //   vaddl q0, d4, d5
   //   vmovl q1, d6
   //   vmul  q0, q0, q1
   SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
   SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
   EVT Op1VT = Op1.getValueType();
   // N0's own opcode (ADD or SUB) combines the two products; each unextended
   // addend is bitcast to Op1's type before being multiplied by Op1.
   return DAG.getNode(N0->getOpcode(), DL, VT,
                      DAG.getNode(NewOpc, DL, VT,
                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
                      DAG.getNode(NewOpc, DL, VT,
                                DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
 }
 
 /// Emit a four-lane signed divide of \p X by \p Y for values in the char
 /// range, using a float reciprocal estimate with no refinement step.
 /// Both inputs are sign-extended to v4i32 here and the quotient is returned
 /// as a v4i16 value.
 static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl,
                               SelectionDAG &DAG) {
   // TODO: Should this propagate fast-math-flags?
 
   // Widen to 32 bits and convert to float:
   //   float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
   //   float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
   SDValue WideX = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
   SDValue WideY = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
   SDValue FloatX = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, WideX);
   SDValue FloatY = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, WideY);
 
   // Reciprocal estimate of the divisor:
   //   float4 recip = vrecpeq_f32(yf);
   SDValue RecipeID = DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32);
   SDValue Recip = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                               RecipeID, FloatY);
 
   // Because char has a smaller range than uchar, we can actually get away
   // without any newton steps.  This requires that we use a weird bias
   // of 0xb000, however (again, this has been exhaustively tested).
   //   float4 result = as_float4(as_int4(xf*recip) + 0xb000);
   SDValue Product = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, FloatX, Recip);
   SDValue ProductBits = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Product);
   SDValue Bias = DAG.getConstant(0xb000, dl, MVT::v4i32);
   SDValue BiasedBits =
       DAG.getNode(ISD::ADD, dl, MVT::v4i32, ProductBits, Bias);
   SDValue Biased = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, BiasedBits);
 
   // Convert back to short.
   SDValue Quotient = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, Biased);
   return DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, Quotient);
 }
 
 /// Emit a four-lane signed divide of \p N0 by \p N1 for values in the short
 /// range, using a float reciprocal estimate refined by one step.
 /// Both inputs are sign-extended to v4i32 here and the quotient is returned
 /// as a v4i16 value.
 static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl,
                                SelectionDAG &DAG) {
   // TODO: Should this propagate fast-math-flags?
 
   // Widen to 32 bits and convert to float:
   //   float4 yf = vcvt_f32_s32(vmovl_s16(y));
   //   float4 xf = vcvt_f32_s32(vmovl_s16(x));
   SDValue WideX = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
   SDValue WideY = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
   SDValue FloatX = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, WideX);
   SDValue FloatY = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, WideY);
 
   // Use reciprocal estimate and one refinement step.
   //   float4 recip = vrecpeq_f32(yf);
   //   recip *= vrecpsq_f32(yf, recip);
   SDValue RecipeID = DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32);
   SDValue Estimate = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                                  RecipeID, FloatY);
   SDValue RecipsID = DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32);
   SDValue Correction = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                                    RecipsID, FloatY, Estimate);
   SDValue Recip =
       DAG.getNode(ISD::FMUL, dl, MVT::v4f32, Correction, Estimate);
 
   // Because short has a smaller range than ushort, we can actually get away
   // with only a single newton step.  This requires that we use a weird bias
   // of 0x89, however (again, this has been exhaustively tested).
   //   float4 result = as_float4(as_int4(xf*recip) + 0x89);
   SDValue Product = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, FloatX, Recip);
   SDValue ProductBits = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Product);
   SDValue Bias = DAG.getConstant(0x89, dl, MVT::v4i32);
   SDValue BiasedBits =
       DAG.getNode(ISD::ADD, dl, MVT::v4i32, ProductBits, Bias);
   SDValue Biased = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, BiasedBits);
 
   // Convert back to integer and return.
   //   return vmovn_s32(vcvt_s32_f32(result));
   SDValue Quotient = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, Biased);
   return DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, Quotient);
 }
 
 /// Custom-lower a signed vector divide of type v4i16 or v8i8.
 ///
 /// v4i16 is handed straight to LowerSDIV_v4i16. v8i8 is sign-extended to
 /// v8i16, split into two v4i16 halves that are each divided with
 /// LowerSDIV_v4i8, and the two quotients are concatenated and truncated back
 /// to v8i8.
 static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
          "unexpected type for custom-lowering ISD::SDIV");
 
   SDLoc dl(Op);
   SDValue Dividend = Op.getOperand(0);
   SDValue Divisor = Op.getOperand(1);
 
   if (VT != MVT::v8i8)
     return LowerSDIV_v4i16(Dividend, Divisor, dl, DAG);
 
   // Widen both operands so each half is a v4i16.
   SDValue WideDividend =
       DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, Dividend);
   SDValue WideDivisor = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, Divisor);
 
   // Split into the upper (elements 4..7) and lower (elements 0..3) halves.
   SDValue HiDividend =
       DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, WideDividend,
                   DAG.getIntPtrConstant(4, dl));
   SDValue HiDivisor =
       DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, WideDivisor,
                   DAG.getIntPtrConstant(4, dl));
   SDValue LoDividend =
       DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, WideDividend,
                   DAG.getIntPtrConstant(0, dl));
   SDValue LoDivisor =
       DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, WideDivisor,
                   DAG.getIntPtrConstant(0, dl));
 
   // Divide each half; both quotients come back as v4i16.
   SDValue LoQuotient = LowerSDIV_v4i8(LoDividend, LoDivisor, dl, DAG);
   SDValue HiQuotient = LowerSDIV_v4i8(HiDividend, HiDivisor, dl, DAG);
 
   // Glue the halves back together and narrow to the original element size.
   SDValue Joined = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16,
                                LoQuotient, HiQuotient);
   Joined = LowerCONCAT_VECTORS(Joined, DAG);
 
   return DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, Joined);
 }
 
 /// Custom-lower a vector ISD::UDIV whose result type is v4i16 or v8i8.
 ///
 /// For v8i8 the operands are zero-extended to v8i16, split into two v4i16
 /// halves, each half is divided with LowerSDIV_v4i16, and the concatenated
 /// result is narrowed to v8i8 through the arm_neon_vqmovnsu intrinsic.
 /// For v4i16 the division is carried out in v4f32 using a reciprocal
 /// estimate (arm_neon_vrecpe) refined twice with arm_neon_vrecps.
 static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
   // TODO: Should this propagate fast-math-flags?
   EVT VT = Op.getValueType();
   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
          "unexpected type for custom-lowering ISD::UDIV");
 
   SDLoc dl(Op);
   SDValue N0 = Op.getOperand(0);
   SDValue N1 = Op.getOperand(1);
   SDValue N2, N3;
 
   if (VT == MVT::v8i8) {
     // Widen to 16-bit lanes.  After zero-extension from i8 every lane is in
     // [0, 255], i.e. non-negative when viewed as i16, which is why the signed
     // v4i16 helper is used for the halves below.
     N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
     N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
 
     // N2/N3 receive lanes 4..7 of the operands; N0/N1 are narrowed to
     // lanes 0..3.
     N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                      DAG.getIntPtrConstant(4, dl));
     N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                      DAG.getIntPtrConstant(4, dl));
     N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
                      DAG.getIntPtrConstant(0, dl));
     N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
                      DAG.getIntPtrConstant(0, dl));
 
     N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
     N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
 
     // Glue the two quotient halves back together.
     N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
     N0 = LowerCONCAT_VECTORS(N0, DAG);
 
     // Narrow v8i16 -> v8i8 via the arm_neon_vqmovnsu intrinsic.
     N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
                      DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
                                      MVT::i32),
                      N0);
     return N0;
   }
 
   // v4i16 udiv ... Convert to float.
   // float4 yf = vcvt_f32_s32(vmovl_u16(y));
   // float4 xf = vcvt_f32_s32(vmovl_u16(x));
   N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
   N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
   N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
   // BN1 keeps the float divisor alive; N1 and N2 are reused as scratch
   // values by the refinement steps below.
   SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
 
   // Use reciprocal estimate and two refinement steps.
   // float4 recip = vrecpeq_f32(yf);
   // recip *= vrecpsq_f32(yf, recip);
   // recip *= vrecpsq_f32(yf, recip);
   N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
                    BN1);
   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                    BN1, N2);
   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
   N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
                    DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
                    BN1, N2);
   N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
   // Simply multiplying by the reciprocal estimate can leave us a few ulps
   // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
   // and that it will never cause us to return an answer too large).
   // float4 result = as_float4(as_int4(xf*recip) + 2);
   N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
   N1 = DAG.getConstant(2, dl, MVT::v4i32);
   N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
   N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
   // Convert back to integer and return.
   // return vmovn_u32(vcvt_s32_f32(result));
   N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
   N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
   return N0;
 }
 
 /// Lower ISD::ADDCARRY / ISD::SUBCARRY onto ARMISD::ADDE / ARMISD::SUBE.
 ///
 /// The incoming boolean carry (operand 2) is turned into a carry flag, the
 /// flag-consuming arithmetic node is emitted, and the resulting flag is
 /// turned back into a boolean.  For subtraction the boolean is inverted
 /// (1 - C) on the way in and on the way out, because ARMISD::SUBE works with
 /// a carry whereas ISD::SUBCARRY works with a borrow.
 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
   SDNode *Node = Op.getNode();
   EVT ResultVT = Node->getValueType(0);
   SDVTList ArithVTs = DAG.getVTList(ResultVT, MVT::i32);
 
   SDValue CarryIn = Op.getOperand(2);
 
   SDLoc DL(Op);
 
   const bool IsSub = Op.getOpcode() != ISD::ADDCARRY;
 
   // ARMISD::SUBE expects a carry, not a borrow, so flip the incoming value.
   if (IsSub)
     CarryIn = DAG.getNode(ISD::SUB, DL, MVT::i32,
                           DAG.getConstant(1, DL, MVT::i32), CarryIn);
 
   // Turn the boolean into the carry flag.
   SDValue CarryFlag = ConvertBooleanCarryToCarryFlag(CarryIn, DAG);
 
   // Emit the arithmetic proper, consuming the carry flag.
   SDValue Result =
       DAG.getNode(IsSub ? ARMISD::SUBE : ARMISD::ADDE, DL, ArithVTs,
                   Op.getOperand(0), Op.getOperand(1), CarryFlag);
 
   // Turn the produced carry flag back into a boolean.
   SDValue CarryOut =
       ConvertCarryFlagToBooleanCarry(Result.getValue(1), ResultVT, DAG);
 
   // ISD::SUBCARRY reports a borrow, so flip the outgoing value as well.
   if (IsSub)
     CarryOut = DAG.getNode(ISD::SUB, DL, MVT::i32,
                            DAG.getConstant(1, DL, MVT::i32), CarryOut);
 
   // Hand back both the arithmetic result and the boolean carry.
   return DAG.getNode(ISD::MERGE_VALUES, DL, Node->getVTList(), Result,
                      CarryOut);
 }
 
 /// Lower ISD::FSINCOS on Darwin to a SINCOS_STRET libcall.
 ///
 /// When the subtarget uses the APCS ABI the {sin, cos} pair is returned
 /// through an sret stack slot and loaded back afterwards; otherwise the
 /// call's own return value is used directly.
 SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   assert(Subtarget->isTargetDarwin());
 
   // On iOS the alternative entry point __sincos_stret is called, with the
   // results passed back via sret.
   SDLoc dl(Op);
   SDValue Operand = Op.getOperand(0);
   EVT OperandVT = Operand.getValueType();
   Type *OperandTy = OperandVT.getTypeForEVT(*DAG.getContext());
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
   MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // The result is a struct holding two values of the operand's type.
   Type *CallRetTy = StructType::get(OperandTy, OperandTy);
   auto &DL = DAG.getDataLayout();
 
   ArgListTy CallArgs;
   const bool ShouldUseSRet = Subtarget->isAPCS_ABI();
   SDValue SRet;
   if (ShouldUseSRet) {
     // Reserve a stack slot for the result struct and pass its address as
     // the leading sret argument.
     const uint64_t SlotSize = DL.getTypeAllocSize(CallRetTy);
     const unsigned SlotAlign = DL.getPrefTypeAlignment(CallRetTy);
     int SlotIdx = FrameInfo.CreateStackObject(SlotSize, SlotAlign, false);
     SRet = DAG.getFrameIndex(SlotIdx, TLI.getPointerTy(DL));
 
     ArgListEntry SRetEntry;
     SRetEntry.Node = SRet;
     SRetEntry.Ty = CallRetTy->getPointerTo();
     SRetEntry.IsSExt = false;
     SRetEntry.IsZExt = false;
     SRetEntry.IsSRet = true;
     CallArgs.push_back(SRetEntry);
 
     // With sret the call itself returns nothing.
     CallRetTy = Type::getVoidTy(*DAG.getContext());
   }
 
   ArgListEntry OperandEntry;
   OperandEntry.Node = Operand;
   OperandEntry.Ty = OperandTy;
   OperandEntry.IsSExt = false;
   OperandEntry.IsZExt = false;
   CallArgs.push_back(OperandEntry);
 
   // Select the f64 or f32 flavour of the libcall.
   RTLIB::Libcall LC = RTLIB::SINCOS_STRET_F32;
   if (OperandVT == MVT::f64)
     LC = RTLIB::SINCOS_STRET_F64;
   const char *LibcallName = getLibcallName(LC);
   CallingConv::ID CC = getLibcallCallingConv(LC);
   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DL));
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl)
       .setChain(DAG.getEntryNode())
       .setCallee(CC, CallRetTy, Callee, std::move(CallArgs))
       .setDiscardResult(ShouldUseSRet);
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
 
   // Without sret the call result already carries both values.
   if (!ShouldUseSRet)
     return CallResult.first;
 
   // First field of the sret slot: sin.
   SDValue LoadSin = DAG.getLoad(OperandVT, dl, CallResult.second, SRet,
                                 MachinePointerInfo());
 
   // Second field of the sret slot: cos, one store-size past the first.
   SDValue CosAddr =
       DAG.getNode(ISD::ADD, dl, PtrVT, SRet,
                   DAG.getIntPtrConstant(OperandVT.getStoreSize(), dl));
   SDValue LoadCos = DAG.getLoad(OperandVT, dl, LoadSin.getValue(1), CosAddr,
                                 MachinePointerInfo());
 
   SDVTList ResultTys = DAG.getVTList(OperandVT, OperandVT);
   return DAG.getNode(ISD::MERGE_VALUES, dl, ResultTys,
                      LoadSin.getValue(0), LoadCos.getValue(0));
 }
 
 /// Emit a call to the Windows runtime division helper matching \p Op.
 ///
 /// The helper is __rt_sdiv / __rt_udiv for i32 and __rt_sdiv64 /
 /// __rt_udiv64 for i64, selected by \p Signed.  Operand 1 of \p Op is passed
 /// as the first argument and operand 0 as the second.  \p Chain is used as
 /// the call's input chain; only the call's result value is returned.
 SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
                                                   bool Signed,
                                                   SDValue &Chain) const {
   EVT VT = Op.getValueType();
   assert((VT == MVT::i32 || VT == MVT::i64) &&
          "unexpected type for custom lowering DIV");
   SDLoc dl(Op);
 
   const auto &DL = DAG.getDataLayout();
   const auto &TLI = DAG.getTargetLoweringInfo();
 
   // Pick the runtime routine from signedness and width.
   const bool Is32Bit = VT == MVT::i32;
   const char *Name = Signed ? (Is32Bit ? "__rt_sdiv" : "__rt_sdiv64")
                             : (Is32Bit ? "__rt_udiv" : "__rt_udiv64");
 
   SDValue ES = DAG.getExternalSymbol(Name, TLI.getPointerTy(DL));
 
   ARMTargetLowering::ArgListTy Args;
 
   // Append one operand of Op to the argument list.
   auto AppendOperand = [&](unsigned OperandIdx) {
     ArgListEntry Arg;
     Arg.Node = Op.getOperand(OperandIdx);
     Arg.Ty = Arg.Node.getValueType().getTypeForEVT(*DAG.getContext());
     Args.push_back(Arg);
   };
   // Note the order: operand 1 goes first, then operand 0.
   AppendOperand(1);
   AppendOperand(0);
 
   Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
 
   CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl)
     .setChain(Chain)
     .setCallee(CallingConv::ARM_AAPCS_VFP, RetTy, ES, std::move(Args));
 
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.first;
 }
 
 /// Lower a 32-bit integer division for Windows: emit an
 /// ARMISD::WIN__DBZCHK node on operand 1 of \p Op and use it as the chain
 /// for the runtime division libcall.
 SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
                                             bool Signed) const {
   assert(Op.getValueType() == MVT::i32 &&
          "unexpected type for custom lowering DIV");
   SDLoc dl(Op);
 
   // The check hangs off the entry node and inspects the second operand.
   SDValue Denominator = Op.getOperand(1);
   SDValue ZeroCheck = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
                                   DAG.getEntryNode(), Denominator);
 
   return LowerWindowsDIVLibCall(Op, DAG, Signed, ZeroCheck);
 }
 
 /// Build an ARMISD::WIN__DBZCHK node, chained on \p InChain, for operand 1
 /// of \p N.  When the result type of \p N is not i32, the two i32 elements
 /// of the operand are OR'ed together and the check is applied to that value.
 static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain) {
   SDLoc DL(N);
   SDValue Denominator = N->getOperand(1);
 
   // For i32 the operand itself is checked.
   SDValue Checked = Denominator;
   if (N->getValueType(0) != MVT::i32) {
     // Otherwise combine element 0 and element 1 into a single i32.
     SDValue Elt0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Denominator,
                                DAG.getConstant(0, DL, MVT::i32));
     SDValue Elt1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, Denominator,
                                DAG.getConstant(1, DL, MVT::i32));
     Checked = DAG.getNode(ISD::OR, DL, MVT::i32, Elt0, Elt1);
   }
 
   return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Checked);
 }
 
 /// Expand a 64-bit integer division for Windows.  The runtime libcall is
 /// emitted behind a WIN__DBZCHK on the denominator, and its i64 result is
 /// pushed onto \p Results as two i32 values: the truncated low part first,
 /// then the part obtained by shifting right by 32.
 void ARMTargetLowering::ExpandDIV_Windows(
     SDValue Op, SelectionDAG &DAG, bool Signed,
     SmallVectorImpl<SDValue> &Results) const {
   const auto &DL = DAG.getDataLayout();
   const auto &TLI = DAG.getTargetLoweringInfo();
 
   assert(Op.getValueType() == MVT::i64 &&
          "unexpected type for custom lowering DIV");
   SDLoc dl(Op);
 
   // Zero check first; it serves as the chain for the libcall.
   SDValue ZeroCheck =
       WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
   SDValue Quotient = LowerWindowsDIVLibCall(Op, DAG, Signed, ZeroCheck);
 
   // Low 32 bits.
   SDValue LoPart = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Quotient);
 
   // High 32 bits: shift down, then truncate.
   SDValue ShiftAmt = DAG.getConstant(32, dl, TLI.getPointerTy(DL));
   SDValue HiPart = DAG.getNode(ISD::SRL, dl, MVT::i64, Quotient, ShiftAmt);
   HiPart = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, HiPart);
 
   Results.push_back(LoPart);
   Results.push_back(HiPart);
 }
 
 /// Custom lowering for atomic loads/stores: the node is returned unchanged
 /// when its ordering is no stronger than monotonic, and an empty SDValue is
 /// returned otherwise.
 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
   const auto Ordering = cast<AtomicSDNode>(Op)->getOrdering();
 
   // Monotonic load/store is legal for all targets.
   if (!isStrongerThanMonotonic(Ordering))
     return Op;
 
   // Acquire/Release load/store is not legal for targets that lack a dmb or
   // an equivalent.
   return SDValue();
 }
 
 /// Replace the results of a READCYCLECOUNTER node.  An arm_mrc intrinsic
 /// node is emitted on the incoming chain; its i32 result is paired with a
 /// zero i32 into an i64, and that value followed by the intrinsic's output
 /// chain is appended to \p Results.
 static void ReplaceREADCYCLECOUNTER(SDNode *N,
                                     SmallVectorImpl<SDValue> &Results,
                                     SelectionDAG &DAG,
                                     const ARMSubtarget *Subtarget) {
   SDLoc DL(N);
 
   // Shorthand for an i32 constant at this location.
   auto ConstI32 = [&](unsigned Value) {
     return DAG.getConstant(Value, DL, MVT::i32);
   };
 
   // Under Power Management extensions, the cycle-count is:
   //    mrc p15, #0, <Rt>, c9, c13, #0
   SDValue MrcOps[] = { N->getOperand(0), // Chain
                        ConstI32(Intrinsic::arm_mrc),
                        ConstI32(15),
                        ConstI32(0),
                        ConstI32(9),
                        ConstI32(13),
                        ConstI32(0)
   };
 
   SDVTList MrcVTs = DAG.getVTList(MVT::i32, MVT::Other);
   SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, MrcVTs, MrcOps);
 
   // Pair the 32-bit reading with a zero i32 to form the i64 result.
   SDValue Cycles64 =
       DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32, ConstI32(0));
   Results.push_back(Cycles64);
   Results.push_back(Cycles32.getValue(1));
 }
 
 /// Wrap \p V into a REG_SEQUENCE over the GPRPair register class.  The value
 /// is split into an i32 obtained by any-extend-or-truncate and an i32 taken
 /// from the value shifted right by 32; on big-endian data layouts the two
 /// are swapped before being assigned to gsub_0 and gsub_1.
 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
   SDLoc dl(V.getNode());
 
   SDValue Part0 = DAG.getAnyExtOrTrunc(V, dl, MVT::i32);
 
   SDValue ShiftAmt = DAG.getConstant(32, dl, MVT::i32);
   SDValue Shifted = DAG.getNode(ISD::SRL, dl, MVT::i64, V, ShiftAmt);
   SDValue Part1 = DAG.getAnyExtOrTrunc(Shifted, dl, MVT::i32);
 
   // Big-endian layouts put the shifted-down part into gsub_0.
   if (DAG.getDataLayout().isBigEndian())
     std::swap (Part0, Part1);
 
   SDValue RegClass =
       DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
   SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
   SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
 
   const SDValue SeqOps[] = { RegClass, Part0, SubReg0, Part1, SubReg1 };
   auto *Pair =
       DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, SeqOps);
   return SDValue(Pair, 0);
 }
 
 /// Replace the results of a 64-bit ATOMIC_CMP_SWAP node with an
 /// ARM::CMP_SWAP_64 machine node.  Operands 2 and 3 of \p N are wrapped into
 /// GPR pairs; the node's memory operand is transferred to the new machine
 /// node.  \p Results receives the two i32 subregisters of the machine node's
 /// first result (order chosen by endianness) followed by its third result.
 static void ReplaceCMP_SWAP_64Results(SDNode *N,
                                        SmallVectorImpl<SDValue> & Results,
                                        SelectionDAG &DAG) {
   assert(N->getValueType(0) == MVT::i64 &&
          "AtomicCmpSwap on types less than 64 should be legal");
 
   SDLoc dl(N);
 
   SDValue Ops[] = {N->getOperand(1),
                    createGPRPairNode(DAG, N->getOperand(2)),
                    createGPRPairNode(DAG, N->getOperand(3)),
                    N->getOperand(0)};
   SDVTList CmpSwapVTs = DAG.getVTList(MVT::Untyped, MVT::i32, MVT::Other);
   SDNode *CmpSwap = DAG.getMachineNode(ARM::CMP_SWAP_64, dl, CmpSwapVTs, Ops);
 
   // Carry the original node's memory operand over to the machine node.
   MachineFunction &MF = DAG.getMachineFunction();
   MachineSDNode::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
   MemRefs[0] = cast<MemSDNode>(N)->getMemOperand();
   cast<MachineSDNode>(CmpSwap)->setMemRefs(MemRefs, MemRefs + 1);
 
   // Endianness decides which subregister is reported first.
   const bool IsBE = DAG.getDataLayout().isBigEndian();
   const int FirstSubReg = IsBE ? ARM::gsub_1 : ARM::gsub_0;
   const int SecondSubReg = IsBE ? ARM::gsub_0 : ARM::gsub_1;
 
   SDValue PairResult(CmpSwap, 0);
   Results.push_back(
       DAG.getTargetExtractSubreg(FirstSubReg, dl, MVT::i32, PairResult));
   Results.push_back(
       DAG.getTargetExtractSubreg(SecondSubReg, dl, MVT::i32, PairResult));
   Results.push_back(SDValue(CmpSwap, 2));
 }
 
 /// MSVCRT-specific lowering of ISD::FPOWI: the integer exponent is converted
 /// to the floating-point type of the base and the operation becomes a call
 /// to powf (f32) or pow (any other type), emitted as a tail call when
 /// possible.
 static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
                           SelectionDAG &DAG) {
   const auto &TLI = DAG.getTargetLoweringInfo();
 
   assert(Subtarget.getTargetTriple().isOSMSVCRT() &&
          "Custom lowering is MSVCRT specific!");
 
   SDLoc dl(Op);
   SDValue Base = Op.getOperand(0);
   MVT FPTy = Base->getSimpleValueType(0);
   SDValue FPExponent =
       DAG.getNode(ISD::SINT_TO_FP, dl, FPTy, Op.getOperand(1));
 
   const char *CalleeName = "pow";
   if (FPTy == MVT::f32)
     CalleeName = "powf";
   SDValue Callee = DAG.getExternalSymbol(
       CalleeName, TLI.getPointerTy(DAG.getDataLayout()));
 
   // Both arguments are passed with the zero-extension flag set.
   TargetLowering::ArgListTy Args;
 
   TargetLowering::ArgListEntry BaseEntry;
   BaseEntry.Node = Base;
   BaseEntry.Ty = Base.getValueType().getTypeForEVT(*DAG.getContext());
   BaseEntry.IsZExt = true;
   Args.push_back(BaseEntry);
 
   TargetLowering::ArgListEntry ExponentEntry;
   ExponentEntry.Node = FPExponent;
   ExponentEntry.Ty =
       FPExponent.getValueType().getTypeForEVT(*DAG.getContext());
   ExponentEntry.IsZExt = true;
   Args.push_back(ExponentEntry);
 
   Type *LCRTy = Base.getValueType().getTypeForEVT(*DAG.getContext());
 
   // The in-chain to the call is the entry node.  If we are emitting a
   // tail call, the chain is replaced by the one isInTailCallPosition
   // computes, which may differ from the entry node.
   SDValue InChain = DAG.getEntryNode();
   SDValue TCChain = InChain;
 
   const Function &F = DAG.getMachineFunction().getFunction();
   const bool IsTC = TLI.isInTailCallPosition(DAG, Op.getNode(), TCChain) &&
                     F.getReturnType() == LCRTy;
   if (IsTC)
     InChain = TCChain;
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl)
       .setChain(InChain)
       .setCallee(CallingConv::ARM_AAPCS_VFP, LCRTy, Callee, std::move(Args))
       .setTailCall(IsTC);
   std::pair<SDValue, SDValue> CI = TLI.LowerCallTo(CLI);
 
   // A null output chain means the call was emitted as a tail call; in that
   // case return the DAG root instead of the call's value.
   if (CI.second.getNode())
     return CI.first;
   return DAG.getRoot();
 }
 
 /// Entry point for custom lowering of a single DAG node.
 ///
 /// Dispatches on Op.getOpcode() to the matching Lower*/Expand* routine and
 /// returns whatever that routine returns.  Opcodes that are not listed in
 /// the switch hit llvm_unreachable.
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Don't know how to custom lower this!");
   case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
   case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
   case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
   case ISD::SELECT:        return LowerSELECT(Op, DAG);
   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
   case ISD::BRCOND:        return LowerBRCOND(Op, DAG);
   case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
   case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
   case ISD::VASTART:       return LowerVASTART(Op, DAG);
   case ISD::ATOMIC_FENCE:  return LowerATOMIC_FENCE(Op, DAG, Subtarget);
   case ISD::PREFETCH:      return LowerPREFETCH(Op, DAG, Subtarget);
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
   case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
   case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
   case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
   case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
   case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
   case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
                                                                Subtarget);
   case ISD::BITCAST:       return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
   case ISD::SHL:
   case ISD::SRL:
   case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
   case ISD::SREM:          return LowerREM(Op.getNode(), DAG);
   case ISD::UREM:          return LowerREM(Op.getNode(), DAG);
   case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
   case ISD::SRL_PARTS:
   case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
   case ISD::CTTZ:
   case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
   case ISD::CTPOP:         return LowerCTPOP(Op.getNode(), DAG, Subtarget);
   case ISD::SETCC:         return LowerVSETCC(Op, DAG);
   case ISD::SETCCCARRY:    return LowerSETCCCARRY(Op, DAG);
   case ISD::ConstantFP:    return LowerConstantFP(Op, DAG, Subtarget);
   case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG, Subtarget);
   case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::MUL:           return LowerMUL(Op, DAG);
   // Scalar divisions on Windows go through the runtime-helper path; every
   // other custom-lowered division uses the vector lowering.
   case ISD::SDIV:
     if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
       return LowerDIV_Windows(Op, DAG, /* Signed */ true);
     return LowerSDIV(Op, DAG);
   case ISD::UDIV:
     if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
       return LowerDIV_Windows(Op, DAG, /* Signed */ false);
     return LowerUDIV(Op, DAG);
   case ISD::ADDCARRY:
   case ISD::SUBCARRY:      return LowerADDSUBCARRY(Op, DAG);
   case ISD::SADDO:
   case ISD::SSUBO:
     return LowerSignedALUO(Op, DAG);
   case ISD::UADDO:
   case ISD::USUBO:
     return LowerUnsignedALUO(Op, DAG);
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
   case ISD::SDIVREM:
   case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
   // DYNAMIC_STACKALLOC is only custom-lowered for Windows targets; reaching
   // this case on any other target is treated as unreachable.
   case ISD::DYNAMIC_STACKALLOC:
     if (Subtarget->isTargetWindows())
       return LowerDYNAMIC_STACKALLOC(Op, DAG);
     llvm_unreachable("Don't know how to custom lower this!");
   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
   case ISD::FPOWI: return LowerFPOWI(Op, *Subtarget, DAG);
   // WIN__DBZCHK yields an empty SDValue here.
   case ARMISD::WIN__DBZCHK: return SDValue();
   }
 }
 
 /// Replace the results of one of the arm_smlald / arm_smlaldx / arm_smlsld /
 /// arm_smlsldx intrinsics with the matching ARMISD node.  Operand 3 of \p N
 /// is split into two i32 elements that are passed after operands 1 and 2;
 /// the node's two i32 results are appended to \p Results.  Any other
 /// intrinsic leaves \p Results untouched.
 static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG) {
   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
 
   // Map the intrinsic onto its target node; bail out on anything else.
   unsigned Opc = 0;
   switch (IntNo) {
   case Intrinsic::arm_smlald:
     Opc = ARMISD::SMLALD;
     break;
   case Intrinsic::arm_smlaldx:
     Opc = ARMISD::SMLALDX;
     break;
   case Intrinsic::arm_smlsld:
     Opc = ARMISD::SMLSLD;
     break;
   case Intrinsic::arm_smlsldx:
     Opc = ARMISD::SMLSLDX;
     break;
   default:
     return;
   }
 
   SDLoc dl(N);
   SDValue Wide = N->getOperand(3);
 
   // Element 0 and element 1 of the wide operand.
   SDValue Elt0 = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Wide,
                              DAG.getConstant(0, dl, MVT::i32));
   SDValue Elt1 = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Wide,
                              DAG.getConstant(1, dl, MVT::i32));
 
   SDVTList PairVTs = DAG.getVTList(MVT::i32, MVT::i32);
   SDValue LongMul = DAG.getNode(Opc, dl, PairVTs, N->getOperand(1),
                                 N->getOperand(2), Elt0, Elt1);
 
   Results.push_back(LongMul.getValue(0));
   Results.push_back(LongMul.getValue(1));
 }
 
 /// ReplaceNodeResults - Custom-expand node \p N, whose result type is
 /// illegal, by appending replacement values to \p Results.  Opcodes that
 /// are not handled here hit llvm_unreachable.
 void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue> &Results,
                                            SelectionDAG &DAG) const {
   // Record a single replacement value; an empty SDValue is dropped.
   auto PushIfSet = [&Results](SDValue V) {
     if (V.getNode())
       Results.push_back(V);
   };
 
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Don't know how to custom expand this!");
 
   case ISD::READ_REGISTER:
     ExpandREAD_REGISTER(N, Results, DAG);
     return;
 
   case ISD::BITCAST:
     PushIfSet(ExpandBITCAST(N, DAG, Subtarget));
     return;
 
   case ISD::SRL:
   case ISD::SRA:
     PushIfSet(Expand64BitShift(N, DAG, Subtarget));
     return;
 
   case ISD::SREM:
   case ISD::UREM:
     PushIfSet(LowerREM(N, DAG));
     return;
 
   case ISD::SDIVREM:
   case ISD::UDIVREM: {
     // Both values of the lowered node are reported.
     SDValue Res = LowerDivRem(SDValue(N, 0), DAG);
     assert(Res.getNumOperands() == 2 && "DivRem needs two values");
     Results.push_back(Res.getValue(0));
     Results.push_back(Res.getValue(1));
     return;
   }
 
   case ISD::READCYCLECOUNTER:
     ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
     return;
 
   case ISD::UDIV:
   case ISD::SDIV: {
     assert(Subtarget->isTargetWindows() && "can only expand DIV on Windows");
     const bool IsSigned = N->getOpcode() == ISD::SDIV;
     ExpandDIV_Windows(SDValue(N, 0), DAG, IsSigned, Results);
     return;
   }
 
   case ISD::ATOMIC_CMP_SWAP:
     ReplaceCMP_SWAP_64Results(N, Results, DAG);
     return;
 
   case ISD::INTRINSIC_WO_CHAIN:
     ReplaceLongIntrinsic(N, Results, DAG);
     return;
   }
 }
 
 //===----------------------------------------------------------------------===//
 //                           ARM Scheduler Hooks
 //===----------------------------------------------------------------------===//
 
 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
 /// registers the function context.
 ///
 /// Concretely, the instructions built here load the address of
 /// \p DispatchBB from a constant-pool entry, make it PC-relative, and store
 /// it into the stack object \p FI at immediate offset 36.  All instructions
 /// are inserted into \p MBB before \p MI and reuse \p MI's debug location.
 /// Separate sequences are emitted for Thumb2, Thumb1 and ARM mode.
 void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
                                                MachineBasicBlock *MBB,
                                                MachineBasicBlock *DispatchBB,
                                                int FI) const {
   assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
          "ROPI/RWPI not currently supported with SjLj");
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo *MRI = &MF->getRegInfo();
   MachineConstantPool *MCP = MF->getConstantPool();
   ARMFunctionInfo *AFI = MF->getInfo<ARMFunctionInfo>();
   const Function &F = MF->getFunction();
 
   bool isThumb = Subtarget->isThumb();
   bool isThumb2 = Subtarget->isThumb2();
 
   // Constant-pool entry referring to the dispatch block.  The PC adjustment
   // passed along is 4 in either Thumb mode and 8 in ARM mode.
   unsigned PCLabelId = AFI->createPICLabelUId();
   unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
   ARMConstantPoolValue *CPV =
     ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
   unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
 
   // Thumb code uses the tGPR register class for the temporaries.
   const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
                                            : &ARM::GPRRegClass;
 
   // Grab constant pool and fixed stack memory operands.
   MachineMemOperand *CPMMO =
       MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
                                MachineMemOperand::MOLoad, 4, 4);
 
   MachineMemOperand *FIMMOSt =
       MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
                                MachineMemOperand::MOStore, 4, 4);
 
   // Load the address of the dispatch MBB into the jump buffer.
   // NOTE(review): the pseudo-assembly comments below show an offset of #+4,
   // while the immediate actually emitted in all three variants is 36 --
   // presumably the offset of the jump buffer's pc slot within the function
   // context; confirm against the context layout.
   if (isThumb2) {
     // Incoming value: jbuf
     //   ldr.n  r5, LCPI1_1
     //   orr    r5, r5, #1
     //   add    r5, pc
     //   str    r5, [$jbuf, #+4] ; &jbuf[1]
     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
     BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
         .addConstantPoolIndex(CPI)
         .addMemOperand(CPMMO)
         .add(predOps(ARMCC::AL));
     // Set the low bit because of thumb mode.
     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
     BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
         .addReg(NewVReg1, RegState::Kill)
         .addImm(0x01)
         .add(predOps(ARMCC::AL))
         .add(condCodeOp());
     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
       .addReg(NewVReg2, RegState::Kill)
       .addImm(PCLabelId);
     BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
         .addReg(NewVReg3, RegState::Kill)
         .addFrameIndex(FI)
         .addImm(36) // &jbuf[1] :: pc
         .addMemOperand(FIMMOSt)
         .add(predOps(ARMCC::AL));
   } else if (isThumb) {
     // Incoming value: jbuf
     //   ldr.n  r1, LCPI1_4
     //   add    r1, pc
     //   mov    r2, #1
     //   orrs   r1, r2
     //   add    r2, $jbuf, #+4 ; &jbuf[1]
     //   str    r1, [r2]
     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
     BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
         .addConstantPoolIndex(CPI)
         .addMemOperand(CPMMO)
         .add(predOps(ARMCC::AL));
     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
     BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
       .addReg(NewVReg1, RegState::Kill)
       .addImm(PCLabelId);
     // Set the low bit because of thumb mode.
     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
     BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
         .addReg(ARM::CPSR, RegState::Define)
         .addImm(1)
         .add(predOps(ARMCC::AL));
     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
     BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
         .addReg(ARM::CPSR, RegState::Define)
         .addReg(NewVReg2, RegState::Kill)
         .addReg(NewVReg3, RegState::Kill)
         .add(predOps(ARMCC::AL));
     // The store address is materialized separately (frame index + 36) and
     // the store itself then uses a zero immediate.
     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
     BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
             .addFrameIndex(FI)
             .addImm(36); // &jbuf[1] :: pc
     BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
         .addReg(NewVReg4, RegState::Kill)
         .addReg(NewVReg5, RegState::Kill)
         .addImm(0)
         .addMemOperand(FIMMOSt)
         .add(predOps(ARMCC::AL));
   } else {
     // Incoming value: jbuf
     //   ldr  r1, LCPI1_1
     //   add  r1, pc, r1
     //   str  r1, [$jbuf, #+4] ; &jbuf[1]
     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
     BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
         .addConstantPoolIndex(CPI)
         .addImm(0)
         .addMemOperand(CPMMO)
         .add(predOps(ARMCC::AL));
     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
     BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
         .addReg(NewVReg1, RegState::Kill)
         .addImm(PCLabelId)
         .add(predOps(ARMCC::AL));
     BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
         .addReg(NewVReg2, RegState::Kill)
         .addFrameIndex(FI)
         .addImm(36) // &jbuf[1] :: pc
         .addMemOperand(FIMMOSt)
         .add(predOps(ARMCC::AL));
   }
 }
 
 void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
                                               MachineBasicBlock *MBB) const {
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo *MRI = &MF->getRegInfo();
   MachineFrameInfo &MFI = MF->getFrameInfo();
   int FI = MFI.getFunctionContextIndex();
 
   const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
                                                         : &ARM::GPRnopcRegClass;
 
   // Get a mapping of the call site numbers to all of the landing pads they're
   // associated with.
   DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
   unsigned MaxCSNum = 0;
   for (MachineFunction::iterator BB = MF->begin(), E = MF->end(); BB != E;
        ++BB) {
     if (!BB->isEHPad()) continue;
 
     // FIXME: We should assert that the EH_LABEL is the first MI in the landing
     // pad.
     for (MachineBasicBlock::iterator
            II = BB->begin(), IE = BB->end(); II != IE; ++II) {
       if (!II->isEHLabel()) continue;
 
       MCSymbol *Sym = II->getOperand(0).getMCSymbol();
       if (!MF->hasCallSiteLandingPad(Sym)) continue;
 
       SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
       for (SmallVectorImpl<unsigned>::iterator
              CSI = CallSiteIdxs.begin(), CSE = CallSiteIdxs.end();
            CSI != CSE; ++CSI) {
         CallSiteNumToLPad[*CSI].push_back(&*BB);
         MaxCSNum = std::max(MaxCSNum, *CSI);
       }
       break;
     }
   }
 
   // Get an ordered list of the machine basic blocks for the jump table.
   std::vector<MachineBasicBlock*> LPadList;
   SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
   LPadList.reserve(CallSiteNumToLPad.size());
   for (unsigned I = 1; I <= MaxCSNum; ++I) {
     SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
     for (SmallVectorImpl<MachineBasicBlock*>::iterator
            II = MBBList.begin(), IE = MBBList.end(); II != IE; ++II) {
       LPadList.push_back(*II);
       InvokeBBs.insert((*II)->pred_begin(), (*II)->pred_end());
     }
   }
 
   assert(!LPadList.empty() &&
          "No landing pad destinations for the dispatch jump table!");
 
   // Create the jump table and associated information.
   MachineJumpTableInfo *JTI =
     MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
 
   // Create the MBBs for the dispatch code.
 
   // Shove the dispatch's address into the return slot in the function context.
   MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
   DispatchBB->setIsEHPad();
 
   MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
   unsigned trap_opcode;
   if (Subtarget->isThumb())
     trap_opcode = ARM::tTRAP;
   else
     trap_opcode = Subtarget->useNaClTrap() ? ARM::TRAPNaCl : ARM::TRAP;
 
   BuildMI(TrapBB, dl, TII->get(trap_opcode));
   DispatchBB->addSuccessor(TrapBB);
 
   MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
   DispatchBB->addSuccessor(DispContBB);
 
   // Insert the MBBs.
   MF->insert(MF->end(), DispatchBB);
   MF->insert(MF->end(), DispContBB);
   MF->insert(MF->end(), TrapBB);
 
   // Insert code into the entry block that creates and registers the function
   // context.
   SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
 
   MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
       MachinePointerInfo::getFixedStack(*MF, FI),
       MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
 
   MachineInstrBuilder MIB;
   MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
 
   const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
   const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
 
   // Add a register mask with no preserved registers.  This results in all
   // registers being marked as clobbered. This can't work if the dispatch block
   // is in a Thumb1 function and is linked with ARM code which uses the FP
   // registers, as there is no way to preserve the FP registers in Thumb1 mode.
   MIB.addRegMask(RI.getSjLjDispatchPreservedMask(*MF));
 
   bool IsPositionIndependent = isPositionIndependent();
   unsigned NumLPads = LPadList.size();
   if (Subtarget->isThumb2()) {
     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
     BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
         .addFrameIndex(FI)
         .addImm(4)
         .addMemOperand(FIMMOLd)
         .add(predOps(ARMCC::AL));
 
     if (NumLPads < 256) {
       BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
           .addReg(NewVReg1)
           .addImm(LPadList.size())
           .add(predOps(ARMCC::AL));
     } else {
       unsigned VReg1 = MRI->createVirtualRegister(TRC);
       BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
           .addImm(NumLPads & 0xFFFF)
           .add(predOps(ARMCC::AL));
 
       unsigned VReg2 = VReg1;
       if ((NumLPads & 0xFFFF0000) != 0) {
         VReg2 = MRI->createVirtualRegister(TRC);
         BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
             .addReg(VReg1)
             .addImm(NumLPads >> 16)
             .add(predOps(ARMCC::AL));
       }
 
       BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
           .addReg(NewVReg1)
           .addReg(VReg2)
           .add(predOps(ARMCC::AL));
     }
 
     BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
       .addMBB(TrapBB)
       .addImm(ARMCC::HI)
       .addReg(ARM::CPSR);
 
     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
     BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
         .addJumpTableIndex(MJTI)
         .add(predOps(ARMCC::AL));
 
     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
     BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
         .addReg(NewVReg3, RegState::Kill)
         .addReg(NewVReg1)
         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
         .add(predOps(ARMCC::AL))
         .add(condCodeOp());
 
     BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
       .addReg(NewVReg4, RegState::Kill)
       .addReg(NewVReg1)
       .addJumpTableIndex(MJTI);
   } else if (Subtarget->isThumb()) {
     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
     BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
         .addFrameIndex(FI)
         .addImm(1)
         .addMemOperand(FIMMOLd)
         .add(predOps(ARMCC::AL));
 
     if (NumLPads < 256) {
       BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
           .addReg(NewVReg1)
           .addImm(NumLPads)
           .add(predOps(ARMCC::AL));
     } else {
       MachineConstantPool *ConstantPool = MF->getConstantPool();
       Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
 
       // MachineConstantPool wants an explicit alignment.
       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
       if (Align == 0)
         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
 
       unsigned VReg1 = MRI->createVirtualRegister(TRC);
       BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
           .addReg(VReg1, RegState::Define)
           .addConstantPoolIndex(Idx)
           .add(predOps(ARMCC::AL));
       BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
           .addReg(NewVReg1)
           .addReg(VReg1)
           .add(predOps(ARMCC::AL));
     }
 
     BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
       .addMBB(TrapBB)
       .addImm(ARMCC::HI)
       .addReg(ARM::CPSR);
 
     unsigned NewVReg2 = MRI->createVirtualRegister(TRC);
     BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
         .addReg(ARM::CPSR, RegState::Define)
         .addReg(NewVReg1)
         .addImm(2)
         .add(predOps(ARMCC::AL));
 
     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
     BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
         .addJumpTableIndex(MJTI)
         .add(predOps(ARMCC::AL));
 
     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
     BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
         .addReg(ARM::CPSR, RegState::Define)
         .addReg(NewVReg2, RegState::Kill)
         .addReg(NewVReg3)
         .add(predOps(ARMCC::AL));
 
     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
 
     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
     BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
         .addReg(NewVReg4, RegState::Kill)
         .addImm(0)
         .addMemOperand(JTMMOLd)
         .add(predOps(ARMCC::AL));
 
     unsigned NewVReg6 = NewVReg5;
     if (IsPositionIndependent) {
       NewVReg6 = MRI->createVirtualRegister(TRC);
       BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
           .addReg(ARM::CPSR, RegState::Define)
           .addReg(NewVReg5, RegState::Kill)
           .addReg(NewVReg3)
           .add(predOps(ARMCC::AL));
     }
 
     BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
       .addReg(NewVReg6, RegState::Kill)
       .addJumpTableIndex(MJTI);
   } else {
     unsigned NewVReg1 = MRI->createVirtualRegister(TRC);
     BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
         .addFrameIndex(FI)
         .addImm(4)
         .addMemOperand(FIMMOLd)
         .add(predOps(ARMCC::AL));
 
     if (NumLPads < 256) {
       BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
           .addReg(NewVReg1)
           .addImm(NumLPads)
           .add(predOps(ARMCC::AL));
     } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
       unsigned VReg1 = MRI->createVirtualRegister(TRC);
       BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
           .addImm(NumLPads & 0xFFFF)
           .add(predOps(ARMCC::AL));
 
       unsigned VReg2 = VReg1;
       if ((NumLPads & 0xFFFF0000) != 0) {
         VReg2 = MRI->createVirtualRegister(TRC);
         BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
             .addReg(VReg1)
             .addImm(NumLPads >> 16)
             .add(predOps(ARMCC::AL));
       }
 
       BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
           .addReg(NewVReg1)
           .addReg(VReg2)
           .add(predOps(ARMCC::AL));
     } else {
       MachineConstantPool *ConstantPool = MF->getConstantPool();
       Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
       const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
 
       // MachineConstantPool wants an explicit alignment.
       unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
       if (Align == 0)
         Align = MF->getDataLayout().getTypeAllocSize(C->getType());
       unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
 
       unsigned VReg1 = MRI->createVirtualRegister(TRC);
       BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
           .addReg(VReg1, RegState::Define)
           .addConstantPoolIndex(Idx)
           .addImm(0)
           .add(predOps(ARMCC::AL));
       BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
           .addReg(NewVReg1)
           .addReg(VReg1, RegState::Kill)
           .add(predOps(ARMCC::AL));
     }
 
     BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
       .addMBB(TrapBB)
       .addImm(ARMCC::HI)
       .addReg(ARM::CPSR);
 
     unsigned NewVReg3 = MRI->createVirtualRegister(TRC);
     BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
         .addReg(NewVReg1)
         .addImm(ARM_AM::getSORegOpc(ARM_AM::lsl, 2))
         .add(predOps(ARMCC::AL))
         .add(condCodeOp());
     unsigned NewVReg4 = MRI->createVirtualRegister(TRC);
     BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
         .addJumpTableIndex(MJTI)
         .add(predOps(ARMCC::AL));
 
     MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
         MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
     unsigned NewVReg5 = MRI->createVirtualRegister(TRC);
     BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
         .addReg(NewVReg3, RegState::Kill)
         .addReg(NewVReg4)
         .addImm(0)
         .addMemOperand(JTMMOLd)
         .add(predOps(ARMCC::AL));
 
     if (IsPositionIndependent) {
       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
         .addReg(NewVReg5, RegState::Kill)
         .addReg(NewVReg4)
         .addJumpTableIndex(MJTI);
     } else {
       BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
         .addReg(NewVReg5, RegState::Kill)
         .addJumpTableIndex(MJTI);
     }
   }
 
   // Add the jump table entries as successors to the MBB.
   SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
   for (std::vector<MachineBasicBlock*>::iterator
          I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
     MachineBasicBlock *CurMBB = *I;
     if (SeenMBBs.insert(CurMBB).second)
       DispContBB->addSuccessor(CurMBB);
   }
 
   // N.B. the order the invoke BBs are processed in doesn't matter here.
   const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
   SmallVector<MachineBasicBlock*, 64> MBBLPads;
   for (MachineBasicBlock *BB : InvokeBBs) {
 
     // Remove the landing pad successor from the invoke block and replace it
     // with the new dispatch block.
     SmallVector<MachineBasicBlock*, 4> Successors(BB->succ_begin(),
                                                   BB->succ_end());
     while (!Successors.empty()) {
       MachineBasicBlock *SMBB = Successors.pop_back_val();
       if (SMBB->isEHPad()) {
         BB->removeSuccessor(SMBB);
         MBBLPads.push_back(SMBB);
       }
     }
 
     BB->addSuccessor(DispatchBB, BranchProbability::getZero());
     BB->normalizeSuccProbs();
 
     // Find the invoke call and mark all of the callee-saved registers as
     // 'implicit defined' so that they're spilled. This prevents code from
     // moving instructions to before the EH block, where they will never be
     // executed.
     for (MachineBasicBlock::reverse_iterator
            II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
       if (!II->isCall()) continue;
 
       DenseMap<unsigned, bool> DefRegs;
       for (MachineInstr::mop_iterator
              OI = II->operands_begin(), OE = II->operands_end();
            OI != OE; ++OI) {
         if (!OI->isReg()) continue;
         DefRegs[OI->getReg()] = true;
       }
 
       MachineInstrBuilder MIB(*MF, &*II);
 
       for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
         unsigned Reg = SavedRegs[i];
         if (Subtarget->isThumb2() &&
             !ARM::tGPRRegClass.contains(Reg) &&
             !ARM::hGPRRegClass.contains(Reg))
           continue;
         if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
           continue;
         if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
           continue;
         if (!DefRegs[Reg])
           MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
       }
 
       break;
     }
   }
 
   // Mark all former landing pads as non-landing pads. The dispatch is the only
   // landing pad now.
   for (SmallVectorImpl<MachineBasicBlock*>::iterator
          I = MBBLPads.begin(), E = MBBLPads.end(); I != E; ++I)
     (*I)->setIsEHPad(false);
 
   // The instruction is gone now.
   MI.eraseFromParent();
 }
 
 /// Return the first successor of \p MBB that is not \p Succ.  Running off the
 /// end of the successor list without finding one is treated as unreachable.
 static MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB,
                                     MachineBasicBlock *Succ) {
   for (MachineBasicBlock *Candidate : MBB->successors()) {
     if (Candidate == Succ)
       continue;
     return Candidate;
   }
   llvm_unreachable("Expecting a BB with two successors!");
 }
 
 /// Map a load size in bytes to the load opcode used for it.
 ///
 /// Sizes 16 and 8 select the VLD1 opcodes regardless of the Thumb flags.
 /// Sizes 4, 2 and 1 select the Thumb1 opcode when \p IsThumb1 is set,
 /// otherwise the Thumb2 opcode when \p IsThumb2 is set, otherwise the ARM
 /// opcode.  Every other size yields 0.
 static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
   switch (LdSize) {
   case 16:
     return ARM::VLD1q32wb_fixed;
   case 8:
     return ARM::VLD1d32wb_fixed;
   case 4:
     if (IsThumb1)
       return ARM::tLDRi;
     return IsThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
   case 2:
     if (IsThumb1)
       return ARM::tLDRHi;
     return IsThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
   case 1:
     if (IsThumb1)
       return ARM::tLDRBi;
     return IsThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
   default:
     // No opcode for this size; callers are expected to check for 0.
     return 0;
   }
 }
 
 /// Map a store size in bytes to the store opcode used for it.
 ///
 /// Sizes 16 and 8 select the VST1 opcodes regardless of the Thumb flags.
 /// Sizes 4, 2 and 1 select the Thumb1 opcode when \p IsThumb1 is set,
 /// otherwise the Thumb2 opcode when \p IsThumb2 is set, otherwise the ARM
 /// opcode.  Every other size yields 0.
 static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
   switch (StSize) {
   case 16:
     return ARM::VST1q32wb_fixed;
   case 8:
     return ARM::VST1d32wb_fixed;
   case 4:
     if (IsThumb1)
       return ARM::tSTRi;
     return IsThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
   case 2:
     if (IsThumb1)
       return ARM::tSTRHi;
     return IsThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
   case 1:
     if (IsThumb1)
       return ARM::tSTRBi;
     return IsThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
   default:
     // No opcode for this size; callers are expected to check for 0.
     return 0;
   }
 }
 
 /// Emit a post-increment load of \p LdSize bytes into \p BB before \p Pos.
 ///
 /// \p Data is the register defined by the load, \p AddrIn is the address
 /// register that is read, and \p AddrOut is the register defined with the
 /// updated address.  The opcode comes from getLdOpcode() and must be non-zero.
 static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                        const TargetInstrInfo *TII, const DebugLoc &dl,
                        unsigned LdSize, unsigned Data, unsigned AddrIn,
                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
   unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
   assert(LdOpc != 0 && "Should have a load opcode");
 
   // Every form starts with the load instruction defining Data.
   MachineInstrBuilder Load = BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data);
   const bool IsNeon = LdSize >= 8;
 
   if (!IsNeon && IsThumb1) {
     // Thumb1: a plain load at offset 0, followed by a separate add that
     // produces AddrOut = AddrIn + LdSize.
     Load.addReg(AddrIn).addImm(0).add(predOps(ARMCC::AL));
     BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
         .add(t1CondCodeOp())
         .addReg(AddrIn)
         .addImm(LdSize)
         .add(predOps(ARMCC::AL));
     return;
   }
 
   // The NEON, Thumb2 and ARM forms all define AddrOut and read AddrIn; they
   // differ only in the operands that follow.
   Load.addReg(AddrOut, RegState::Define).addReg(AddrIn);
   if (IsNeon)
     Load.addImm(0);
   else if (IsThumb2)
     Load.addImm(LdSize);
   else // arm
     Load.addReg(0).addImm(LdSize);
   Load.add(predOps(ARMCC::AL));
 }
 
 /// Emit a post-increment store of \p StSize bytes into \p BB before \p Pos.
 ///
 /// \p Data is the register being stored, \p AddrIn is the address register
 /// that is read, and \p AddrOut is the register defined with the updated
 /// address.  The opcode comes from getStOpcode() and must be non-zero.
 static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos,
                        const TargetInstrInfo *TII, const DebugLoc &dl,
                        unsigned StSize, unsigned Data, unsigned AddrIn,
                        unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
   unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
   assert(StOpc != 0 && "Should have a store opcode");
 
   if (StSize >= 8) {
     // NEON form: the address operands come before the data register.
     BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
         .addReg(AddrIn)
         .addImm(0)
         .addReg(Data)
         .add(predOps(ARMCC::AL));
     return;
   }
 
   if (IsThumb1) {
     // Thumb1: a plain store at offset 0, followed by a separate add that
     // produces AddrOut = AddrIn + StSize.
     BuildMI(*BB, Pos, dl, TII->get(StOpc))
         .addReg(Data)
         .addReg(AddrIn)
         .addImm(0)
         .add(predOps(ARMCC::AL));
     BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
         .add(t1CondCodeOp())
         .addReg(AddrIn)
         .addImm(StSize)
         .add(predOps(ARMCC::AL));
     return;
   }
 
   // Thumb2 and ARM forms share everything except the extra zero register
   // operand that the ARM form places ahead of the immediate.
   MachineInstrBuilder Store = BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
                                   .addReg(Data)
                                   .addReg(AddrIn);
   if (!IsThumb2) // arm
     Store.addReg(0);
   Store.addImm(StSize).add(predOps(ARMCC::AL));
 }
 
 /// Expand the struct-byval copy pseudo \p MI into explicit load/store code.
 ///
 /// The pseudo's operands are read as: 0 = destination address register,
 /// 1 = source address register, 2 = size in bytes (immediate) and
 /// 3 = alignment (immediate).  When the size does not exceed
 /// Subtarget->getMaxInlineSizeThreshold() the copy is emitted as a
 /// straight-line sequence in \p BB; otherwise a copy loop (loopMBB) and a tail
 /// block (exitMBB) are created.  \p MI is erased in both cases.
 ///
 /// \returns \p BB for the unrolled expansion, or the newly created exit block
 /// for the loop expansion.
 MachineBasicBlock *
 ARMTargetLowering::EmitStructByval(MachineInstr &MI,
                                    MachineBasicBlock *BB) const {
   // This pseudo instruction's operands are read below as: dst, src, size and
   // alignment.
   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
   // Otherwise, we will generate unrolled scalar copies.
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineFunction::iterator It = ++BB->getIterator();
 
   unsigned dest = MI.getOperand(0).getReg();
   unsigned src = MI.getOperand(1).getReg();
   unsigned SizeVal = MI.getOperand(2).getImm();
   unsigned Align = MI.getOperand(3).getImm();
   DebugLoc dl = MI.getDebugLoc();
 
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   unsigned UnitSize = 0;
   const TargetRegisterClass *TRC = nullptr;
   const TargetRegisterClass *VecTRC = nullptr;
 
   bool IsThumb1 = Subtarget->isThumb1Only();
   bool IsThumb2 = Subtarget->isThumb2();
   bool IsThumb = Subtarget->isThumb();
 
   // Pick the copy unit (in bytes) from the alignment: odd alignment copies
   // byte by byte, alignment with bit 1 set copies halfwords, and anything else
   // copies words unless a NEON unit of 8 or 16 bytes can be used.
   if (Align & 1) {
     UnitSize = 1;
   } else if (Align & 2) {
     UnitSize = 2;
   } else {
     // Check whether we can use NEON instructions.
     if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
         Subtarget->hasNEON()) {
       if ((Align % 16 == 0) && SizeVal >= 16)
         UnitSize = 16;
       else if ((Align % 8 == 0) && SizeVal >= 8)
         UnitSize = 8;
     }
     // Can't use NEON instructions.
     if (UnitSize == 0)
       UnitSize = 4;
   }
 
   // Select the correct opcode and register class for unit size load/store
   bool IsNeon = UnitSize >= 8;
   TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
   if (IsNeon)
     VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
                             : UnitSize == 8 ? &ARM::DPRRegClass
                                             : nullptr;
 
   // LoopSize is the part of the copy done in whole units; BytesLeft is the
   // remainder, which is always copied one byte at a time.
   unsigned BytesLeft = SizeVal % UnitSize;
   unsigned LoopSize = SizeVal - BytesLeft;
 
   if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
     // Use LDR and STR to copy.
     // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
     // [destOut] = STR_POST(scratch, destIn, UnitSize)
     unsigned srcIn = src;
     unsigned destIn = dest;
     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
       unsigned srcOut = MRI.createVirtualRegister(TRC);
       unsigned destOut = MRI.createVirtualRegister(TRC);
       unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
       emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
                  IsThumb1, IsThumb2);
       emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
                  IsThumb1, IsThumb2);
       srcIn = srcOut;
       destIn = destOut;
     }
 
     // Handle the leftover bytes with LDRB and STRB.
     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
     // [destOut] = STRB_POST(scratch, destIn, 1)
     for (unsigned i = 0; i < BytesLeft; i++) {
       unsigned srcOut = MRI.createVirtualRegister(TRC);
       unsigned destOut = MRI.createVirtualRegister(TRC);
       unsigned scratch = MRI.createVirtualRegister(TRC);
       emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
                  IsThumb1, IsThumb2);
       emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
                  IsThumb1, IsThumb2);
       srcIn = srcOut;
       destIn = destOut;
     }
     MI.eraseFromParent(); // The instruction is gone now.
     return BB;
   }
 
   // Expand the pseudo op to a loop.
   // thisMBB:
   //   ...
   //   movw varEnd, # --> with thumb2
   //   movt varEnd, #
   //   ldrcp varEnd, idx --> without thumb2
   //   fallthrough --> loopMBB
   // loopMBB:
   //   PHI varPhi, varEnd, varLoop
   //   PHI srcPhi, src, srcLoop
   //   PHI destPhi, dst, destLoop
   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
   //   subs varLoop, varPhi, #UnitSize
   //   bne loopMBB
   //   fallthrough --> exitMBB
   // exitMBB:
   //   epilogue to handle left-over bytes
   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
   //   [destOut] = STRB_POST(scratch, destLoop, 1)
   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MF->insert(It, loopMBB);
   MF->insert(It, exitMBB);
 
   // Transfer the remainder of BB and its successor edges to exitMBB.
   exitMBB->splice(exitMBB->begin(), BB,
                   std::next(MachineBasicBlock::iterator(MI)), BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
   // Load an immediate to varEnd.
   unsigned varEnd = MRI.createVirtualRegister(TRC);
   if (Subtarget->useMovt(*MF)) {
     // The low 16 bits go into Vtmp; when the high 16 bits of LoopSize are
     // zero, Vtmp is varEnd itself and no second instruction is emitted.
     unsigned Vtmp = varEnd;
     if ((LoopSize & 0xFFFF0000) != 0)
       Vtmp = MRI.createVirtualRegister(TRC);
     BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi16 : ARM::MOVi16), Vtmp)
         .addImm(LoopSize & 0xFFFF)
         .add(predOps(ARMCC::AL));
 
     if ((LoopSize & 0xFFFF0000) != 0)
       BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVTi16 : ARM::MOVTi16), varEnd)
           .addReg(Vtmp)
           .addImm(LoopSize >> 16)
           .add(predOps(ARMCC::AL));
   } else {
     // Without movw/movt, place LoopSize in the constant pool and load it.
     MachineConstantPool *ConstantPool = MF->getConstantPool();
     Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
     const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
 
     // MachineConstantPool wants an explicit alignment.
     // NOTE: this local Align shadows the operand-derived Align declared above.
     unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
     if (Align == 0)
       Align = MF->getDataLayout().getTypeAllocSize(C->getType());
     unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
 
     if (IsThumb)
       BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
           .addReg(varEnd, RegState::Define)
           .addConstantPoolIndex(Idx)
           .add(predOps(ARMCC::AL));
     else
       BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
           .addReg(varEnd, RegState::Define)
           .addConstantPoolIndex(Idx)
           .addImm(0)
           .add(predOps(ARMCC::AL));
   }
   BB->addSuccessor(loopMBB);
 
   // Generate the loop body:
   //   varPhi = PHI(varLoop, varEnd)
   //   srcPhi = PHI(srcLoop, src)
   //   destPhi = PHI(destLoop, dst)
   MachineBasicBlock *entryBB = BB;
   BB = loopMBB;
   unsigned varLoop = MRI.createVirtualRegister(TRC);
   unsigned varPhi = MRI.createVirtualRegister(TRC);
   unsigned srcLoop = MRI.createVirtualRegister(TRC);
   unsigned srcPhi = MRI.createVirtualRegister(TRC);
   unsigned destLoop = MRI.createVirtualRegister(TRC);
   unsigned destPhi = MRI.createVirtualRegister(TRC);
 
   BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
     .addReg(varLoop).addMBB(loopMBB)
     .addReg(varEnd).addMBB(entryBB);
   BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
     .addReg(srcLoop).addMBB(loopMBB)
     .addReg(src).addMBB(entryBB);
   BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
     .addReg(destLoop).addMBB(loopMBB)
     .addReg(dest).addMBB(entryBB);
 
   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
   //   [destLoop] = STR_POST(scratch, destPhi, UnitSize)
   unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
   emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
              IsThumb1, IsThumb2);
   emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
              IsThumb1, IsThumb2);
 
   // Decrement loop variable by UnitSize.
   if (IsThumb1) {
     BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
         .add(t1CondCodeOp())
         .addReg(varPhi)
         .addImm(UnitSize)
         .add(predOps(ARMCC::AL));
   } else {
     MachineInstrBuilder MIB =
         BuildMI(*BB, BB->end(), dl,
                 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
     MIB.addReg(varPhi)
         .addImm(UnitSize)
         .add(predOps(ARMCC::AL))
         .add(condCodeOp());
     // Operand 5 is presumably the operand appended by condCodeOp(); turning it
     // into a CPSR def makes the subtraction set the flags read by the
     // conditional branch below.
     MIB->getOperand(5).setReg(ARM::CPSR);
     MIB->getOperand(5).setIsDef(true);
   }
   // Branch back to the loop head while the NE condition holds.
   BuildMI(*BB, BB->end(), dl,
           TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
       .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
 
   // loopMBB can loop back to loopMBB or fall through to exitMBB.
   BB->addSuccessor(loopMBB);
   BB->addSuccessor(exitMBB);
 
   // Add epilogue to handle BytesLeft.
   BB = exitMBB;
   // The byte copies are inserted ahead of the instructions that were spliced
   // into exitMBB from the original block.
   auto StartOfExit = exitMBB->begin();
 
   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
   //   [destOut] = STRB_POST(scratch, destLoop, 1)
   unsigned srcIn = srcLoop;
   unsigned destIn = destLoop;
   for (unsigned i = 0; i < BytesLeft; i++) {
     unsigned srcOut = MRI.createVirtualRegister(TRC);
     unsigned destOut = MRI.createVirtualRegister(TRC);
     unsigned scratch = MRI.createVirtualRegister(TRC);
     emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
                IsThumb1, IsThumb2);
     emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
                IsThumb1, IsThumb2);
     srcIn = srcOut;
     destIn = destOut;
   }
 
   MI.eraseFromParent(); // The instruction is gone now.
   return BB;
 }
 
 /// Lower the __chkstk pseudo \p MI: emit a call to "__chkstk" (direct, or
 /// through a register for the large code model), then subtract R4 from SP.
 /// The pseudo is erased and \p MBB is returned.
 MachineBasicBlock *
 ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
                                        MachineBasicBlock *MBB) const {
   const TargetMachine &TM = getTargetMachine();
   const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
   DebugLoc DL = MI.getDebugLoc();
 
   assert(Subtarget->isTargetWindows() &&
          "__chkstk is only supported on Windows");
   assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
 
   // __chkstk takes the number of words to allocate on the stack in R4, and
   // returns the stack adjustment in number of bytes in R4.  This will not
   // clobber any other registers (other than the obvious lr).
   //
   // Although, technically, IP should be considered a register which may be
   // clobbered, the call itself will not touch it.  Windows on ARM is a pure
   // thumb-2 environment, so there is no interworking required.  As a result, we
   // do not expect a veneer to be emitted by the linker, clobbering IP.
   //
   // Each module receives its own copy of __chkstk, so no import thunk is
   // required, again, ensuring that IP is not clobbered.
   //
   // Finally, although some linkers may theoretically provide a trampoline for
   // out of range calls (which is quite common due to a 32M range limitation of
   // branches for Thumb), we can generate the long-call version via
   // -mcmodel=large, alleviating the need for the trampoline which may clobber
   // IP.
 
   // Both call forms end with the same implicit register operands: R4 is used
   // (killed) and redefined, while R12 and CPSR are defined dead.
   auto addImplicitCallRegs = [](const MachineInstrBuilder &Call) {
     const unsigned ImpDef = RegState::Implicit | RegState::Define;
     Call.addReg(ARM::R4, RegState::Implicit | RegState::Kill)
         .addReg(ARM::R4, ImpDef)
         .addReg(ARM::R12, ImpDef | RegState::Dead)
         .addReg(ARM::CPSR, ImpDef | RegState::Dead);
   };
 
   switch (TM.getCodeModel()) {
   case CodeModel::Small:
   case CodeModel::Medium:
   case CodeModel::Kernel:
     // Direct call to the symbol.
     addImplicitCallRegs(BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
                             .add(predOps(ARMCC::AL))
                             .addExternalSymbol("__chkstk"));
     break;
   case CodeModel::Large: {
     // Materialize the symbol address in a fresh register and call through it.
     unsigned Reg = MBB->getParent()->getRegInfo().createVirtualRegister(
         &ARM::rGPRRegClass);
 
     BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
         .addExternalSymbol("__chkstk");
     addImplicitCallRegs(BuildMI(*MBB, MI, DL, TII.get(ARM::tBLXr))
                             .add(predOps(ARMCC::AL))
                             .addReg(Reg, RegState::Kill));
     break;
   }
   }
 
   // SP = SP - R4, tagged as frame setup.
   MachineInstrBuilder AdjustSP =
       BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP);
   AdjustSP.addReg(ARM::SP, RegState::Kill)
       .addReg(ARM::R4, RegState::Kill)
       .setMIFlags(MachineInstr::FrameSetup)
       .add(predOps(ARMCC::AL))
       .add(condCodeOp());
 
   MI.eraseFromParent();
   return MBB;
 }
 
 /// Expand the divide-by-zero check pseudo \p MI.
 ///
 /// \p MBB is split after \p MI: the trailing instructions and the successor
 /// edges move to a new continuation block placed right after \p MBB.  A trap
 /// block holding t__brkdiv0 is appended to the function.  In place of the
 /// pseudo, \p MBB compares operand 0 of \p MI against zero and branches to the
 /// trap block on EQ.  Returns the continuation block.
 MachineBasicBlock *
 ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
                                        MachineBasicBlock *MBB) const {
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   MachineFunction *MF = MBB->getParent();
   DebugLoc DL = MI.getDebugLoc();
 
   // Continuation block: receives everything after MI plus MBB's successors.
   MachineBasicBlock *TailBB = MF->CreateMachineBasicBlock();
   MF->insert(std::next(MBB->getIterator()), TailBB);
   MachineBasicBlock::iterator AfterMI =
       std::next(MachineBasicBlock::iterator(MI));
   TailBB->splice(TailBB->begin(), MBB, AfterMI, MBB->end());
   TailBB->transferSuccessorsAndUpdatePHIs(MBB);
   MBB->addSuccessor(TailBB);
 
   // Trap block: a single t__brkdiv0, placed at the end of the function.
   MachineBasicBlock *Div0BB = MF->CreateMachineBasicBlock();
   BuildMI(Div0BB, DL, TII->get(ARM::t__brkdiv0));
   MF->push_back(Div0BB);
   MBB->addSuccessor(Div0BB);
 
   // Compare the checked register with zero ...
   MachineInstrBuilder Cmp = BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8));
   Cmp.addReg(MI.getOperand(0).getReg()).addImm(0).add(predOps(ARMCC::AL));
 
   // ... and branch to the trap block when it is equal.
   MachineInstrBuilder Br = BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc));
   Br.addMBB(Div0BB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
 
   MI.eraseFromParent();
   return TailBB;
 }
 
 /// Expand the pseudo instruction \p MI, dispatching on its opcode.
 ///
 /// \param MI The instruction to expand. Most cases erase it; the Thumb2
 ///           pre-indexed store cases only swap its descriptor, and the
 ///           sjlj cases leave it alone.
 /// \param BB The basic block that currently contains \p MI.
 /// \returns The block the caller should continue with: \p BB for the
 ///          in-place expansions, the newly created sink block for the cases
 ///          that split \p BB (tMOVCCr_pseudo, ABS/t2ABS), or whatever the
 ///          delegated helper returns (struct byval, __chkstk, __dbzchk).
 ///
 /// An opcode without a case below prints \p MI to errs() and hits
 /// llvm_unreachable.
 MachineBasicBlock *
 ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
                                                MachineBasicBlock *BB) const {
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
   switch (MI.getOpcode()) {
   default: {
     // Dump the offending instruction before aborting.
     MI.print(errs());
     llvm_unreachable("Unexpected instr type to insert");
   }
 
   // Thumb1 post-indexed loads are really just single-register LDMs.
   case ARM::tLDR_postidx: {
     // The replacement takes the same operands in a different order:
     // original operands 1, 2, 3, 4 first, then original operand 0 last.
     MachineOperand Def(MI.getOperand(1));
     BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
         .add(Def)  // Rn_wb
         .add(MI.getOperand(2))  // Rn
         .add(MI.getOperand(3))  // PredImm
         .add(MI.getOperand(4))  // PredReg
         .add(MI.getOperand(0)); // Rt
     MI.eraseFromParent();
     return BB;
   }
 
   // The Thumb2 pre-indexed stores have the same MI operands, they just
   // define them differently in the .td files from the isel patterns, so
   // they need pseudos.
   // Only the instruction descriptor is replaced; MI stays in the block.
   case ARM::t2STR_preidx:
     MI.setDesc(TII->get(ARM::t2STR_PRE));
     return BB;
   case ARM::t2STRB_preidx:
     MI.setDesc(TII->get(ARM::t2STRB_PRE));
     return BB;
   case ARM::t2STRH_preidx:
     MI.setDesc(TII->get(ARM::t2STRH_PRE));
     return BB;
 
   case ARM::STRi_preidx:
   case ARM::STRBi_preidx: {
     unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
                                                          : ARM::STRB_PRE_IMM;
     // Decode the offset.
     // Operand 4 is an addrmode2-encoded immediate: split it into its
     // add/sub flag and its magnitude.
     unsigned Offset = MI.getOperand(4).getImm();
     bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
     Offset = ARM_AM::getAM2Offset(Offset);
     // NOTE(review): Offset is unsigned, so this negation wraps modulo 2^32
     // and the wrapped value is what addImm() receives below -- confirm the
     // consumers of this immediate expect that representation.
     if (isSub)
       Offset = -Offset;
 
     // NOTE(review): dereferences the first memory operand without checking
     // that one exists -- presumably these pseudos always carry one; confirm.
     MachineMemOperand *MMO = *MI.memoperands_begin();
     // Original operand 3 is not copied; the decoded immediate is emitted in
     // the position after Rn instead.
     BuildMI(*BB, MI, dl, TII->get(NewOpc))
         .add(MI.getOperand(0)) // Rn_wb
         .add(MI.getOperand(1)) // Rt
         .add(MI.getOperand(2)) // Rn
         .addImm(Offset)        // offset (skip GPR==zero_reg)
         .add(MI.getOperand(5)) // pred
         .add(MI.getOperand(6))
         .addMemOperand(MMO);
     MI.eraseFromParent();
     return BB;
   }
   case ARM::STRr_preidx:
   case ARM::STRBr_preidx:
   case ARM::STRH_preidx: {
     // Map the pseudo onto its real opcode, then clone every operand of MI
     // onto the replacement in the same order.
     unsigned NewOpc;
     switch (MI.getOpcode()) {
     default: llvm_unreachable("unexpected opcode!");
     case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
     case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
     case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
     }
     MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
     for (unsigned i = 0; i < MI.getNumOperands(); ++i)
       MIB.add(MI.getOperand(i));
     MI.eraseFromParent();
     return BB;
   }
 
   case ARM::tMOVCCr_pseudo: {
     // To "insert" a SELECT_CC instruction, we actually have to insert the
     // diamond control-flow pattern.  The incoming instruction knows the
     // destination vreg to set, the condition code register to branch on, the
     // true/false values to select between, and a branch opcode to use.
     const BasicBlock *LLVM_BB = BB->getBasicBlock();
     MachineFunction::iterator It = ++BB->getIterator();
 
     //  thisMBB:
     //  ...
     //   TrueVal = ...
     //   cmpTY ccX, r1, r2
     //   bCC copy1MBB
     //   fallthrough --> copy0MBB
     MachineBasicBlock *thisMBB  = BB;
     MachineFunction *F = BB->getParent();
     MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
     MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
     // Both new blocks go immediately after BB, copy0MBB first.
     F->insert(It, copy0MBB);
     F->insert(It, sinkMBB);
 
     // Transfer the remainder of BB and its successor edges to sinkMBB.
     sinkMBB->splice(sinkMBB->begin(), BB,
                     std::next(MachineBasicBlock::iterator(MI)), BB->end());
     sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
 
     BB->addSuccessor(copy0MBB);
     BB->addSuccessor(sinkMBB);
 
     // Conditional branch to sinkMBB, built from MI's operand 3 (condition
     // immediate) and operand 4 (register).
     BuildMI(BB, dl, TII->get(ARM::tBcc))
         .addMBB(sinkMBB)
         .addImm(MI.getOperand(3).getImm())
         .addReg(MI.getOperand(4).getReg());
 
     //  copy0MBB:
     //   %FalseValue = ...
     //   # fallthrough to sinkMBB
     BB = copy0MBB;
 
     // Update machine-CFG edges
     BB->addSuccessor(sinkMBB);
 
     //  sinkMBB:
     //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
     //  ...
     BB = sinkMBB;
     // MI operand 0 is the result, operand 1 the value arriving from copy0MBB
     // and operand 2 the value arriving from thisMBB.
     BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
         .addReg(MI.getOperand(1).getReg())
         .addMBB(copy0MBB)
         .addReg(MI.getOperand(2).getReg())
         .addMBB(thisMBB);
 
     MI.eraseFromParent(); // The pseudo instruction is gone now.
     return BB;
   }
 
   case ARM::BCCi64:
   case ARM::BCCZi64: {
     // Operand layout as read below: 0 = condition code, 1/2 = the two LHS
     // registers, then either the target block at 3 (BCCZi64, RHS is zero) or
     // the two RHS registers at 3/4 followed by the target block at 5.
 
     // If there is an unconditional branch to the other successor, remove it.
     BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
 
     // Compare both parts that make up the double comparison separately for
     // equality.
     bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
 
     unsigned LHS1 = MI.getOperand(1).getReg();
     unsigned LHS2 = MI.getOperand(2).getReg();
     // In both arms the first compare gets predOps(ARMCC::AL) while the
     // second gets (ARMCC::EQ, CPSR) in the same position -- presumably so
     // the second compare only executes when the first one compared equal.
     if (RHSisZero) {
       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
           .addReg(LHS1)
           .addImm(0)
           .add(predOps(ARMCC::AL));
       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
         .addReg(LHS2).addImm(0)
         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
     } else {
       unsigned RHS1 = MI.getOperand(3).getReg();
       unsigned RHS2 = MI.getOperand(4).getReg();
       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
           .addReg(LHS1)
           .addReg(RHS1)
           .add(predOps(ARMCC::AL));
       BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
         .addReg(LHS2).addReg(RHS2)
         .addImm(ARMCC::EQ).addReg(ARM::CPSR);
     }
 
     MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
     MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
     // The emitted branch always tests EQ, so an NE request is realised by
     // exchanging the taken and not-taken destinations.
     if (MI.getOperand(0).getImm() == ARMCC::NE)
       std::swap(destMBB, exitMBB);
 
     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
       .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
     // Explicit unconditional branch to the remaining successor.
     if (isThumb2)
       BuildMI(BB, dl, TII->get(ARM::t2B))
           .addMBB(exitMBB)
           .add(predOps(ARMCC::AL));
     else
       BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
 
     MI.eraseFromParent(); // The pseudo instruction is gone now.
     return BB;
   }
 
   // The setjmp pseudos are left untouched here.
   case ARM::Int_eh_sjlj_setjmp:
   case ARM::Int_eh_sjlj_setjmp_nofp:
   case ARM::tInt_eh_sjlj_setjmp:
   case ARM::t2Int_eh_sjlj_setjmp:
   case ARM::t2Int_eh_sjlj_setjmp_nofp:
     return BB;
 
   // The helper's work is done through MI/BB; this case still returns BB.
   case ARM::Int_eh_sjlj_setup_dispatch:
     EmitSjLjDispatchBlock(MI, BB);
     return BB;
 
   case ARM::ABS:
   case ARM::t2ABS: {
     // To insert an ABS instruction, we have to insert the
     // diamond control-flow pattern.  The incoming instruction knows the
     // source vreg to test against 0, the destination vreg to set,
     // the condition code register to branch on, the
     // true/false values to select between, and a branch opcode to use.
     // It transforms
     //     V1 = ABS V0
     // into
     //     V2 = MOVS V0
     //     BCC                      (branch to SinkBB if V0 >= 0)
     //     RSBBB: V3 = RSBri V2, 0  (compute ABS if V2 < 0)
     //     SinkBB: V1 = PHI(V2, V3)
     const BasicBlock *LLVM_BB = BB->getBasicBlock();
     MachineFunction::iterator BBI = ++BB->getIterator();
     MachineFunction *Fn = BB->getParent();
     MachineBasicBlock *RSBBB = Fn->CreateMachineBasicBlock(LLVM_BB);
     MachineBasicBlock *SinkBB  = Fn->CreateMachineBasicBlock(LLVM_BB);
     Fn->insert(BBI, RSBBB);
     Fn->insert(BBI, SinkBB);
 
     unsigned int ABSSrcReg = MI.getOperand(1).getReg();
     unsigned int ABSDstReg = MI.getOperand(0).getReg();
     bool ABSSrcKIll = MI.getOperand(1).isKill();
     // NOTE(review): this shadows the function-level isThumb2, which is
     // initialised from the same Subtarget->isThumb2() call.
     bool isThumb2 = Subtarget->isThumb2();
     MachineRegisterInfo &MRI = Fn->getRegInfo();
     // In Thumb mode S must not be specified if source register is the SP or
     // PC and if destination register is the SP, so restrict register class
     unsigned NewRsbDstReg =
       MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass);
 
     // Transfer the remainder of BB and its successor edges to sinkMBB.
     SinkBB->splice(SinkBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
     SinkBB->transferSuccessorsAndUpdatePHIs(BB);
 
     BB->addSuccessor(RSBBB);
     BB->addSuccessor(SinkBB);
 
     // fall through to SinkMBB
     RSBBB->addSuccessor(SinkBB);
 
     // insert a cmp at the end of BB
     BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
         .addReg(ABSSrcReg)
         .addImm(0)
         .add(predOps(ARMCC::AL));
 
     // insert a bcc with opposite CC to ARMCC::MI at the end of BB
     BuildMI(BB, dl,
       TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc)).addMBB(SinkBB)
       .addImm(ARMCC::getOppositeCondition(ARMCC::MI)).addReg(ARM::CPSR);
 
     // insert rsbri in RSBBB
     // Note: BCC and rsbri will be converted into predicated rsbmi
     // by if-conversion pass
     // The kill flag of the original source operand is carried over.
     BuildMI(*RSBBB, RSBBB->begin(), dl,
             TII->get(isThumb2 ? ARM::t2RSBri : ARM::RSBri), NewRsbDstReg)
         .addReg(ABSSrcReg, ABSSrcKIll ? RegState::Kill : 0)
         .addImm(0)
         .add(predOps(ARMCC::AL))
         .add(condCodeOp());
 
     // insert PHI in SinkBB,
     // reuse ABSDstReg to not change uses of ABS instruction
     BuildMI(*SinkBB, SinkBB->begin(), dl,
       TII->get(ARM::PHI), ABSDstReg)
       .addReg(NewRsbDstReg).addMBB(RSBBB)
       .addReg(ABSSrcReg).addMBB(BB);
 
     // remove ABS instruction
     MI.eraseFromParent();
 
     // return last added BB
     return SinkBB;
   }
   // The remaining pseudos are expanded by dedicated helpers; the byval copy
   // additionally bumps the NumLoopByVals counter first.
   case ARM::COPY_STRUCT_BYVAL_I32:
     ++NumLoopByVals;
     return EmitStructByval(MI, BB);
   case ARM::WIN__CHKSTK:
     return EmitLowered__chkstk(MI, BB);
   case ARM::WIN__DBZCHK:
     return EmitLowered__dbzchk(MI, BB);
   }
 }
 
 /// Attaches vregs to MEMCPY that it will use as scratch registers
 /// when it is expanded into LDM/STM. This is done as a post-isel lowering
 /// instead of as a custom inserter because we need the use list from the SDNode.
 static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
                                     MachineInstr &MI, const SDNode *Node) {
   bool isThumb1 = Subtarget->isThumb1Only();
 
   DebugLoc DL = MI.getDebugLoc();
   MachineFunction *MF = MI.getParent()->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   MachineInstrBuilder MIB(*MF, MI);
 
   // If the new dst/src is unused mark it as dead.
   if (!Node->hasAnyUseOfValue(0)) {
     MI.getOperand(0).setIsDead(true);
   }
   if (!Node->hasAnyUseOfValue(1)) {
     MI.getOperand(1).setIsDead(true);
   }
 
   // The MEMCPY both defines and kills the scratch registers.
   for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
     unsigned TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
                                                          : &ARM::GPRRegClass);
     MIB.addReg(TmpReg, RegState::Define|RegState::Dead);
   }
 }
 
 /// Post-isel fix-up hook for a freshly selected machine instruction.
 ///
 /// Two things happen here:
 ///  - a MEMCPY gets its scratch registers attached, and nothing else;
 ///  - for everything else, an opcode that convertAddSubFlagsOpcode() maps to
 ///    a replacement is switched to it, and an implicit CPSR def found among
 ///    the trailing operands is removed and, when needed, replaced by setting
 ///    the optional cc_out operand to CPSR.
 ///
 /// \param MI   The instruction to adjust in place.
 /// \param Node The SDNode \p MI was selected from; used to query whether its
 ///             results have uses.
 void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
                                                       SDNode *Node) const {
   if (MI.getOpcode() == ARM::MEMCPY) {
     attachMEMCPYScratchRegs(Subtarget, MI, Node);
     return;
   }
 
   const MCInstrDesc *MCID = &MI.getDesc();
   // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
   // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
   // operand is still set to noreg. If needed, set the optional operand's
   // register to CPSR, and remove the redundant implicit def.
   //
   // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
 
   // Rename pseudo opcodes.
   // A result of 0 means there is no replacement opcode for this instruction.
   unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
   // Index of the cc_out operand within MI; assigned on every path below.
   unsigned ccOutIdx;
   if (NewOpc) {
     const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
     MCID = &TII->get(NewOpc);
 
     // NOTE(review): the expected operand count is derived from getSize() of
     // the old descriptor; why that relation holds is not visible from this
     // function -- confirm against the pseudo definitions.
     assert(MCID->getNumOperands() ==
            MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
         && "converted opcode should be the same except for cc_out"
            " (and, on Thumb1, pred)");
 
     MI.setDesc(*MCID);
 
     // Add the optional cc_out operand
     // It is created with register 0, i.e. not yet pointing at CPSR.
     MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
 
     // On Thumb1, move all input operands to the end, then add the predicate
     if (Subtarget->isThumb1Only()) {
       // Each iteration appends a copy of operand 1 and then removes the
       // original, rotating the operand list one step.
       for (unsigned c = MCID->getNumOperands() - 4; c--;) {
         MI.addOperand(MI.getOperand(1));
         MI.RemoveOperand(1);
       }
 
       // Restore the ties
       // For every register use, ask the new descriptor whether that operand
       // index is tied to a def and re-tie accordingly.
       for (unsigned i = MI.getNumOperands(); i--;) {
         const MachineOperand& op = MI.getOperand(i);
         if (op.isReg() && op.isUse()) {
           int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
           if (DefIdx != -1)
             MI.tieOperands(DefIdx, i);
         }
       }
 
       // Append the predicate pair: condition AL plus a register-0 operand.
       MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
       MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
       // On this path operand 1 is treated as cc_out from here on.
       ccOutIdx = 1;
     } else
       ccOutIdx = MCID->getNumOperands() - 1;
   } else
     ccOutIdx = MCID->getNumOperands() - 1;
 
   // Any ARM instruction that sets the 's' bit should specify an optional
   // "cc_out" operand in the last operand position.
   if (!MI.hasOptionalDef() || !MCID->OpInfo[ccOutIdx].isOptionalDef()) {
     assert(!NewOpc && "Optional cc_out operand required");
     return;
   }
   // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
   // since we already have an optional CPSR def.
   // The scan starts past the operands described by the descriptor and stops
   // at the first matching CPSR def.
   bool definesCPSR = false;
   bool deadCPSR = false;
   for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
        ++i) {
     const MachineOperand &MO = MI.getOperand(i);
     if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
       definesCPSR = true;
       if (MO.isDead())
         deadCPSR = true;
       MI.RemoveOperand(i);
       break;
     }
   }
   if (!definesCPSR) {
     assert(!NewOpc && "Optional cc_out operand required");
     return;
   }
   // The dead flag on the removed def must agree with whether result 1 of
   // the SDNode has any use.
   assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
   if (deadCPSR) {
     assert(!MI.getOperand(ccOutIdx).getReg() &&
            "expect uninitialized optional cc_out operand");
     // Thumb1 instructions must have the S bit even if the CPSR is dead.
     if (!Subtarget->isThumb1Only())
       return;
   }
 
   // If this instruction was defined with an optional CPSR def and its dag node
   // had a live implicit CPSR def, then activate the optional CPSR def.
   MachineOperand &MO = MI.getOperand(ccOutIdx);
   MO.setReg(ARM::CPSR);
   MO.setIsDef(true);
 }
 
 //===----------------------------------------------------------------------===//
 //                           ARM Optimization Hooks
 //===----------------------------------------------------------------------===//
 
 // Returns true when N is the constant selected by AllOnes: the all-ones
 // constant if AllOnes is set, the null constant otherwise.
 static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
   if (AllOnes)
     return isAllOnesConstant(N);
   return isNullConstant(N);
 }
 
 // Decide whether N evaluates, depending on a condition, to either the
 // identity constant (0, or all ones when AllOnes is set) or some other value.
 // The recognized shapes, with cc an i1 value, are:
 //
 //   (select cc 0, y)   [AllOnes=0]
 //   (select cc y, 0)   [AllOnes=0]
 //   (zext cc)          [AllOnes=0]
 //   (sext cc)          [AllOnes=0/1]
 //   (select cc -1, y)  [AllOnes=1]
 //   (select cc y, -1)  [AllOnes=1]
 //
 // On success:
 //   CC      receives the condition,
 //   Invert  is true when N is the identity constant for a *false* CC,
 //   OtherOp receives the value N takes in the other case.
 // CC may be overwritten even when the function returns false.
 static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
                                        SDValue &CC, bool &Invert,
                                        SDValue &OtherOp,
                                        SelectionDAG &DAG) {
   const unsigned Opc = N->getOpcode();
 
   if (Opc == ISD::SELECT) {
     CC = N->getOperand(0);
     SDValue TrueArm = N->getOperand(1);
     SDValue FalseArm = N->getOperand(2);
     // The true arm is tested first, so it wins if both arms qualify.
     if (isZeroOrAllOnes(TrueArm, AllOnes)) {
       Invert = false;
       OtherOp = FalseArm;
       return true;
     }
     if (isZeroOrAllOnes(FalseArm, AllOnes)) {
       Invert = true;
       OtherOp = TrueArm;
       return true;
     }
     return false;
   }
 
   const bool IsZExt = Opc == ISD::ZERO_EXTEND;
   if (!IsZExt && Opc != ISD::SIGN_EXTEND)
     return false;
 
   // (zext cc) can never be the all ones value.
   if (IsZExt && AllOnes)
     return false;
 
   // Only an extension of an i1 SETCC is accepted.
   CC = N->getOperand(0);
   if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
     return false;
 
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   Invert = !AllOnes;
   if (AllOnes) {
     // Looking for all ones: N is an sext, and the alternative value is 0.
     OtherOp = DAG.getConstant(0, dl, VT);
   } else if (IsZExt) {
     // Looking for 0 through a zext: the alternative value is 1.
     OtherOp = DAG.getConstant(1, dl, VT);
   } else {
     // Looking for 0 through an sext: the alternative value is all ones.
     OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
                               VT);
   }
   return true;
 }
 
 // Fold a select with an identity-constant arm into the operation using it:
 //
 //   (add (select cc, 0, c), x)  -> (select cc, x, (add, x, c))
 //   (sub x, (select cc, 0, c))  -> (select cc, x, (sub, x, c))
 //   (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))  [AllOnes=1]
 //   (or  (select cc, 0, c), x)  -> (select cc, x, (or, x, c))
 //   (xor (select cc, 0, c), x)  -> (select cc, x, (xor, x, c))
 //
 // Nothing is done unless the select has a constant operand that is null, or
 // all ones when AllOnes is set.
 //
 // Extensions from i1 are handled as well:
 //
 //   (add (zext cc), x) -> (select cc (add x, 1), x)
 //   (add (sext cc), x) -> (select cc (add x, -1), x)
 //
 // These transformations eventually create predicated instructions.
 //
 // @param N       The node to transform.
 // @param Slct    The N operand that is a select.
 // @param OtherOp The other N operand (x above).
 // @param DCI     Context.
 // @param AllOnes Require the select constant to be all ones instead of null.
 // @returns The new node, or SDValue() on failure.
 static
 SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
                             TargetLowering::DAGCombinerInfo &DCI,
                             bool AllOnes = false) {
   SelectionDAG &DAG = DCI.DAG;
 
   SDValue Cond;
   SDValue Alternative;
   bool IdentityOnFalse;
   if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, Cond,
                                   IdentityOnFalse, Alternative, DAG))
     return SDValue();
 
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
 
   // Where Slct is the identity constant the result is just OtherOp; in the
   // other case the operation is applied to OtherOp and the alternative value.
   SDValue Applied = DAG.getNode(N->getOpcode(), dl, VT, OtherOp, Alternative);
 
   // The identity case normally sits on the true side of the new select,
   // unless the helper reported that it belongs to a false condition.
   SDValue OnTrue = IdentityOnFalse ? Applied : OtherOp;
   SDValue OnFalse = IdentityOnFalse ? OtherOp : Applied;
   return DAG.getNode(ISD::SELECT, dl, VT, Cond, OnTrue, OnFalse);
 }
 
 // For a commutative operator N, try combineSelectAndUse with operand 0 in the
 // select role first and operand 1 second. An operand is only considered when
 // its node has a single use. Returns the first successful combine, or
 // SDValue() if neither attempt succeeds.
 static
 SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
                                        TargetLowering::DAGCombinerInfo &DCI) {
   SDValue Operands[2] = {N->getOperand(0), N->getOperand(1)};
   for (unsigned Idx = 0; Idx != 2; ++Idx) {
     SDValue Candidate = Operands[Idx];
     if (!Candidate.getNode()->hasOneUse())
       continue;
     if (SDValue Result =
             combineSelectAndUse(N, Candidate, Operands[1 - Idx], DCI, AllOnes))
       return Result;
   }
   return SDValue();
 }
 
 // Returns true if N is a VUZP shuffle node. A VTRN whose first result is
 // v2i32 counts as well, because "VUZP" on i32 is an alias for VTRN.
 static bool IsVUZPShuffleNode(SDNode *N) {
   const unsigned Opc = N->getOpcode();
   return Opc == ARMISD::VUZP ||
          (Opc == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32);
 }
 
 // Turn ADD(VUZP.0, VUZP.1), i.e. the sum of two different results of the
 // same unzip node, into an arm_neon_vpadd intrinsic call on the unzip's two
 // inputs. Returns SDValue() when the pattern does not match or the add is not
 // a 64-bit vector add.
 static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
   SDNode *Unzip = N0.getNode();
 
   // Both addends must come from one VUZP node, but must be distinct values
   // of it.
   if (!IsVUZPShuffleNode(Unzip))
     return SDValue();
   if (Unzip != N1.getNode() || N0 == N1)
     return SDValue();
 
   // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
   EVT VT = N->getValueType(0);
   if (!VT.is64BitVector())
     return SDValue();
 
   // Generate vpadd.
   SelectionDAG &DAG = DCI.DAG;
   SDLoc dl(N);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue IntrinsicID = DAG.getConstant(
       Intrinsic::arm_neon_vpadd, dl, TLI.getPointerTy(DAG.getDataLayout()));
 
   SDValue Ops[] = {IntrinsicID, Unzip->getOperand(0), Unzip->getOperand(1)};
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
 }
 
 // Turn ADD(EXT(VUZP.0), EXT(VUZP.1)), where EXT is the same kind of extension
 // (sign or zero) on both sides, into a vpaddl intrinsic on the concatenation
 // of the unzip's two inputs. Returns SDValue() if the pattern does not match.
 static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const ARMSubtarget *Subtarget) {
   // Both operands must be extensions, and of the same flavour.
   const unsigned ExtOpc = N0.getOpcode();
   const bool IsSigned = ExtOpc == ISD::SIGN_EXTEND;
   if (!IsSigned && ExtOpc != ISD::ZERO_EXTEND)
     return SDValue();
   if (N1.getOpcode() != ExtOpc)
     return SDValue();
 
   SDValue Src0 = N0.getOperand(0);
   SDValue Src1 = N1.getOperand(0);
 
   // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1)): the extended values have to be
   // two distinct results of one unzip node.
   SDNode *Unzip = Src0.getNode();
   if (!IsVUZPShuffleNode(Unzip))
     return SDValue();
   if (Unzip != Src1.getNode() || Src0 == Src1)
     return SDValue();
 
   // We only recognize Q register paddl here; this can't be reached until
   // after type legalization.
   if (!Src0.getValueType().is64BitVector())
     return SDValue();
   if (!N0.getValueType().is128BitVector())
     return SDValue();
 
   // Generate vpaddl.
   SelectionDAG &DAG = DCI.DAG;
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
 
   // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
   const unsigned IntrinsicNo =
       IsSigned ? Intrinsic::arm_neon_vpaddls : Intrinsic::arm_neon_vpaddlu;
   SDValue IntrinsicID =
       DAG.getConstant(IntrinsicNo, dl, TLI.getPointerTy(DAG.getDataLayout()));
 
   // The intrinsic's input is the two unzip inputs joined into one vector
   // with twice as many (narrow) elements as the result.
   EVT NarrowEltVT = Src0.getValueType().getVectorElementType();
   EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), NarrowEltVT,
                                   VT.getVectorNumElements() * 2);
   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT,
                                Src0.getOperand(0), Src0.getOperand(1));
 
   SDValue Ops[] = {IntrinsicID, Concat};
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
 }
 
 // FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
 // an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
 // much easier to match.
 //
 // Matches an ADD of two BUILD_VECTORs whose elements are extracted from one
 // common vector, N0 taking lanes 0, 2, 4, ... and N1 taking lanes 1, 3, 5,
 // ..., and replaces it with an arm_neon_vpaddls intrinsic on that vector
 // followed by an any-extend or truncate to the type of the ADD. Returns
 // SDValue() whenever the pattern does not match.
 static SDValue
 AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
   // Only perform optimization if after legalize, and if NEON is available.
   if (DCI.isBeforeLegalize() || !Subtarget->hasNEON())
     return SDValue();
 
   // Both operands are expected to be BUILD_VECTORs.
   if (N0.getOpcode() != ISD::BUILD_VECTOR)
     return SDValue();
   if (N1.getOpcode() != ISD::BUILD_VECTOR)
     return SDValue();
 
   // Check output type since VPADDL operand elements can only be 8, 16, or 32.
   EVT VT = N->getValueType(0);
   if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
     return SDValue();
 
   // The first element of N0 identifies the vector that every
   // EXTRACT_VECTOR_ELT has to read from.
   if (N0->getOperand(0)->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
     return SDValue();
   SDValue Vec = N0->getOperand(0)->getOperand(0);
   SDNode *SrcNode = Vec.getNode();
 
   // Walk the elements pairwise: element i of N0 must be lane 2*i of the
   // source and element i of N1 must be lane 2*i+1, which is exactly the
   // pairwise add pattern.
   unsigned ExpectedLane = 0;
   for (unsigned Idx = 0, NumOps = N0->getNumOperands(); Idx != NumOps; ++Idx) {
     SDValue Elt0 = N0->getOperand(Idx);
     if (Elt0->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
       return SDValue();
     SDValue Elt1 = N1->getOperand(Idx);
     if (Elt1->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
       return SDValue();
 
     // First operand is the vector, verify it is the same one.
     if (SrcNode != Elt0->getOperand(0).getNode() ||
         SrcNode != Elt1->getOperand(0).getNode())
       return SDValue();
 
     // Second operand is the lane, which has to be a constant.
     ConstantSDNode *Lane0 = dyn_cast<ConstantSDNode>(Elt0->getOperand(1));
     ConstantSDNode *Lane1 = dyn_cast<ConstantSDNode>(Elt1->getOperand(1));
     if (!Lane0 || !Lane1)
       return SDValue();
 
     // N0 supplies the even lane of the pair, N1 the odd one.
     if (Lane0->getZExtValue() != ExpectedLane ||
         Lane1->getZExtValue() != ExpectedLane + 1)
       return SDValue();
 
     ExpectedLane += 2;
   }
 
   // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
   // we're using the entire input vector, otherwise there's a size/legality
   // mismatch somewhere.
   EVT VecVT = Vec.getValueType();
   if (ExpectedLane != VecVT.getVectorNumElements() ||
       VecVT.getVectorElementType() == VT.getVectorElementType())
     return SDValue();
 
   // Create VPADDL node.
   SelectionDAG &DAG = DCI.DAG;
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDLoc dl(N);
 
   // Operand list: the intrinsic id followed by the source vector.
   SDValue IntrinsicID = DAG.getConstant(
       Intrinsic::arm_neon_vpaddls, dl, TLI.getPointerTy(DAG.getDataLayout()));
   SDValue Ops[] = {IntrinsicID, Vec};
 
   // The intrinsic yields lanes twice as wide as the input lanes, with as
   // many lanes as the ADD has.
   unsigned NumResultElts = VT.getVectorNumElements();
   EVT InputLaneVT = Vec.getValueType().getVectorElementType();
   MVT WideLaneVT;
   switch (InputLaneVT.getSimpleVT().SimpleTy) {
   case MVT::i8:
     WideLaneVT = MVT::i16;
     break;
   case MVT::i16:
     WideLaneVT = MVT::i32;
     break;
   case MVT::i32:
     WideLaneVT = MVT::i64;
     break;
   default:
     llvm_unreachable("Invalid vector element type for padd optimization.");
   }
   MVT WidenedVT = MVT::getVectorVT(WideLaneVT, NumResultElts);
 
   SDValue PairwiseSum =
       DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, WidenedVT, Ops);
 
   // Bring the intrinsic result to the type of the ADD: any-extend when the
   // ADD type is strictly wider, truncate otherwise.
   unsigned ResizeOpc = ISD::TRUNCATE;
   if (VT.bitsGT(PairwiseSum.getValueType()))
     ResizeOpc = ISD::ANY_EXTEND;
   return DAG.getNode(ResizeOpc, dl, VT, PairwiseSum);
 }
 
 // Returns V itself when it is produced by a UMUL_LOHI or SMUL_LOHI node,
 // and an empty SDValue otherwise.
 static SDValue findMUL_LOHI(SDValue V) {
   const unsigned Opc = V->getOpcode();
   const bool IsMulLoHi = Opc == ISD::UMUL_LOHI || Opc == ISD::SMUL_LOHI;
   return IsMulLoHi ? V : SDValue();
 }
 
 // Try to merge an ADDC/ADDE pair that accumulates a 32-bit MUL of two 16-bit
 // quantities into one SMLALBB/SMLALBT/SMLALTB/SMLALTT node.
 //
 // AddcNode must add the MUL to the low accumulator half and AddeNode must add
 // (sra MUL, 31) to the high accumulator half. On success the uses of both
 // adds are redirected to the new node and AddcNode's value is returned;
 // otherwise SDValue() is returned.
 static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const ARMSubtarget *Subtarget) {
   // Thumb subtargets are checked for DSP, everything else for v5TE.
   const bool HasSMLALxy =
       Subtarget->isThumb() ? Subtarget->hasDSP() : Subtarget->hasV5TEOps();
   if (!HasSMLALxy)
     return SDValue();
 
   // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
   // accumulates the product into a 64-bit value. The 16-bit values will
   // be sign extended somehow or SRA'd into 32-bit values
   // (addc (adde (mul 16bit, 16bit), lo), hi)
 
   // The MUL may be either ADDC operand; the remaining one is the low half.
   SDValue Mul = AddcNode->getOperand(0);
   SDValue Lo = AddcNode->getOperand(1);
   if (Mul.getOpcode() != ISD::MUL) {
     std::swap(Mul, Lo);
     if (Mul.getOpcode() != ISD::MUL)
       return SDValue();
   }
 
   // Likewise the SRA may be either ADDE operand; the other is the high half.
   SDValue SRA = AddeNode->getOperand(0);
   SDValue Hi = AddeNode->getOperand(1);
   if (SRA.getOpcode() != ISD::SRA) {
     std::swap(SRA, Hi);
     if (SRA.getOpcode() != ISD::SRA)
       return SDValue();
   }
 
   // The SRA has to shift that very MUL right by the constant 31.
   ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(SRA.getOperand(1));
   if (!ShiftAmt || ShiftAmt->getZExtValue() != 31)
     return SDValue();
   if (SRA.getOperand(0) != Mul)
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
   SDLoc dl(AddcNode);
 
   // Choose the variant from how each multiplicand is formed: isS16 selects
   // the "B" form and uses the operand directly, isSRA16 selects the "T" form
   // and uses the value being shifted.
   SDValue MulLHS = Mul.getOperand(0);
   SDValue MulRHS = Mul.getOperand(1);
   unsigned Opcode = 0;
   SDValue Op0;
   SDValue Op1;
   if (isS16(MulLHS, DAG) && isS16(MulRHS, DAG)) {
     Opcode = ARMISD::SMLALBB;
     Op0 = MulLHS;
     Op1 = MulRHS;
   } else if (isS16(MulLHS, DAG) && isSRA16(MulRHS)) {
     Opcode = ARMISD::SMLALBT;
     Op0 = MulLHS;
     Op1 = MulRHS.getOperand(0);
   } else if (isSRA16(MulLHS) && isS16(MulRHS, DAG)) {
     Opcode = ARMISD::SMLALTB;
     Op0 = MulLHS.getOperand(0);
     Op1 = MulRHS;
   } else if (isSRA16(MulLHS) && isSRA16(MulRHS)) {
     Opcode = ARMISD::SMLALTT;
     Op0 = MulLHS.getOperand(0);
     Op1 = MulRHS.getOperand(0);
   }
 
   // No variant matched.
   if (!Op0 || !Op1)
     return SDValue();
 
   SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
                               Op0, Op1, Lo, Hi);
 
   // Redirect the users of the two adds to the new node: result 0 replaces
   // the ADDC value, result 1 replaces the ADDE value.
   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SMLAL.getValue(0));
   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SMLAL.getValue(1));
 
   // Return original node to notify the driver to stop replacing.
   return SDValue(AddcNode, 0);
 }
 
 static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
   // Look for multiply add opportunities.
   // The pattern is a ISD::UMUL_LOHI followed by two add nodes, where
   // each add nodes consumes a value from ISD::UMUL_LOHI and there is
   // a glue link from the first add to the second add.
   // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
   // a S/UMLAL instruction.
   //                  UMUL_LOHI
   //                 / :lo    \ :hi
   //                V          \          [no multiline comment]
   //    loAdd ->  ADDC         |
   //                 \ :carry /
   //                  V      V
   //                    ADDE   <- hiAdd
   //
   // In the special case where only the higher part of a signed result is used
   // and the add to the low part of the result of ISD::UMUL_LOHI adds or subtracts
   // a constant with the exact value of 0x80000000, we recognize we are dealing
   // with a "rounded multiply and add" (or subtract) and transform it into
   // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
 
   assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
           AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
          "Expect an ADDE or SUBE");
 
   assert(AddeSubeNode->getNumOperands() == 3 &&
          AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
          "ADDE node has the wrong inputs");
 
   // Check that we are chained to the right ADDC or SUBC node.
   SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
   if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
        AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
       (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
        AddcSubcNode->getOpcode() != ARMISD::SUBC))
     return SDValue();
 
   SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
   SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
 
   // Check if the two operands are from the same mul_lohi node.
   if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
     return SDValue();
 
   assert(AddcSubcNode->getNumValues() == 2 &&
          AddcSubcNode->getValueType(0) == MVT::i32 &&
          "Expect ADDC with two result values. First: i32");
 
   // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
   // maybe a SMLAL which multiplies two 16-bit values.
   if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
       AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
       AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
       AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
       AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
     return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
 
   // Check for the triangle shape.
   SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
   SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
 
   // Make sure that the ADDE/SUBE operands are not coming from the same node.
   if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
     return SDValue();
 
   // Find the MUL_LOHI node walking up ADDE/SUBE's operands.
   bool IsLeftOperandMUL = false;
   SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
   if (MULOp == SDValue())
     MULOp = findMUL_LOHI(AddeSubeOp1);
   else
     IsLeftOperandMUL = true;
   if (MULOp == SDValue())
     return SDValue();
 
   // Figure out the right opcode.
   unsigned Opc = MULOp->getOpcode();
   unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
 
   // Figure out the high and low input values to the MLAL node.
   SDValue *HiAddSub = nullptr;
   SDValue *LoMul = nullptr;
   SDValue *LowAddSub = nullptr;
 
   // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
   if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
     return SDValue();
 
   if (IsLeftOperandMUL)
     HiAddSub = &AddeSubeOp1;
   else
     HiAddSub = &AddeSubeOp0;
 
   // Ensure that LoMul and LowAddSub are taken from correct ISD::SMUL_LOHI node
   // whose low result is fed to the ADDC/SUBC we are checking.
 
   if (AddcSubcOp0 == MULOp.getValue(0)) {
     LoMul = &AddcSubcOp0;
     LowAddSub = &AddcSubcOp1;
   }
   if (AddcSubcOp1 == MULOp.getValue(0)) {
     LoMul = &AddcSubcOp1;
     LowAddSub = &AddcSubcOp0;
   }
 
   if (!LoMul)
     return SDValue();
 
   // If HiAddSub is the same node as ADDC/SUBC or is a predecessor of ADDC/SUBC
   // the replacement below will create a cycle.
   if (AddcSubcNode == HiAddSub->getNode() ||
       AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
     return SDValue();
 
   // Create the merged node.
   SelectionDAG &DAG = DCI.DAG;
 
   // Start building operand list.
   SmallVector<SDValue, 8> Ops;
   Ops.push_back(LoMul->getOperand(0));
   Ops.push_back(LoMul->getOperand(1));
 
   // Check whether we can use SMMLAR, SMMLSR or SMMULR instead.  For this to be
   // the case, we must be doing signed multiplication and only use the higher
   // part of the result of the MLAL, furthermore the LowAddSub must be a constant
   // addition or subtraction with the value of 0x800000.
   if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
       FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
       LowAddSub->getNode()->getOpcode() == ISD::Constant &&
       static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
           0x80000000) {
     Ops.push_back(*HiAddSub);
     if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
       FinalOpc = ARMISD::SMMLSR;
     } else {
       FinalOpc = ARMISD::SMMLAR;
     }
     SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
 
     return SDValue(AddeSubeNode, 0);
   } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
     // SMMLS is generated during instruction selection and the rest of this
     // function can not handle the case where AddcSubcNode is a SUBC.
     return SDValue();
 
   // Finish building the operand list for {U/S}MLAL
   Ops.push_back(*LowAddSub);
   Ops.push_back(*HiAddSub);
 
   SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
 
   // Replace the ADDs' nodes uses by the MLA node's values.
   SDValue HiMLALResult(MLALNode.getNode(), 1);
   DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
 
   SDValue LoMLALResult(MLALNode.getNode(), 0);
   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
 
   // Return original node to notify the driver to stop replacing.
   return SDValue(AddeSubeNode, 0);
 }
 
 /// Try to fold an ARMISD::ADDC/ARMISD::ADDE pair that accumulates onto an
 /// ARMISD::UMLAL into a single ARMISD::UMAAL node.
 ///
 /// \param AddeNode   The ADDE being combined; its operand 2 is expected to be
 ///                   the matching ADDC.
 /// \param DCI        Combiner state; only DCI.DAG is used directly here.
 /// \param Subtarget  Queried for v6 and DSP support.
 /// \returns SDValue(AddeNode, 0) once the uses of both adds have been rewired
 ///          to the new UMAAL, whatever AddCombineTo64bitMLAL returns when the
 ///          work is delegated to it, or an empty SDValue if nothing matched.
 static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const ARMSubtarget *Subtarget) {
   // UMAAL is similar to UMLAL except that it adds two unsigned values.
   // While trying to combine for the other MLAL nodes, first search for the
   // chance to use UMAAL. Check if Addc uses a node which has already
   // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
   // as the addend, and it's handled in PerformUMLALCombine.
 
   // Without both v6 and DSP only the plain MLAL combine is attempted.
   if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
     return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
 
   // Check that we have a glued ADDC node.
   SDNode* AddcNode = AddeNode->getOperand(2).getNode();
   if (AddcNode->getOpcode() != ARMISD::ADDC)
     return SDValue();
 
   // Find the already-converted UMLAL feeding the ADDC (on either side); the
   // other ADDC operand becomes the extra addend. If neither operand is a
   // UMLAL, fall back to the MLAL combine.
   SDNode *UmlalNode = nullptr;
   SDValue AddHi;
   if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
     UmlalNode = AddcNode->getOperand(0).getNode();
     AddHi = AddcNode->getOperand(1);
   } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
     UmlalNode = AddcNode->getOperand(1).getNode();
     AddHi = AddcNode->getOperand(0);
   } else {
     return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
   }
 
   // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
   // the ADDC as well as Zero.
   // Operand 3 of the UMLAL must itself be zero, otherwise bail out.
   if (!isNullConstant(UmlalNode->getOperand(3)))
     return SDValue();
 
   // Accept the ADDE's (zero, UMLAL) operand pair in either order.
   if ((isNullConstant(AddeNode->getOperand(0)) &&
        AddeNode->getOperand(1).getNode() == UmlalNode) ||
       (AddeNode->getOperand(0).getNode() == UmlalNode &&
        isNullConstant(AddeNode->getOperand(1)))) {
     SelectionDAG &DAG = DCI.DAG;
     // UMAAL operands: the UMLAL's first three operands plus the ADDC's other
     // addend.
     SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
                       UmlalNode->getOperand(2), AddHi };
     SDValue UMAAL =  DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
                                  DAG.getVTList(MVT::i32, MVT::i32), Ops);
 
     // Replace the ADDs' nodes uses by the UMAAL node's values.
     // Result 1 of the UMAAL replaces the ADDE, result 0 replaces the ADDC.
     DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
     DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
 
     // Return original node to notify the driver to stop replacing.
     return SDValue(AddeNode, 0);
   }
   return SDValue();
 }
 
 /// Turn a UMLAL whose accumulator operands are an (ADDC, ADDE 0, 0) pair into
 /// a UMAAL that adds the two ADDC inputs directly.
 ///
 /// \returns the new ARMISD::UMAAL node, or an empty SDValue when the subtarget
 ///          lacks v6/DSP or the operand pattern does not match.
 static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *Subtarget) {
   const bool CanUseUMAAL = Subtarget->hasV6Ops() && Subtarget->hasDSP();
   if (!CanUseUMAAL)
     return SDValue();
 
   // Operand 2 must be an ADDC and operand 3 the ADDE chained to it.
   SDNode *LoAdd = N->getOperand(2).getNode();
   SDNode *HiAdd = N->getOperand(3).getNode();
   if (LoAdd->getOpcode() != ARMISD::ADDC)
     return SDValue();
   if (HiAdd->getOpcode() != ARMISD::ADDE)
     return SDValue();
 
   // Both addends of the ADDE must be zero ...
   if (!isNullConstant(HiAdd->getOperand(0)))
     return SDValue();
   if (!isNullConstant(HiAdd->getOperand(1)))
     return SDValue();
 
   // ... and its third operand must come from that very ADDC.
   if (HiAdd->getOperand(2).getNode() != LoAdd)
     return SDValue();
 
   SDValue UmaalOps[] = {N->getOperand(0), N->getOperand(1),
                         LoAdd->getOperand(0), LoAdd->getOperand(1)};
   return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
                      DAG.getVTList(MVT::i32, MVT::i32), UmaalOps);
 }
 
 /// Combines for ARMISD::ADDC / ARMISD::SUBC.
 ///
 ///  * (SUBC (ADDE 0, 0, C), 1) is replaced so that its second result is C.
 ///  * On Thumb1-only subtargets a negative constant right-hand side is negated
 ///    and the opcode flipped (ADDC <-> SUBC).
 ///
 /// \returns the replacement value, or an empty SDValue when nothing applies.
 static SDValue PerformAddcSubcCombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const ARMSubtarget *Subtarget) {
   const unsigned Opc = N->getOpcode();
 
   // (SUBC (ADDE 0, 0, C), 1) -> C
   if (Opc == ARMISD::SUBC) {
     SDValue Minuend = N->getOperand(0);
     SDValue Subtrahend = N->getOperand(1);
     const bool MatchesCarryPattern =
         Minuend->getOpcode() == ARMISD::ADDE &&
         isNullConstant(Minuend->getOperand(0)) &&
         isNullConstant(Minuend->getOperand(1)) && isOneConstant(Subtrahend);
     if (MatchesCarryPattern)
       return DCI.CombineTo(N, SDValue(N, 0), Minuend->getOperand(2));
   }
 
   // The remaining rewrite is only done for Thumb1.
   if (!Subtarget->isThumb1Only())
     return SDValue();
 
   auto *ImmNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!ImmNode)
     return SDValue();
 
   // Only strictly negative immediates whose negation is representable.
   const int32_t Imm = ImmNode->getSExtValue();
   if (Imm >= 0 || Imm <= std::numeric_limits<int>::min())
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
   SDValue NegatedImm = DAG.getConstant(-Imm, DL, MVT::i32);
   const unsigned FlippedOpc =
       (Opc == ARMISD::ADDC) ? ARMISD::SUBC : ARMISD::ADDC;
   return DAG.getNode(FlippedOpc, DL, N->getVTList(), N->getOperand(0),
                      NegatedImm);
 }
 
 /// Combines for ARMISD::ADDE / ARMISD::SUBE.
 ///
 /// On Thumb1-only subtargets a negative constant right-hand side is replaced
 /// by its bitwise complement and the opcode flipped (ADDE <-> SUBE). On other
 /// subtargets, a node whose operand 1 is an ISD::SMUL_LOHI is handed to
 /// AddCombineTo64bitMLAL.
 ///
 /// \returns the replacement value, or an empty SDValue when nothing applies.
 static SDValue PerformAddeSubeCombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const ARMSubtarget *Subtarget) {
   if (!Subtarget->isThumb1Only()) {
     if (N->getOperand(1)->getOpcode() != ISD::SMUL_LOHI)
       return SDValue();
     return AddCombineTo64bitMLAL(N, DCI, Subtarget);
   }
 
   auto *ImmNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!ImmNode)
     return SDValue();
 
   const int64_t Imm = ImmNode->getSExtValue();
   if (Imm >= 0)
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
 
   // With a carry-in the matching form uses the bitwise NOT rather than the
   // negation: the inverted reading of the carry flag already supplies part
   // of the negation.
   SDValue InvertedImm = DAG.getConstant(~Imm, DL, MVT::i32);
 
   const unsigned FlippedOpc =
       (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE : ARMISD::ADDE;
   return DAG.getNode(FlippedOpc, DL, N->getVTList(), N->getOperand(0),
                      InvertedImm, N->getOperand(2));
 }
 
 /// PerformADDECombine - Target-specific DAG combine entry point for
 /// ARMISD::ADDE. Thumb1-only subtargets are routed to
 /// PerformAddeSubeCombine; everything else goes, once legalization has
 /// started, to AddCombineTo64bitUMAAL (ADDC/ADDE with MUL_LOHI -> MLAL, or
 /// ADDC/ADDE with UMLAL -> UMAAL).
 static SDValue PerformADDECombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
   // UMLAL/SMLAL exist only on ARM and Thumb2, so Thumb1 takes its own path.
   const bool IsThumb1 = Subtarget->isThumb1Only();
   if (IsThumb1)
     return PerformAddeSubeCombine(N, DCI, Subtarget);
 
   // The pattern is only available after legalization has begun.
   return DCI.isBeforeLegalize() ? SDValue()
                                 : AddCombineTo64bitUMAAL(N, DCI, Subtarget);
 }
 
 /// PerformADDCombineWithOperands - Helper for PerformADDCombine that tries the
 /// ADD combines for one particular operand order (N0, N1). The caller invokes
 /// it with the natural order first and, if that yields nothing, again with
 /// the operands swapped.
 static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const ARMSubtarget *Subtarget){
   // vpadd first.
   SDValue Folded = AddCombineToVPADD(N, N0, N1, DCI, Subtarget);
   if (Folded.getNode())
     return Folded;
 
   // Then the two vpaddl forms.
   Folded = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget);
   if (Folded.getNode())
     return Folded;
 
   Folded = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI, Subtarget);
   if (Folded.getNode())
     return Folded;
 
   // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
   // Only attempted when N0 has a single use.
   if (N0.getNode()->hasOneUse()) {
     Folded = combineSelectAndUse(N, N0, N1, DCI);
     if (Folded.getNode())
       return Folded;
   }
 
   return SDValue();
 }
 
+// Returns true when the commute-with-shift transform may be applied to N at
+// the given combine level. It is always allowed before type legalization, on
+// Thumb1-only subtargets, and for anything that is not an ISD::SHL.
+bool
+ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
+                                                 CombineLevel Level) const {
+  const bool AlwaysAllowed =
+      Level == BeforeLegalizeTypes ||
+      (Subtarget->isThumb() && Subtarget->isThumb1Only());
+
+  // For SHL after legalization the transform is disabled so that it does not
+  // fight with PerformSHLSimplify.  (Detecting exactly when
+  // PerformSHLSimplify would fire is possible but not really necessary.)
+  return AlwaysAllowed || N->getOpcode() != ISD::SHL;
+}
+
 /// Rewrite (op (shl x, c2), c1 << c2) back into (shl (op x, c1), c2) for
 /// op in {ADD, OR, XOR, AND} when every user of N could absorb the shift and
 /// both constants are small.
 ///
 /// \param N   The ADD/OR/XOR/AND node to inspect.
 /// \param DCI Combiner state; supplies the DAG and the legalization phase.
 /// \param ST  Subtarget; the transform is skipped for Thumb1-only.
 /// \returns the rebuilt SHL node, or an empty SDValue when the pattern does
 ///          not match.
 static SDValue PerformSHLSimplify(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *ST) {
   // Allow the generic combiner to identify potential bswaps.
   if (DCI.isBeforeLegalize())
     return SDValue();
 
   // DAG combiner will fold:
   // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
   // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
   // Other code patterns that can also be modified have the following form:
   // b + ((a << 1) | 510)
   // b + ((a << 1) & 510)
   // b + ((a << 1) ^ 510)
   // b + ((a << 1) + 510)
 
   // Many instructions can perform the shift for free, but it requires both
   // the operands to be registers. If c1 << c2 is too large, a mov immediate
   // instruction will be needed. So, unfold back to the original pattern if:
   // - c1 and c2 are small enough that they don't require mov imms.
   // - the user(s) of the node can perform an shl
 
   // No shifted operands for 16-bit instructions.
   if (ST->isThumb() && ST->isThumb1Only())
     return SDValue();
 
   // Check that all the users could perform the shl themselves.
   for (auto U : N->uses()) {
     switch(U->getOpcode()) {
     default:
       return SDValue();
     case ISD::SUB:
     case ISD::ADD:
     case ISD::AND:
     case ISD::OR:
     case ISD::XOR:
     case ISD::SETCC:
     case ARMISD::CMP:
       // Check that the user isn't already using a constant because there
       // aren't any instructions that support an immediate operand and a
       // shifted operand.
       if (isa<ConstantSDNode>(U->getOperand(0)) ||
           isa<ConstantSDNode>(U->getOperand(1)))
         return SDValue();
 
       // Check that it's not already using a shift.
       if (U->getOperand(0).getOpcode() == ISD::SHL ||
           U->getOperand(1).getOpcode() == ISD::SHL)
         return SDValue();
       break;
     }
   }
 
   // Only ADD/OR/XOR/AND whose first operand is the SHL are handled.
   if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
       N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
     return SDValue();
 
   if (N->getOperand(0).getOpcode() != ISD::SHL)
     return SDValue();
 
   SDValue SHL = N->getOperand(0);
 
   // Both the binop's right-hand side (c1 << c2) and the shift amount (c2)
   // must be constants.
   auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
   auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
   if (!C1ShlC2 || !C2)
     return SDValue();
 
   APInt C2Int = C2->getAPIntValue();
   APInt C1Int = C1ShlC2->getAPIntValue();
 
   // Check that performing a lshr will not lose any information, i.e. that
   // the low c2 bits of the constant are all zero.
   APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
                                      C2Int.getBitWidth() - C2->getZExtValue());
   if ((C1Int & Mask) != C1Int)
     return SDValue();
 
   // Shift the first constant.
   C1Int.lshrInPlace(C2Int);
 
   // The immediates are encoded as an 8-bit value that can be rotated.
   // A value counts as "large" when its significant bits (between the leading
   // and trailing zero runs) span more than 8 bits.
   auto LargeImm = [](const APInt &Imm) {
     unsigned Zeros = Imm.countLeadingZeros() + Imm.countTrailingZeros();
     return Imm.getBitWidth() - Zeros > 8;
   };
 
   if (LargeImm(C1Int) || LargeImm(C2Int))
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
   SDLoc dl(N);
   SDValue X = SHL.getOperand(0);
   // Apply the binop to the unshifted value with the unshifted constant ...
   SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
                               DAG.getConstant(C1Int, dl, MVT::i32));
   // Shift left to compensate for the lshr of C1Int.
   SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
 
   LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
              SHL.dump(); N->dump());
   LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
-
-  DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
-  return SDValue(N, 0);
+  return Res;
 }
 
 
 /// PerformADDCombine - Target-specific DAG combine entry point for ISD::ADD.
 /// Tries the SHL simplification first, then the operand-order-sensitive
 /// combines in natural and finally in commuted operand order.
 static SDValue PerformADDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
   // Needs an immediate operand, so there is no commuted variant of this one.
   SDValue Combined = PerformSHLSimplify(N, DCI, Subtarget);
   if (Combined.getNode())
     return Combined;
 
   // Natural operand order.
   Combined = PerformADDCombineWithOperands(N, LHS, RHS, DCI, Subtarget);
   if (Combined.getNode())
     return Combined;
 
   // Last resort: the same combines with the operands swapped.
   return PerformADDCombineWithOperands(N, RHS, LHS, DCI, Subtarget);
 }
 
 /// PerformSUBCombine - Target-specific DAG combine entry point for ISD::SUB.
 /// Folds (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c)) when the
 /// select operand has a single use.
 static SDValue PerformSUBCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
   SDValue Minuend = N->getOperand(0);
   SDValue Subtrahend = N->getOperand(1);
 
   if (!Subtrahend.getNode()->hasOneUse())
     return SDValue();
 
   SDValue Folded = combineSelectAndUse(N, Subtrahend, Minuend, DCI);
   if (Folded.getNode())
     return Folded;
 
   return SDValue();
 }
 
 /// PerformVMULCombine
 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
 /// special multiplier accumulator forwarding.
 ///   vmul d3, d0, d2
 ///   vmla d3, d1, d2
 /// is faster than
 ///   vadd d3, d0, d1
 ///   vmul d3, d3, d2
 //  However, for (A + B) * (A + B),
 //    vadd d2, d0, d1
 //    vmul d3, d0, d2
 //    vmla d3, d1, d2
 //  is slower than
 //    vadd d2, d0, d1
 //    vmul d3, d2, d2
 //
 //  Returns the distributed ADD/SUB/FADD/FSUB node, or an empty SDValue when
 //  the subtarget has no VMLx forwarding, neither operand is an add/sub, or
 //  both operands are the same value.
 static SDValue PerformVMULCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
   if (!Subtarget->hasVMLxForwarding())
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   // Look for the add/sub on operand 0 first, then on operand 1. After this
   // block N0 is the add/sub and N1 is the other factor.
   unsigned Opcode = N0.getOpcode();
   if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
       Opcode != ISD::FADD && Opcode != ISD::FSUB) {
     Opcode = N1.getOpcode();
     if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
         Opcode != ISD::FADD && Opcode != ISD::FSUB)
       return SDValue();
     std::swap(N0, N1);
   }
 
   // (A + B) * (A + B): distributing would be slower, see above.
   if (N0 == N1)
     return SDValue();
 
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
   SDValue N00 = N0->getOperand(0);
   SDValue N01 = N0->getOperand(1);
   // Rebuild as (A * C) op (B * C), reusing the original add/sub opcode.
   return DAG.getNode(Opcode, DL, VT,
                      DAG.getNode(ISD::MUL, DL, VT, N00, N1),
                      DAG.getNode(ISD::MUL, DL, VT, N01, N1));
 }
 
 /// Target-specific combine for ISD::MUL.
 ///
 /// 64- and 128-bit vector multiplies are forwarded to PerformVMULCombine. An
 /// i32 multiply by a constant of the form +/-(2^N +/- 1) << K is rewritten
 /// into shift/add/sub nodes. Nothing is done on Thumb1-only subtargets, before
 /// legalization, or when called by the legalizer.
 ///
 /// \returns the result of PerformVMULCombine for vector types; otherwise always
 ///          an empty SDValue. A successful scalar rewrite is installed through
 ///          DCI.CombineTo instead of being returned.
 static SDValue PerformMULCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
 
   if (Subtarget->isThumb1Only())
     return SDValue();
 
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();
 
   EVT VT = N->getValueType(0);
   if (VT.is64BitVector() || VT.is128BitVector())
     return PerformVMULCombine(N, DCI, Subtarget);
   if (VT != MVT::i32)
     return SDValue();
 
   // Only multiplication by a constant right-hand side is handled.
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!C)
     return SDValue();
 
   // Split the multiplier into (MulAmt >> ShiftAmt) and a trailing power of
   // two; the latter is re-applied as a final SHL below.
   int64_t MulAmt = C->getSExtValue();
   unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
 
   // Keep the shift amount within 0..31.
   ShiftAmt = ShiftAmt & (32 - 1);
   SDValue V = N->getOperand(0);
   SDLoc DL(N);
 
   SDValue Res;
   MulAmt >>= ShiftAmt;
 
   if (MulAmt >= 0) {
     if (isPowerOf2_32(MulAmt - 1)) {
       // (mul x, 2^N + 1) => (add (shl x, N), x)
       Res = DAG.getNode(ISD::ADD, DL, VT,
                         V,
                         DAG.getNode(ISD::SHL, DL, VT,
                                     V,
                                     DAG.getConstant(Log2_32(MulAmt - 1), DL,
                                                     MVT::i32)));
     } else if (isPowerOf2_32(MulAmt + 1)) {
       // (mul x, 2^N - 1) => (sub (shl x, N), x)
       Res = DAG.getNode(ISD::SUB, DL, VT,
                         DAG.getNode(ISD::SHL, DL, VT,
                                     V,
                                     DAG.getConstant(Log2_32(MulAmt + 1), DL,
                                                     MVT::i32)),
                         V);
     } else
       return SDValue();
   } else {
     // Negative multiplier: match against its absolute value.
     uint64_t MulAmtAbs = -MulAmt;
     if (isPowerOf2_32(MulAmtAbs + 1)) {
       // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
       Res = DAG.getNode(ISD::SUB, DL, VT,
                         V,
                         DAG.getNode(ISD::SHL, DL, VT,
                                     V,
                                     DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
                                                     MVT::i32)));
     } else if (isPowerOf2_32(MulAmtAbs - 1)) {
       // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
       Res = DAG.getNode(ISD::ADD, DL, VT,
                         V,
                         DAG.getNode(ISD::SHL, DL, VT,
                                     V,
                                     DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
                                                     MVT::i32)));
       // Negate the sum by subtracting it from zero.
       Res = DAG.getNode(ISD::SUB, DL, VT,
                         DAG.getConstant(0, DL, MVT::i32), Res);
     } else
       return SDValue();
   }
 
   // Re-apply the power-of-two factor that was stripped from the multiplier.
   if (ShiftAmt != 0)
     Res = DAG.getNode(ISD::SHL, DL, VT,
                       Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
 
   // Do not add new nodes to DAG combiner worklist.
   DCI.CombineTo(N, Res, false);
   return SDValue();
 }
 
 /// Replace an i32 "(and (shl x, c2), c1)" or "(and (srl x, c2), c1)" whose
 /// constant c1 is a (shifted) mask by a pair of shifts, so that c1 does not
 /// have to be materialized.
 ///
 /// \returns the final SRL node of the shift pair, or an empty SDValue when
 ///          the pattern does not match.
 static SDValue CombineANDShift(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const ARMSubtarget *Subtarget) {
   // Allow DAGCombine to pattern-match before we touch the canonical form.
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();
 
   if (N->getValueType(0) != MVT::i32)
     return SDValue();
 
   // The AND mask (c1) must be a constant.
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!N1C)
     return SDValue();
 
   uint32_t C1 = (uint32_t)N1C->getZExtValue();
   // Don't transform uxtb/uxth.
   if (C1 == 255 || C1 == 65535)
     return SDValue();
 
   // The shifted value must feed only this AND.
   SDNode *N0 = N->getOperand(0).getNode();
   if (!N0->hasOneUse())
     return SDValue();
 
   if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
     return SDValue();
 
   bool LeftShift = N0->getOpcode() == ISD::SHL;
 
   // The shift amount (c2) must be a constant in the range 1..31.
   ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
   if (!N01C)
     return SDValue();
 
   uint32_t C2 = (uint32_t)N01C->getZExtValue();
   if (!C2 || C2 >= 32)
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
 
   // We have a pattern of the form "(and (shl x, c2) c1)" or
   // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
   // transform to a pair of shifts, to save materializing c1.
 
   // First pattern: right shift, and c1+1 is a power of two.
   // FIXME: Also check reversed pattern (left shift, and ~c1+1 is a power
   // of two).
   // FIXME: Use demanded bits?
   if (!LeftShift && isMask_32(C1)) {
     // C3 is the number of leading zero bits in the mask.
     uint32_t C3 = countLeadingZeros(C1);
     if (C2 < C3) {
       // (and (srl x, c2), c1) -> (srl (shl x, c3 - c2), c3)
       SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                 DAG.getConstant(C3 - C2, DL, MVT::i32));
       return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                          DAG.getConstant(C3, DL, MVT::i32));
     }
   }
 
   // Second pattern: left shift, and (c1>>c2)+1 is a power of two.
   // FIXME: Also check reversed pattern (right shift, and ~(c1<<c2)+1
   // is a power of two).
   // FIXME: Use demanded bits?
   if (LeftShift && isShiftedMask_32(C1)) {
     uint32_t C3 = countLeadingZeros(C1);
     // The mask must start exactly at bit c2 for the rewrite to be valid.
     if (C2 + C3 < 32 && C1 == ((-1U << (C2 + C3)) >> C3)) {
       // (and (shl x, c2), c1) -> (srl (shl x, c2 + c3), c3)
       SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
                                 DAG.getConstant(C2 + C3, DL, MVT::i32));
       return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
                         DAG.getConstant(C3, DL, MVT::i32));
     }
   }
 
   // FIXME: Transform "(and (shl x, c2) c1)" ->
   // "(shl (and x, c1>>c2), c2)" if "c1 >> c2" is a cheaper immediate than
   // c1.
   return SDValue();
 }
 
 /// Target-specific combine for ISD::AND.
 ///
 /// Tries, in order: an immediate-form VBIC for a constant-splat BUILD_VECTOR
 /// right-hand side; on non-Thumb1 subtargets the select fold and
 /// PerformSHLSimplify; on Thumb1-only subtargets CombineANDShift.
 ///
 /// \returns the replacement value, or an empty SDValue when the result type
 ///          is not legal or no combine applies.
 static SDValue PerformANDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
   // Attempt to use immediate-form VBIC
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
 
   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return SDValue();
 
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
   if (BVN &&
       BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
     if (SplatBitSize <= 64) {
       EVT VbicVT;
       // VBIC clears bits, so the immediate to encode is the complement of
       // the AND mask.
       SDValue Val = isNEONModifiedImm((~SplatBits).getZExtValue(),
                                       SplatUndef.getZExtValue(), SplatBitSize,
                                       DAG, dl, VbicVT, VT.is128BitVector(),
                                       OtherModImm);
       if (Val.getNode()) {
         // Perform the VBIC in VbicVT and bitcast the result back to VT.
         SDValue Input =
           DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
         SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
         return DAG.getNode(ISD::BITCAST, dl, VT, Vbic);
       }
     }
   }
 
   if (!Subtarget->isThumb1Only()) {
     // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
     if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
       return Result;
 
     if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
       return Result;
   }
 
   // Thumb1 only: try to replace the mask by a pair of shifts.
   if (Subtarget->isThumb1Only())
     if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
       return Result;
 
   return SDValue();
 }
 
 // Try combining OR nodes to SMULWB, SMULWT.
 //
 // Matches (or (srl lo, 16), (shl hi, 16)) where lo/hi are results 0 and 1 of
 // one SMUL_LOHI node, one of whose inputs is a 16-bit value (isS16) or an
 // SRA by 16 (isSRA16). On success all uses of the OR are redirected to the
 // new node and SDValue(OR, 0) is returned; otherwise an empty SDValue is
 // returned.
 static SDValue PerformORCombineToSMULWBT(SDNode *OR,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const ARMSubtarget *Subtarget) {
   // Requires v6; in Thumb mode additionally Thumb2 and DSP.
   if (!Subtarget->hasV6Ops() ||
       (Subtarget->isThumb() &&
        (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
     return SDValue();
 
   SDValue SRL = OR->getOperand(0);
   SDValue SHL = OR->getOperand(1);
 
   // The two shifts may appear in either operand order.
   if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
     SRL = OR->getOperand(1);
     SHL = OR->getOperand(0);
   }
   if (!isSRL16(SRL) || !isSHL16(SHL))
     return SDValue();
 
   // The first operands to the shifts need to be the two results from the
   // same smul_lohi node.
   if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
        SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
     return SDValue();
 
   // The SRL must consume result 0 and the SHL result 1 of that node.
   SDNode *SMULLOHI = SRL.getOperand(0).getNode();
   if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
       SHL.getOperand(0) != SDValue(SMULLOHI, 1))
     return SDValue();
 
   // Now we have:
   // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
   // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
   // For SMULWB the 16-bit value will be sign extended somehow.
   // For SMULWT only the SRA is required.
   // Check both sides of SMUL_LOHI
   SDValue OpS16 = SMULLOHI->getOperand(0);
   SDValue OpS32 = SMULLOHI->getOperand(1);
 
   SelectionDAG &DAG = DCI.DAG;
   // If operand 0 is not the 16-bit candidate, try the operands swapped.
   if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
     OpS16 = OpS32;
     OpS32 = SMULLOHI->getOperand(0);
   }
 
   SDLoc dl(OR);
   unsigned Opcode = 0;
   if (isS16(OpS16, DAG))
     Opcode = ARMISD::SMULWB;
   else if (isSRA16(OpS16)) {
     Opcode = ARMISD::SMULWT;
     // Use the SRA's input directly as the 16-bit operand.
     OpS16 = OpS16->getOperand(0);
   }
   else
     return SDValue();
 
   SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
   DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
   return SDValue(OR, 0);
 }
 
 static SDValue PerformORCombineToBFI(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
   // BFI is only available on V6T2+
   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
     return SDValue();
 
   EVT VT = N->getValueType(0);
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
   // 1) or (and A, mask), val => ARMbfi A, val, mask
   //      iff (val & mask) == val
   //
   // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
   //  2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
   //          && mask == ~mask2
   //  2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
   //          && ~mask == mask2
   //  (i.e., copy a bitfield value into another bitfield of the same width)
 
   if (VT != MVT::i32)
     return SDValue();
 
   SDValue N00 = N0.getOperand(0);
 
   // The value and the mask need to be constants so we can verify this is
   // actually a bitfield set. If the mask is 0xffff, we can do better
   // via a movt instruction, so don't use BFI in that case.
   SDValue MaskOp = N0.getOperand(1);
   ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(MaskOp);
   if (!MaskC)
     return SDValue();
   unsigned Mask = MaskC->getZExtValue();
   if (Mask == 0xffff)
     return SDValue();
   SDValue Res;
   // Case (1): or (and A, mask), val => ARMbfi A, val, mask
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N1C) {
     unsigned Val = N1C->getZExtValue();
     if ((Val & ~Mask) != Val)
       return SDValue();
 
     if (ARM::isBitFieldInvertedMask(Mask)) {
       Val >>= countTrailingZeros(~Mask);
 
       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                         DAG.getConstant(Val, DL, MVT::i32),
                         DAG.getConstant(Mask, DL, MVT::i32));
 
       DCI.CombineTo(N, Res, false);
       // Return value from the original node to inform the combiner than N is
       // now dead.
       return SDValue(N, 0);
     }
   } else if (N1.getOpcode() == ISD::AND) {
     // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
     ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
     if (!N11C)
       return SDValue();
     unsigned Mask2 = N11C->getZExtValue();
 
     // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
     // as is to match.
     if (ARM::isBitFieldInvertedMask(Mask) &&
         (Mask == ~Mask2)) {
       // The pack halfword instruction works better for masks that fit it,
       // so use that when it's available.
       if (Subtarget->hasDSP() &&
           (Mask == 0xffff || Mask == 0xffff0000))
         return SDValue();
       // 2a
       unsigned amt = countTrailingZeros(Mask2);
       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                         DAG.getConstant(amt, DL, MVT::i32));
       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
                         DAG.getConstant(Mask, DL, MVT::i32));
       DCI.CombineTo(N, Res, false);
       // Return value from the original node to inform the combiner than N is
       // now dead.
       return SDValue(N, 0);
     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
                (~Mask == Mask2)) {
       // The pack halfword instruction works better for masks that fit it,
       // so use that when it's available.
       if (Subtarget->hasDSP() &&
           (Mask2 == 0xffff || Mask2 == 0xffff0000))
         return SDValue();
       // 2b
       unsigned lsb = countTrailingZeros(Mask);
       Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                         DAG.getConstant(lsb, DL, MVT::i32));
       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
                         DAG.getConstant(Mask2, DL, MVT::i32));
       DCI.CombineTo(N, Res, false);
       // Return value from the original node to inform the combiner than N is
       // now dead.
       return SDValue(N, 0);
     }
   }
 
   if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
       N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
       ARM::isBitFieldInvertedMask(~Mask)) {
     // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
     // where lsb(mask) == #shamt and masked bits of B are known zero.
     SDValue ShAmt = N00.getOperand(1);
     unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
     unsigned LSB = countTrailingZeros(Mask);
     if (ShAmtC != LSB)
       return SDValue();
 
     Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
                       DAG.getConstant(~Mask, DL, MVT::i32));
 
     DCI.CombineTo(N, Res, false);
     // Return value from the original node to inform the combiner that N is
     // now dead.
     return SDValue(N, 0);
   }
 
   return SDValue();
 }
 
 /// PerformORCombine - Target-specific DAG combine transforms for ISD::OR.
 ///
 /// Nothing is attempted when the result type is not legal.  Otherwise the
 /// following rewrites are tried in order and the first success is returned:
 ///   1. With NEON, an OR whose second operand is a constant splat that
 ///      isNEONModifiedImm can encode becomes ARMISD::VORRIMM.
 ///   2. Unless the subtarget is Thumb1-only, combineSelectAndUseCommutative
 ///      and then PerformORCombineToSMULWBT.
 ///   3. With NEON, (or (and B, A), (and C, ~A)) with splat-constant A becomes
 ///      ARMISD::VBSL.  Note that this step gives up on the whole combine when
 ///      operand 0 is not a single-use AND.
 ///   4. PerformORCombineToBFI, when operand 0 is a single-use AND.
 ///   5. PerformSHLSimplify.
 static SDValue PerformORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const ARMSubtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
   BuildVectorSDNode *RHSBuildVec =
       dyn_cast<BuildVectorSDNode>(N->getOperand(1));
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!TLI.isTypeLegal(VT))
     return SDValue();
 
   // Step 1: immediate-form VORR.
   APInt ImmBits, ImmUndef;
   unsigned ImmBitSize;
   bool ImmHasUndefs;
   if (RHSBuildVec && Subtarget->hasNEON() &&
       RHSBuildVec->isConstantSplat(ImmBits, ImmUndef, ImmBitSize,
                                    ImmHasUndefs) &&
       ImmBitSize <= 64) {
     EVT VorrVT;
     SDValue EncodedImm =
         isNEONModifiedImm(ImmBits.getZExtValue(), ImmUndef.getZExtValue(),
                           ImmBitSize, DAG, dl, VorrVT, VT.is128BitVector(),
                           OtherModImm);
     if (EncodedImm.getNode()) {
       // Operate in the type chosen by isNEONModifiedImm, then cast back.
       SDValue Input = DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
       SDValue Vorr =
           DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, EncodedImm);
       return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
     }
   }
 
   // Step 2: scalar folds that are not attempted for Thumb1.
   if (!Subtarget->isThumb1Only()) {
     // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
     if (SDValue Folded = combineSelectAndUseCommutative(N, false, DCI))
       return Folded;
     if (SDValue Folded = PerformORCombineToSMULWBT(N, DCI, Subtarget))
       return Folded;
   }
 
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
   // Step 3: (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a
   // constant.
   if (Subtarget->hasNEON() && RHS.getOpcode() == ISD::AND && VT.isVector() &&
       TLI.isTypeLegal(VT)) {
 
     // Everything below optimizes (or (and X, Y), Z); the AND must have a
     // single user for the rewrite to be profitable.
     if (LHS.getOpcode() != ISD::AND || !LHS.hasOneUse())
       return SDValue();
 
     APInt Undef;
     unsigned BitSize;
     bool AnyUndefs;
 
     APInt LHSMask, RHSMask;
     BuildVectorSDNode *LHSMaskBV =
         dyn_cast<BuildVectorSDNode>(LHS->getOperand(1));
     BuildVectorSDNode *RHSMaskBV =
         dyn_cast<BuildVectorSDNode>(RHS->getOperand(1));
 
     // Both ANDs need a fully-defined constant splat as their second operand.
     const bool LHSIsSplat =
         LHSMaskBV &&
         LHSMaskBV->isConstantSplat(LHSMask, Undef, BitSize, AnyUndefs) &&
         !AnyUndefs;
     const bool BothAreSplats =
         LHSIsSplat && RHSMaskBV &&
         RHSMaskBV->isConstantSplat(RHSMask, Undef, BitSize, AnyUndefs) &&
         !AnyUndefs;
 
     // The two splats must have the same width and be logical inverses of one
     // another, as required by the pattern.
     if (BothAreSplats && LHSMask.getBitWidth() == RHSMask.getBitWidth() &&
         LHSMask == ~RHSMask) {
       // Canonicalize the vector type to make instruction selection simpler.
       EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
       SDValue Select = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
                                    LHS->getOperand(1),
                                    LHS->getOperand(0),
                                    RHS->getOperand(0));
       return DAG.getNode(ISD::BITCAST, dl, VT, Select);
     }
   }
 
   // Step 4: try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
   // reasonable.
   if (LHS.getOpcode() == ISD::AND && LHS.hasOneUse())
     if (SDValue Inserted = PerformORCombineToBFI(N, DCI, Subtarget))
       return Inserted;
 
   // Step 5.
   if (SDValue Simplified = PerformSHLSimplify(N, DCI, Subtarget))
     return Simplified;
 
   return SDValue();
 }
 
 /// PerformXORCombine - Target-specific DAG combine transforms for ISD::XOR.
 ///
 /// Only acts on legal result types and on subtargets that are not
 /// Thumb1-only; in every other case an empty SDValue is returned.
 static SDValue PerformXORCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const ARMSubtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
   EVT ResultVT = N->getValueType(0);
 
   if (!DAG.getTargetLoweringInfo().isTypeLegal(ResultVT) ||
       Subtarget->isThumb1Only())
     return SDValue();
 
   // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
   if (SDValue Folded = combineSelectAndUseCommutative(N, false, DCI))
     return Folded;
 
   if (SDValue Simplified = PerformSHLSimplify(N, DCI, Subtarget))
     return Simplified;
 
   return SDValue();
 }
 
 // ParseBFI - decompose the ARMISD::BFI node N.
 //
 // Returns the "from" value (Rn).  ToMask receives the bits of the destination
 // (Rd) that the BFI writes, i.e. the complement of the node's constant mask
 // operand.  FromMask receives the same number of consecutive bits, describing
 // which bits of the returned value are extracted: they start at bit 0, unless
 // the BFI source is a constant right shift, in which case the shift is looked
 // through and FromMask is moved up by the shift amount.
 static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
   assert(N->getOpcode() == ARMISD::BFI);
 
   ToMask = ~cast<ConstantSDNode>(N->getOperand(2))->getAPIntValue();
   FromMask =
       APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.countPopulation());
 
   SDValue Source = N->getOperand(1);
   const bool IsConstantSRL = Source->getOpcode() == ISD::SRL &&
                              isa<ConstantSDNode>(Source->getOperand(1));
   if (!IsConstantSRL)
     return Source;
 
   // (srl X, #C) feeding the BFI means the inserted bits really are bits
   // [C, C + width) of X, so report X itself with a shifted FromMask.
   APInt ShiftAmt = cast<ConstantSDNode>(Source->getOperand(1))->getAPIntValue();
   assert(ShiftAmt.getLimitedValue() < 32 && "Shift too large!");
   FromMask <<= ShiftAmt.getLimitedValue(31);
   return Source->getOperand(0);
 }
 
 // Given that A and B each contain one contiguous run of set bits, is B's run
 // immediately below A's, so that A | B is again one contiguous run
 // (A | B == A . B)?
 //
 // Neither A nor B may be zero.
 static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
   const unsigned LowestSetBitOfA = A.countTrailingZeros();
   const unsigned HighestSetBitOfB =
       B.getBitWidth() - B.countLeadingZeros() - 1;
   // Unsigned arithmetic: identical to LowestSetBitOfA - 1 == HighestSetBitOfB.
   return LowestSetBitOfA == HighestSetBitOfB + 1;
 }
 
 // FindBFIToCombineWith - N is a BFI.  Walk the chain of BFIs reachable through
 // operand 0 and return the first one that inserts from the same base value as
 // N and whose inserted bits sit directly next to N's, in both the source and
 // the destination.  Returns an empty SDValue when there is no such BFI.
 static SDValue FindBFIToCombineWith(SDNode *N) {
   APInt ToMask, FromMask;
   SDValue From = ParseBFI(N, ToMask, FromMask);
 
   // Destination bits written by N and by every BFI stepped over so far.  BFIs
   // that cannot be merged may be skipped, but a candidate must not write a bit
   // that is already in this set.
   APInt WrittenBits = ToMask;
 
   for (SDValue Cur = N->getOperand(0); Cur.getOpcode() == ARMISD::BFI;
        Cur = Cur.getOperand(0)) {
     APInt CurToMask, CurFromMask;
     SDValue CurFrom = ParseBFI(Cur.getNode(), CurToMask, CurFromMask);
 
     // A BFI with a different base is simply recorded and skipped.
     if (CurFrom == From) {
       // Overlapping writes: going any further would be unsafe.
       if ((CurToMask & WrittenBits).getBoolValue())
         return SDValue();
 
       // Mergeable when the two fields are contiguous, in either order, in
       // both the destination and the source.
       if ((BitsProperlyConcatenate(ToMask, CurToMask) &&
            BitsProperlyConcatenate(FromMask, CurFromMask)) ||
           (BitsProperlyConcatenate(CurToMask, ToMask) &&
            BitsProperlyConcatenate(CurFromMask, FromMask)))
         return Cur;
     }
 
     // Remember what this BFI writes before moving further up the chain.
     WrittenBits |= CurToMask;
   }
 
   return SDValue();
 }
 
 /// PerformBFICombine - Target-specific DAG combine transforms for ARMISD::BFI.
 ///
 /// Two rewrites are attempted:
 ///   * (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) when the AND only
 ///     clears bits the BFI does not read.
 ///   * When the inserted value is not an AND and operand 0 is itself a BFI,
 ///     merge N with a compatible BFI further up the chain (see
 ///     FindBFIToCombineWith) into one wider BFI.
 static SDValue PerformBFICombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
   SDValue Inserted = N->getOperand(1);
 
   if (Inserted.getOpcode() == ISD::AND) {
     ConstantSDNode *AndMaskC = dyn_cast<ConstantSDNode>(Inserted.getOperand(1));
     if (!AndMaskC)
       return SDValue();
 
     // Operand 2 has zeros exactly where the BFI writes; recover the field
     // width from its complement.
     unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
     unsigned FieldLSB = countTrailingZeros(~InvMask);
     unsigned FieldWidth = (32 - countLeadingZeros(~InvMask)) - FieldLSB;
     assert(FieldWidth <
                static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
            "undefined behavior");
 
     // The BFI reads the low FieldWidth bits of its source.  The AND is
     // redundant only when it keeps every one of them.
     unsigned ReadBits = (1u << FieldWidth) - 1;
     unsigned KeptBits = AndMaskC->getZExtValue();
     if ((ReadBits & ~KeptBits) != 0)
       return SDValue();
 
     return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                            N->getOperand(0), Inserted.getOperand(0),
                            N->getOperand(2));
   }
 
   if (N->getOperand(0).getOpcode() != ARMISD::BFI)
     return SDValue();
 
   // We have a BFI of a BFI.  Look up the chain for a BFI that inserts adjacent
   // bits of the same base value; the two can be merged into a single BFI.
   SDValue Partner = FindBFIToCombineWith(N);
   if (Partner == SDValue())
     return SDValue();
 
   APInt ToMaskN, FromMaskN;
   SDValue Base = ParseBFI(N, ToMaskN, FromMaskN);
 
   APInt ToMaskPartner, FromMaskPartner;
   SDValue PartnerBase =
       ParseBFI(Partner.getNode(), ToMaskPartner, FromMaskPartner);
   assert(Base == PartnerBase);
   (void)PartnerBase;
 
   // Unlink the partner first: its users now see the value it inserted into.
   // This must happen before operand 0 of N is read again below.
   DCI.DAG.ReplaceAllUsesWith(Partner, Partner.getOperand(0));
 
   // Then build one BFI covering both fields.
   APInt MergedFromMask = FromMaskN | FromMaskPartner;
   APInt MergedToMask = ToMaskN | ToMaskPartner;
 
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
 
   // If the merged source field does not start at bit 0, shift it down.
   if (!MergedFromMask[0])
     Base = DCI.DAG.getNode(
       ISD::SRL, dl, VT, Base,
       DCI.DAG.getConstant(MergedFromMask.countTrailingZeros(), dl, VT));
   return DCI.DAG.getNode(ARMISD::BFI, dl, VT, N->getOperand(0), Base,
                          DCI.DAG.getConstant(~MergedToMask, dl, VT));
 }
 
 /// PerformVMOVRRDCombine - Target-specific dag combine xforms for
 /// ARMISD::VMOVRRD.
 ///
 /// Two folds are attempted:
 ///   * vmovrrd(vmovdrr x, y) -> x, y, unless Subtarget->isFPOnlySP().
 ///   * vmovrrd(load f64) -> (load i32), (load i32), for a normal,
 ///     non-volatile, single-use f64 load whose address is a frame index.
 ///
 /// Returns the result of DCI.CombineTo on success and an empty SDValue when
 /// neither pattern matches.
 static SDValue PerformVMOVRRDCombine(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
   // vmovrrd(vmovdrr x, y) -> x,y
   SDValue InDouble = N->getOperand(0);
   if (InDouble.getOpcode() == ARMISD::VMOVDRR && !Subtarget->isFPOnlySP())
     return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
 
   // vmovrrd(load f64) -> (load i32), (load i32)
   SDNode *InNode = InDouble.getNode();
   if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
       InNode->getValueType(0) == MVT::f64 &&
       InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
       !cast<LoadSDNode>(InNode)->isVolatile()) {
     // TODO: Should this be done for non-FrameIndex operands?
     LoadSDNode *LD = cast<LoadSDNode>(InNode);
 
     SelectionDAG &DAG = DCI.DAG;
     SDLoc DL(LD);
     SDValue BasePtr = LD->getBasePtr();
 
     // First word: same address, pointer info and alignment as the f64 load.
     SDValue NewLD1 =
         DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
                     LD->getAlignment(), LD->getMemOperand()->getFlags());
 
     // Second word: four bytes further on.  Its pointer info must carry the
     // same +4 offset as the address, otherwise the memory operand describes
     // the first word instead of the second.  Base+4 is aligned to the smaller
     // of the original alignment and 4; the alignment must never become 0,
     // which getLoad would treat as "natural alignment of the memory type".
     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                     DAG.getConstant(4, DL, MVT::i32));
     SDValue NewLD2 = DAG.getLoad(
         MVT::i32, DL, NewLD1.getValue(1), OffsetPtr,
         LD->getPointerInfo().getWithOffset(4),
         std::min(4U, LD->getAlignment()), LD->getMemOperand()->getFlags());
 
     // Users of the original load's chain now depend on the last new load.
     DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
     // On big-endian targets the two result words are swapped.
     if (DCI.DAG.getDataLayout().isBigEndian())
       std::swap (NewLD1, NewLD2);
     SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
     return Result;
   }
 
   return SDValue();
 }
 
 /// PerformVMOVDRRCombine - Target-specific dag combine xforms for
 /// ARMISD::VMOVDRR.  This is also used for BUILD_VECTORs with 2 operands.
 ///
 /// Recognizes N = vmovdrr(X:0, X:1) where X = vmovrrd(Y), looking through one
 /// bitcast on each operand, and replaces it with bit_convert(Y).
 static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
   SDValue First = N->getOperand(0);
   if (First.getOpcode() == ISD::BITCAST)
     First = First.getOperand(0);
 
   SDValue Second = N->getOperand(1);
   if (Second.getOpcode() == ISD::BITCAST)
     Second = Second.getOperand(0);
 
   // Both operands must be results of one and the same VMOVRRD node...
   if (First.getOpcode() != ARMISD::VMOVRRD ||
       First.getNode() != Second.getNode())
     return SDValue();
   // ...taken in their original order: result 0 first, result 1 second.
   if (First.getResNo() != 0 || Second.getResNo() != 1)
     return SDValue();
 
   return DAG.getNode(ISD::BITCAST, SDLoc(N),
                      N->getValueType(0), First.getOperand(0));
 }
 
 /// hasNormalLoadOperand - Return true if at least one operand of the
 /// BUILD_VECTOR node N is a normal, non-volatile load.  When that is the case
 /// it is profitable to bitcast an i64 vector to have f64 elements, since the
 /// value can then be loaded directly into a VFP register.
 static bool hasNormalLoadOperand(SDNode *N) {
   for (unsigned Idx = 0, End = N->getValueType(0).getVectorNumElements();
        Idx != End; ++Idx) {
     SDNode *Operand = N->getOperand(Idx).getNode();
     if (!ISD::isNormalLoad(Operand))
       continue;
     if (!cast<LoadSDNode>(Operand)->isVolatile())
       return true;
   }
   return false;
 }
 
 /// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
 /// ISD::BUILD_VECTOR.
 ///
 /// First tries the VMOVDRR fold on two-operand vectors.  Failing that, a
 /// vector of i64 elements with at least one normal load operand is rebuilt
 /// as a vector of f64 elements and bitcast back to the original type.
 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI,
                                           const ARMSubtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
 
   // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
   // VMOVRRD is introduced when legalizing i64 types.  It forces the i64 value
   // into a pair of GPRs, which is fine when the value is used as a scalar,
   // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
   if (N->getNumOperands() == 2) {
     SDValue Undone = PerformVMOVDRRCombine(N, DAG);
     if (Undone)
       return Undone;
   }
 
   // Load i64 elements as f64 values so that type legalization does not split
   // them up into i32 values.
   EVT VT = N->getValueType(0);
   if (VT.getVectorElementType() != MVT::i64)
     return SDValue();
   if (!hasNormalLoadOperand(N))
     return SDValue();
 
   SDLoc dl(N);
   const unsigned NumElts = VT.getVectorNumElements();
   SmallVector<SDValue, 8> F64Elts;
   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
     F64Elts.push_back(
         DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(Idx)));
     // Make the DAGCombiner fold the bitcast.
     DCI.AddToWorklist(F64Elts.back().getNode());
   }
 
   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
   SDValue FloatVec = DAG.getBuildVector(FloatVT, dl, F64Elts);
   return DAG.getNode(ISD::BITCAST, dl, VT, FloatVec);
 }
 
 /// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
 ///
 /// ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR, and
 /// at that time bitcasts from integer to float may have been inserted.  If
 /// those bitcasts survived DAGCombine, the vector is rebuilt as a chain of
 /// i32 INSERT_VECTOR_ELT nodes so that floating point types are not forced.
 static SDValue
 PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   // The type of the vector can be changed iff:
   // 1. The vector is only used in a bitcast to an integer type, i.e.
   //    1.1. the vector has a single use, and
   //    1.2. that use is a bit convert to an integer type.
   // 2. Its operands are 32 bits wide (64-bit operands are not legal).
   EVT VT = N->getValueType(0);
   EVT EltVT = VT.getVectorElementType();
 
   // Conditions 2 and 1.1.
   if (EltVT.getSizeInBits() != 32)
     return SDValue();
   if (!N->hasOneUse())
     return SDValue();
 
   // By construction, the input type must be float.
   assert(EltVT == MVT::f32 && "Unexpected type!");
 
   // Condition 1.2.
   SDNode *OnlyUser = *N->use_begin();
   if (OnlyUser->getOpcode() != ISD::BITCAST ||
       OnlyUser->getValueType(0).isFloatingPoint())
     return SDValue();
 
   // Profitability: convert only if more than half of the relevant operands
   // are bitcasts from i32.  Undef and constant operands are cast statically
   // (at compile time) and therefore are not counted as relevant.
   const unsigned NumElts = VT.getVectorNumElements();
   unsigned NumI32Bitcasts = 0;
   unsigned NumRelevant = NumElts;
   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
     SDValue Elt = N->getOperand(Idx);
     if (Elt->getOpcode() == ISD::BITCAST) {
       // Assume only bitcasts from i32 will go away.
       if (Elt->getOperand(0).getValueType() == MVT::i32)
         ++NumI32Bitcasts;
       continue;
     }
     if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
       --NumRelevant;
   }
   if (NumI32Bitcasts <= NumRelevant / 2)
     return SDValue();
 
   // The integer vector type to switch to must itself be legal.
   SelectionDAG &DAG = DCI.DAG;
   EVT IntVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!TLI.isTypeLegal(IntVecVT))
     return SDValue();
 
   // Combine:
   // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
   // => BITCAST INSERT_VECTOR_ELT
   //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
   //                      (BITCAST EN), N.
   SDLoc dl(N);
   SDValue Result = DAG.getUNDEF(IntVecVT);
   for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
     SDValue Lane = N->getOperand(Idx);
     // Undef lanes keep the undef already present in the result vector.
     if (Lane.isUndef())
       continue;
 
     const bool IsCastFromI32 =
         Lane.getOpcode() == ISD::BITCAST &&
         Lane->getOperand(0).getValueType() == MVT::i32;
     if (!IsCastFromI32) {
       Lane = DAG.getNode(ISD::BITCAST, SDLoc(Lane), MVT::i32, Lane);
       // Make the DAGCombiner fold the bitcasts.
       DCI.AddToWorklist(Lane.getNode());
     } else {
       // Fold the obvious case: use the i32 value directly.
       Lane = Lane.getOperand(0);
     }
 
     SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
     Result =
         DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IntVecVT, Result, Lane, LaneIdx);
   }
 
   Result = DAG.getNode(ISD::BITCAST, dl, VT, Result);
   // Make the DAGCombiner fold the bitcasts.
   DCI.AddToWorklist(Result.getNode());
   return Result;
 }
 
 /// PerformInsertEltCombine - Target-specific dag combine xforms for
 /// ISD::INSERT_VECTOR_ELT.
 ///
 /// When a normal, non-volatile i64 load is inserted into a vector of i64
 /// elements, perform the insertion on f64 values instead.  Otherwise, the
 /// i64 value will be legalized to a pair of i32 values.
 static SDValue PerformInsertEltCombine(SDNode *N,
                                        TargetLowering::DAGCombinerInfo &DCI) {
   EVT VT = N->getValueType(0);
   if (VT.getVectorElementType() != MVT::i64)
     return SDValue();
 
   SDNode *InsertedNode = N->getOperand(1).getNode();
   if (!ISD::isNormalLoad(InsertedNode) ||
       cast<LoadSDNode>(InsertedNode)->isVolatile())
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
   SDLoc dl(N);
   EVT FloatVT =
       EVT::getVectorVT(*DAG.getContext(), MVT::f64, VT.getVectorNumElements());
 
   SDValue FloatVec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
   SDValue FloatElt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
   // Make the DAGCombiner fold the bitcasts.
   DCI.AddToWorklist(FloatVec.getNode());
   DCI.AddToWorklist(FloatElt.getNode());
 
   SDValue Inserted = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
                                  FloatVec, FloatElt, N->getOperand(2));
   return DAG.getNode(ISD::BITCAST, dl, VT, Inserted);
 }
 
 /// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
 /// ISD::VECTOR_SHUFFLE.
 ///
 /// The LLVM shufflevector instruction does not require the shuffle mask
 /// length to match the operand vector length, but ISD::VECTOR_SHUFFLE does.
 /// When translating to ISD::VECTOR_SHUFFLE, operands that do not match the
 /// mask length are extended by concatenating them with undef vectors.  That
 /// is probably the right thing for other targets, but for NEON it is better
 /// to concatenate two double-register size vector operands into a single
 /// quad-register size vector:
 ///   shuffle(concat(v1, undef), concat(v2, undef)) ->
 ///   shuffle(concat(v1, v2), undef)
 static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
   // Both inputs must be two-operand concatenations...
   const bool BothConcats = LHS.getOpcode() == ISD::CONCAT_VECTORS &&
                            RHS.getOpcode() == ISD::CONCAT_VECTORS;
   if (!BothConcats || LHS.getNumOperands() != 2 || RHS.getNumOperands() != 2)
     return SDValue();
 
   // ...whose upper halves are undef.
   SDValue LHSUpper = LHS.getOperand(1);
   SDValue RHSUpper = RHS.getOperand(1);
   if (!(LHSUpper.isUndef() && RHSUpper.isUndef()))
     return SDValue();
 
   // Skip the transformation if any of the types are illegal.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   EVT VT = N->getValueType(0);
   const bool AllLegal = TLI.isTypeLegal(VT) &&
                         TLI.isTypeLegal(LHSUpper.getValueType()) &&
                         TLI.isTypeLegal(RHSUpper.getValueType());
   if (!AllLegal)
     return SDValue();
 
   SDValue Merged = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                                LHS.getOperand(0), RHS.getOperand(0));
 
   // Translate the shuffle mask.  Lanes that selected from the lower half of
   // the first input keep their index, lanes that selected from the lower half
   // of the second input are moved to the upper half of the merged vector, and
   // every other lane (undef halves, or already undef) becomes -1.
   const int NumElts = (int)VT.getVectorNumElements();
   const int HalfElts = (int)(VT.getVectorNumElements() / 2);
   ShuffleVectorSDNode *Shuffle = cast<ShuffleVectorSDNode>(N);
   SmallVector<int, 16> MergedMask;
   for (int Lane = 0; Lane < NumElts; ++Lane) {
     const int OldIdx = Shuffle->getMaskElt(Lane);
     int NewIdx = -1;
     if (OldIdx < HalfElts)
       NewIdx = OldIdx;
     else if (OldIdx >= NumElts && OldIdx < NumElts + HalfElts)
       NewIdx = HalfElts + (OldIdx - NumElts);
     MergedMask.push_back(NewIdx);
   }
 
   return DAG.getVectorShuffle(VT, SDLoc(N), Merged,
                               DAG.getUNDEF(VT), MergedMask);
 }
 
 /// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
 /// NEON load/store intrinsics, and generic vector load/stores, to merge
 /// base address updates.
 /// For generic load/stores, the memory type is assumed to be a vector.
 /// The caller is assumed to have checked legality.
 ///
 /// The users of N's address operand are scanned for an ISD::ADD that is
 /// neither a predecessor nor a successor of N.  The first such ADD whose
 /// increment is acceptable is folded: N is replaced by the corresponding
 /// ARMISD::*_UPD node and the ADD is replaced by that node's additional i32
 /// result.  Both replacements are done through DCI.CombineTo, so this
 /// function always returns an empty SDValue.
 static SDValue CombineBaseUpdate(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
   SelectionDAG &DAG = DCI.DAG;
   const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
                             N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
   const bool isStore = N->getOpcode() == ISD::STORE;
   // The address is operand 2 for intrinsics and stores, operand 1 otherwise.
   const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
   SDValue Addr = N->getOperand(AddrOpIdx);
   MemSDNode *MemN = cast<MemSDNode>(N);
   SDLoc dl(N);
 
   // Search for a use of the address operand that is an increment.
   for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
          UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
     SDNode *User = *UI;
     // Only ADDs that use the very same result of the address node qualify.
     if (User->getOpcode() != ISD::ADD ||
         UI.getUse().getResNo() != Addr.getResNo())
       continue;
 
     // Check that the add is independent of the load/store.  Otherwise, folding
     // it would create a cycle.
     if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
       continue;
 
     // Find the new opcode for the updating load/store.
     bool isLoadOp = true;
     bool isLaneOp = false;
     unsigned NewOpc = 0;
     unsigned NumVecs = 0;
     if (isIntrinsic) {
       // For intrinsics, operand 1 holds the intrinsic ID.
       unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
       switch (IntNo) {
       default: llvm_unreachable("unexpected intrinsic for Neon base update");
       case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
         NumVecs = 1; break;
       case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
         NumVecs = 2; break;
       case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
         NumVecs = 3; break;
       case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
         NumVecs = 4; break;
       case Intrinsic::arm_neon_vld2dup:
       case Intrinsic::arm_neon_vld3dup:
       case Intrinsic::arm_neon_vld4dup:
         // TODO: Support updating VLDxDUP nodes. For now, we just skip
         // combining base updates for such intrinsics.
         continue;
       case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
         NumVecs = 2; isLaneOp = true; break;
       case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
         NumVecs = 3; isLaneOp = true; break;
       case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
         NumVecs = 4; isLaneOp = true; break;
       case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
         NumVecs = 1; isLoadOp = false; break;
       case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
         NumVecs = 2; isLoadOp = false; break;
       case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
         NumVecs = 3; isLoadOp = false; break;
       case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
         NumVecs = 4; isLoadOp = false; break;
       case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
         NumVecs = 2; isLoadOp = false; isLaneOp = true; break;
       case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
         NumVecs = 3; isLoadOp = false; isLaneOp = true; break;
       case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
         NumVecs = 4; isLoadOp = false; isLaneOp = true; break;
       }
     } else {
       // The VLDxDUP nodes are treated as lane operations; the generic
       // load/store cases below reset the flag.
       isLaneOp = true;
       switch (N->getOpcode()) {
       default: llvm_unreachable("unexpected opcode for Neon base update");
       case ARMISD::VLD1DUP: NewOpc = ARMISD::VLD1DUP_UPD; NumVecs = 1; break;
       case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
       case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
       case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
       case ISD::LOAD:       NewOpc = ARMISD::VLD1_UPD;
         NumVecs = 1; isLaneOp = false; break;
       case ISD::STORE:      NewOpc = ARMISD::VST1_UPD;
         NumVecs = 1; isLaneOp = false; isLoadOp = false; break;
       }
     }
 
     // Find the size of memory referenced by the load/store.
     EVT VecTy;
     if (isLoadOp) {
       VecTy = N->getValueType(0);
     } else if (isIntrinsic) {
       // Store intrinsics: the first stored vector follows the address.
       VecTy = N->getOperand(AddrOpIdx+1).getValueType();
     } else {
       assert(isStore && "Node has to be a load, a store, or an intrinsic!");
       VecTy = N->getOperand(1).getValueType();
     }
 
     // A lane operation only touches one element of each vector.
     unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
     if (isLaneOp)
       NumBytes /= VecTy.getVectorNumElements();
 
     // If the increment is a constant, it must match the memory ref size.
     // The increment is whichever ADD operand is not the address itself.
     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
     ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
     if (NumBytes >= 3 * 16 && (!CInc || CInc->getZExtValue() != NumBytes)) {
       // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
       // separate instructions that make it harder to use a non-constant update.
       continue;
     }
 
     // OK, we found an ADD we can fold into the base update.
     // Now, create a _UPD node, taking care of not breaking alignment.
 
     EVT AlignedVecTy = VecTy;
     unsigned Alignment = MemN->getAlignment();
 
     // If this is a less-than-standard-aligned load/store, change the type to
     // match the standard alignment.
     // The alignment is overlooked when selecting _UPD variants; and it's
     // easier to introduce bitcasts here than fix that.
     // There are 3 ways to get to this base-update combine:
     // - intrinsics: they are assumed to be properly aligned (to the standard
     //   alignment of the memory type), so we don't need to do anything.
     // - ARMISD::VLDx nodes: they are only generated from the aforementioned
     //   intrinsics, so, likewise, there's nothing to do.
     // - generic load/store instructions: the alignment is specified as an
     //   explicit operand, rather than implicitly as the standard alignment
     //   of the memory type (like the intrinsics).  We need to change the
     //   memory type to match the explicit alignment.  That way, we don't
     //   generate non-standard-aligned ARMISD::VLDx nodes.
     if (isa<LSBaseSDNode>(N)) {
       if (Alignment == 0)
         Alignment = 1;
       if (Alignment < VecTy.getScalarSizeInBits() / 8) {
         // Re-express the vector with elements as wide as the alignment, so
         // that the total size stays NumBytes.
         MVT EltTy = MVT::getIntegerVT(Alignment * 8);
         assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
         assert(!isLaneOp && "Unexpected generic load/store lane.");
         unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
         AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
       }
       // Don't set an explicit alignment on regular load/stores that we want
       // to transform to VLD/VST 1_UPD nodes.
       // This matches the behavior of regular load/stores, which only get an
       // explicit alignment if the MMO alignment is larger than the standard
       // alignment of the memory type.
       // Intrinsics, however, always get an explicit alignment, set to the
       // alignment of the MMO.
       Alignment = 1;
     }
 
     // Create the new updating load/store node.
     // First, create an SDVTList for the new updating node's results:
     // the loaded vectors (if any), then the i32 result, then the chain.
     EVT Tys[6];
     unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
     unsigned n;
     for (n = 0; n < NumResultVecs; ++n)
       Tys[n] = AlignedVecTy;
     Tys[n++] = MVT::i32;
     Tys[n] = MVT::Other;
     SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs+2));
 
     // Then, gather the new node's operands.
     SmallVector<SDValue, 8> Ops;
     Ops.push_back(N->getOperand(0)); // incoming chain
     Ops.push_back(N->getOperand(AddrOpIdx));
     Ops.push_back(Inc);
 
     if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
       // Try to match the intrinsic's signature
       Ops.push_back(StN->getValue());
     } else {
       // Loads (and of course intrinsics) match the intrinsics' signature,
       // so just add all but the alignment operand.
       for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands() - 1; ++i)
         Ops.push_back(N->getOperand(i));
     }
 
     // For all node types, the alignment operand is always the last one.
     Ops.push_back(DAG.getConstant(Alignment, dl, MVT::i32));
 
     // If this is a non-standard-aligned STORE, the penultimate operand is the
     // stored value.  Bitcast it to the aligned type.
     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
       SDValue &StVal = Ops[Ops.size()-2];
       StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
     }
 
     // Lane operations access a single element; everything else accesses the
     // (possibly re-typed) whole vector.
     EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
                                            MemN->getMemOperand());
 
     // Update the uses.
     SmallVector<SDValue, 5> NewResults;
     for (unsigned i = 0; i < NumResultVecs; ++i)
       NewResults.push_back(SDValue(UpdN.getNode(), i));
 
     // If this is a non-standard-aligned LOAD, the first result is the loaded
     // value.  Bitcast it to the expected result type.
     if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
       SDValue &LdVal = NewResults[0];
       LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
     }
 
     // N's results map onto the vector results plus the chain; the i32 result
     // in between replaces the folded ADD.
     NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
     DCI.CombineTo(N, NewResults);
     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
 
     // Only one increment is folded per node.
     break;
   }
   return SDValue();
 }
 
 /// PerformVLDCombine - Forward N to CombineBaseUpdate, unless DCI reports that
 /// the combiner is running before legalization or was invoked by the
 /// legalizer, in which case nothing is done and an empty SDValue is returned.
 static SDValue PerformVLDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
   const bool CombineAllowed =
       !DCI.isBeforeLegalize() && !DCI.isCalledByLegalizer();
   if (CombineAllowed)
     return CombineBaseUpdate(N, DCI);
 
   return SDValue();
 }
 
 /// CombineVLDDUP - Try to fold VDUPLANEs of a vldN-lane result into a single
 /// vldN-dup.  N is a VDUPLANE node.  The fold applies when N's source operand
 /// is produced by a vldN-lane (N > 1) intrinsic and every non-chain use of
 /// that intrinsic is a VDUPLANE of the very lane the intrinsic loads.  On
 /// success the users and the intrinsic are replaced via DCI.CombineTo and
 /// true is returned; otherwise false is returned.
 static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   SelectionDAG &DAG = DCI.DAG;
 
   // vldN-dup instructions only support 64-bit vectors for N > 1.
   EVT VT = N->getValueType(0);
   if (!VT.is64BitVector())
     return false;
 
   // The VDUPLANE source must come from a vldN-lane intrinsic.
   SDNode *VLD = N->getOperand(0).getNode();
   if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
     return false;
 
   unsigned NumVecs = 0;
   unsigned NewOpc = 0;
   unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
   switch (IntNo) {
   case Intrinsic::arm_neon_vld2lane:
     NumVecs = 2;
     NewOpc = ARMISD::VLD2DUP;
     break;
   case Intrinsic::arm_neon_vld3lane:
     NumVecs = 3;
     NewOpc = ARMISD::VLD3DUP;
     break;
   case Intrinsic::arm_neon_vld4lane:
     NumVecs = 4;
     NewOpc = ARMISD::VLD4DUP;
     break;
   default:
     return false;
   }
 
   // Every vector-result use must be a VDUPLANE that duplicates the lane the
   // intrinsic loaded.  Result number NumVecs is the chain and is skipped.
   unsigned VLDLaneNo =
     cast<ConstantSDNode>(VLD->getOperand(NumVecs+3))->getZExtValue();
   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
        UI != UE; ++UI) {
     if (UI.getUse().getResNo() == NumVecs)
       continue;
     SDNode *User = *UI;
     const bool IsSameLaneDup =
         User->getOpcode() == ARMISD::VDUPLANE &&
         cast<ConstantSDNode>(User->getOperand(1))->getZExtValue() == VLDLaneNo;
     if (!IsSameLaneDup)
       return false;
   }
 
   // Build the vldN-dup node: NumVecs vector results followed by the chain.
   SmallVector<EVT, 5> ResultTys(NumVecs, VT);
   ResultTys.push_back(EVT(MVT::Other));
   SDVTList SDTys = DAG.getVTList(ResultTys);
   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                            Ops, VLDMemInt->getMemoryVT(),
                                            VLDMemInt->getMemOperand());
 
   // Point each VDUPLANE user at the matching vldN-dup result.
   for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
        UI != UE; ++UI) {
     unsigned ResNo = UI.getUse().getResNo();
     // Chain uses are handled below.
     if (ResNo == NumVecs)
       continue;
     DCI.CombineTo(*UI, SDValue(VLDDup.getNode(), ResNo));
   }
 
   // Now the vldN-lane intrinsic is dead except for its chain result.
   // Replace all of its results (vectors, then chain) with the new node's.
   SmallVector<SDValue, 5> VLDDupResults;
   for (unsigned ResNo = 0; ResNo <= NumVecs; ++ResNo)
     VLDDupResults.push_back(SDValue(VLDDup.getNode(), ResNo));
   DCI.CombineTo(VLD, VLDDupResults);
 
   return true;
 }
 
 /// PerformVDUPLANECombine - Target-specific dag combine xforms for
 /// ARMISD::VDUPLANE.  First tries CombineVLDDUP; failing that, a VDUPLANE
 /// whose source (looking through bitcasts) is a VMOVIMM/VMVNIMM splat with
 /// elements no wider than the VDUPLANE's is replaced by a bitcast of that
 /// splat.  Returns an empty SDValue when neither applies.
 static SDValue PerformVDUPLANECombine(SDNode *N,
                                       TargetLowering::DAGCombinerInfo &DCI) {
   SDValue Src = N->getOperand(0);
 
   // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
   // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
   if (CombineVLDDUP(N, DCI))
     return SDValue(N, 0);
 
   // A VDUPLANE of something that is already a VMOVIMM or VMVNIMM splat is
   // redundant.  Skip over bit_converts; element sizes are compared below.
   while (Src.getOpcode() == ISD::BITCAST)
     Src = Src.getOperand(0);
   const unsigned SrcOpc = Src.getOpcode();
   const bool IsImmSplat =
       SrcOpc == ARMISD::VMOVIMM || SrcOpc == ARMISD::VMVNIMM;
   if (!IsImmSplat)
     return SDValue();
 
   // The splat's element size must not exceed the VDUPLANE element size.
   unsigned SplatEltBits = Src.getScalarValueSizeInBits();
   // The canonical VMOV for a zero vector uses a 32-bit element size, so a
   // modified-immediate that decodes to zero is treated as 8-bit elements.
   unsigned ModImm = cast<ConstantSDNode>(Src.getOperand(0))->getZExtValue();
   unsigned DecodedEltBits;
   if (ARM_AM::decodeNEONModImm(ModImm, DecodedEltBits) == 0)
     SplatEltBits = 8;
 
   EVT VT = N->getValueType(0);
   if (SplatEltBits > VT.getScalarSizeInBits())
     return SDValue();
 
   return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Src);
 }
 
 /// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
 /// Rewrites VDUP(LOAD) into a VLD1DUP memory node when the load is unindexed,
 /// its value has a single use, and its memory type equals the VDUP's vector
 /// element type.  Returns an empty SDValue otherwise.
 static SDValue PerformVDUPCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
   SelectionDAG &DAG = DCI.DAG;
   SDValue Src = N->getOperand(0);
 
   // Match VDUP(LOAD) -> VLD1DUP.
   // This is matched here rather than during isel because the transform is
   // only legal for unindexed loads.
   LoadSDNode *LD = dyn_cast<LoadSDNode>(Src.getNode());
   if (!LD)
     return SDValue();
 
   EVT VecVT = N->getValueType(0);
   if (!Src.hasOneUse() || !LD->isUnindexed() ||
       LD->getMemoryVT() != VecVT.getVectorElementType())
     return SDValue();
 
   SDLoc dl(N);
   // Operands: incoming chain, address, alignment.
   SDValue Ops[] = { LD->getOperand(0), LD->getOperand(1),
                     DAG.getConstant(LD->getAlignment(), dl, MVT::i32) };
   SDVTList SDTys = DAG.getVTList(VecVT, MVT::Other);
   SDValue VLDDup = DAG.getMemIntrinsicNode(ARMISD::VLD1DUP, dl, SDTys, Ops,
                                            LD->getMemoryVT(),
                                            LD->getMemOperand());
   // Re-route users of the old load's chain result to the new node's chain.
   DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
   return VLDDup;
 }
 
 /// PerformLOADCombine - Target-specific dag combine for ISD::LOAD.  A normal
 /// load of a legal vector type is handed to CombineBaseUpdate (to try to form
 /// a VLD1_UPD); anything else yields an empty SDValue.
 static SDValue PerformLOADCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
   EVT VT = N->getValueType(0);
 
   if (!ISD::isNormalLoad(N) || !VT.isVector())
     return SDValue();
 
   const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
   if (!TLI.isTypeLegal(VT))
     return SDValue();
 
   return CombineBaseUpdate(N, DCI);
 }
 
 /// PerformSTORECombine - Target-specific dag combine xforms for
 /// ISD::STORE.
 ///
 /// Volatile stores are left alone.  Otherwise the following rewrites are
 /// tried in order, and the first that applies produces the return value:
 ///  1. A truncating vector store is turned into a shuffle that packs the
 ///     truncated elements at the bottom of the register, followed by one or
 ///     more stores of the widest legal integer type that fits.
 ///  2. A normal store of a single-use VMOVDRR is split into two stores of
 ///     its two operands, 4 bytes apart.
 ///  3. A normal store of an i64 EXTRACT_VECTOR_ELT is re-expressed as an
 ///     f64 extract bitcast back to i64.
 ///  4. A normal store of a legal vector type is offered to
 ///     CombineBaseUpdate.
 /// An empty SDValue is returned when nothing applies.
 static SDValue PerformSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
   StoreSDNode *St = cast<StoreSDNode>(N);
   if (St->isVolatile())
     return SDValue();
 
   // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
   // pack all of the elements in one place.  Next, store to memory in fewer
   // chunks.
   SDValue StVal = St->getValue();
   EVT VT = StVal.getValueType();
   if (St->isTruncatingStore() && VT.isVector()) {
     SelectionDAG &DAG = DCI.DAG;
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     EVT StVT = St->getMemoryVT();
     unsigned NumElems = VT.getVectorNumElements();
     assert(StVT != VT && "Cannot truncate to the same type");
     unsigned FromEltSz = VT.getScalarSizeInBits();
     unsigned ToEltSz = StVT.getScalarSizeInBits();
 
     // From, To sizes and ElemCount must be pow of two
     if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
 
     // We are going to use the original vector elt for storing.
     // Accumulated smaller vector elements must be a multiple of the store size.
     if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
 
     unsigned SizeRatio  = FromEltSz / ToEltSz;
     assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
 
     // Create a type on which we perform the shuffle.
     EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
                                      NumElems*SizeRatio);
     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
 
     SDLoc DL(St);
     SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
     // For each source element pick the narrow lane that holds its low bits:
     // the last sub-lane on big-endian, the first on little-endian.
     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
     for (unsigned i = 0; i < NumElems; ++i)
       ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
                           ? (i + 1) * SizeRatio - 1
                           : i * SizeRatio;
 
     // Can't shuffle using an illegal type.
     if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
 
     SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
                                 DAG.getUNDEF(WideVec.getValueType()),
                                 ShuffleVec);
     // At this point all of the data is stored at the bottom of the
     // register. We now need to save it to mem.
 
     // Find the largest store unit
     MVT StoreType = MVT::i8;
     for (MVT Tp : MVT::integer_valuetypes()) {
       if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
         StoreType = Tp;
     }
     // Didn't find a legal store type.
     if (!TLI.isTypeLegal(StoreType))
       return SDValue();
 
     // Bitcast the original vector into a vector of store-size units
     EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
             StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
     assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
     SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
     SmallVector<SDValue, 8> Chains;
     const unsigned StoreBytes = StoreType.getSizeInBits() / 8;
     SDValue Increment = DAG.getConstant(StoreBytes, DL,
                                         TLI.getPointerTy(DAG.getDataLayout()));
     SDValue BasePtr = St->getBasePtr();
 
     // Perform one or more big stores into memory.
     unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
     for (unsigned I = 0; I < E; I++) {
       // Piece I is written I * StoreBytes past the original address.  Describe
       // it with a correspondingly offset pointer info and an alignment that
       // takes the offset into account, instead of repeating the original
       // store's memory description for every piece.
       const unsigned Offset = I * StoreBytes;
       SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
                                    StoreType, ShuffWide,
                                    DAG.getIntPtrConstant(I, DL));
       SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
                                 St->getPointerInfo().getWithOffset(Offset),
                                 MinAlign(St->getAlignment(), Offset),
                                 St->getMemOperand()->getFlags());
       BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
                             Increment);
       Chains.push_back(Ch);
     }
     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
   }
 
   if (!ISD::isNormalStore(St))
     return SDValue();
 
   // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
   // ARM stores of arguments in the same cache line.
   if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
       StVal.getNode()->hasOneUse()) {
     SelectionDAG  &DAG = DCI.DAG;
     bool isBigEndian = DAG.getDataLayout().isBigEndian();
     SDLoc DL(St);
     SDValue BasePtr = St->getBasePtr();
     SDValue NewST1 = DAG.getStore(
         St->getChain(), DL, StVal.getNode()->getOperand(isBigEndian ? 1 : 0),
         BasePtr, St->getPointerInfo(), St->getAlignment(),
         St->getMemOperand()->getFlags());
 
     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                     DAG.getConstant(4, DL, MVT::i32));
     // The second half lives 4 bytes further on.  Its alignment is the
     // original alignment capped by that offset; MinAlign never yields 0 here,
     // whereas halving the original alignment would for a 1-byte-aligned
     // store.  NOTE(review): an alignment of 0 is believed to make getStore
     // substitute the type's natural alignment -- confirm against getStore.
     return DAG.getStore(NewST1.getValue(0), DL,
                         StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                         OffsetPtr, St->getPointerInfo().getWithOffset(4),
                         MinAlign(St->getAlignment(), 4),
                         St->getMemOperand()->getFlags());
   }
 
   if (StVal.getValueType() == MVT::i64 &&
       StVal.getNode()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
 
     // Bitcast an i64 store extracted from a vector to f64.
     // Otherwise, the i64 value will be legalized to a pair of i32 values.
     SelectionDAG &DAG = DCI.DAG;
     SDLoc dl(StVal);
     SDValue IntVec = StVal.getOperand(0);
     EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                    IntVec.getValueType().getVectorNumElements());
     SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
     SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                  Vec, StVal.getOperand(1));
     dl = SDLoc(N);
     SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
     // Make the DAGCombiner fold the bitcasts.
     DCI.AddToWorklist(Vec.getNode());
     DCI.AddToWorklist(ExtElt.getNode());
     DCI.AddToWorklist(V.getNode());
     return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
                         St->getPointerInfo(), St->getAlignment(),
                         St->getMemOperand()->getFlags(), St->getAAInfo());
   }
 
   // If this is a legal vector store, try to combine it into a VST1_UPD.
   if (ISD::isNormalStore(N) && VT.isVector() &&
       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return CombineBaseUpdate(N, DCI);
 
   return SDValue();
 }
 
 /// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
 /// can replace combinations of VMUL and VCVT (floating-point to integer)
 /// when the VMUL has a constant operand that is a power of 2.
 ///
 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
 ///  vmul.f32        d16, d17, d16
 ///  vcvt.s32.f32    d16, d16
 /// becomes:
 ///  vcvt.s32.f32    d16, d16, #3
 ///
 /// N is the floating-point to integer conversion node (it is treated as
 /// signed when its opcode is ISD::FP_TO_SINT, unsigned otherwise).  Returns
 /// the replacement value, or an empty SDValue when the pattern does not
 /// match: no NEON, operand is not a simple-typed vector FMUL by a
 /// build_vector, element types other than f32 to at most 32-bit integers,
 /// a lane count other than 2 or 4, or a multiplier that is not a splat of
 /// 2^C with 1 <= C <= 32.
 static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
   if (!Subtarget->hasNEON())
     return SDValue();
 
   SDValue Op = N->getOperand(0);
   if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
       Op.getOpcode() != ISD::FMUL)
     return SDValue();
 
   // The result type is queried through getSimpleValueType below, so give up
   // on result types that are not simple.
   if (!N->getValueType(0).isSimple())
     return SDValue();
 
   SDValue ConstVec = Op->getOperand(1);
   if (!isa<BuildVectorSDNode>(ConstVec))
     return SDValue();
 
   MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
   uint32_t FloatBits = FloatTy.getSizeInBits();
   MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
   uint32_t IntBits = IntTy.getSizeInBits();
   unsigned NumLanes = Op.getValueType().getVectorNumElements();
   if (FloatBits != 32 || IntBits > 32 || (NumLanes != 2 && NumLanes != 4)) {
     // These instructions only exist converting from f32 to i32. We can handle
     // smaller integers by generating an extra truncate, but larger ones would
     // be lossy. We also can't handle any lane count other than 2 or 4, since
     // these instructions only support v2i32/v4i32 types; in particular 1- and
     // 3-lane vectors must not be silently treated as v4i32 below.
     return SDValue();
   }
 
   BitVector UndefElements;
   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
   if (C == -1 || C == 0 || C > 32)
     return SDValue();
 
   SDLoc dl(N);
   bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
     Intrinsic::arm_neon_vcvtfp2fxu;
   // Convert the FMUL's other operand directly, passing C as the number of
   // fraction bits.
   SDValue FixConv = DAG.getNode(
       ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
       DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
       DAG.getConstant(C, dl, MVT::i32));
 
   // Narrow the 32-bit lanes when the original result elements are smaller.
   if (IntBits < FloatBits)
     FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
 
   return FixConv;
 }
 
 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
 /// can replace combinations of VCVT (integer to floating-point) and VDIV
 /// when the VDIV has a constant operand that is a power of 2.
 ///
 /// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
 ///  vcvt.f32.s32    d16, d16
 ///  vdiv.f32        d16, d16, d17
 /// becomes:
 ///  vcvt.f32.s32    d16, d16, #3
 ///
 /// N is the division node: its first operand must be a SINT_TO_FP or
 /// UINT_TO_FP and its second operand (the divisor) a build_vector.  Returns
 /// the replacement value, or an empty SDValue when the pattern does not
 /// match: no NEON, non-simple types, element types other than at most
 /// 32-bit integers to f32, a lane count other than 2 or 4, or a divisor
 /// that is not a splat of 2^C with 1 <= C <= 32.
 static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
                                   const ARMSubtarget *Subtarget) {
   if (!Subtarget->hasNEON())
     return SDValue();
 
   SDValue Op = N->getOperand(0);
   unsigned OpOpcode = Op.getNode()->getOpcode();
   if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
       (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
     return SDValue();
 
   // The integer source type is queried through getSimpleValueType below, so
   // give up on source types that are not simple.
   if (!Op.getOperand(0).getValueType().isSimple())
     return SDValue();
 
   SDValue ConstVec = N->getOperand(1);
   if (!isa<BuildVectorSDNode>(ConstVec))
     return SDValue();
 
   MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
   uint32_t FloatBits = FloatTy.getSizeInBits();
   MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
   uint32_t IntBits = IntTy.getSizeInBits();
   unsigned NumLanes = Op.getValueType().getVectorNumElements();
   if (FloatBits != 32 || IntBits > 32 || (NumLanes != 2 && NumLanes != 4)) {
     // These instructions only exist converting from i32 to f32. We can handle
     // smaller integers by generating an extra extend, but larger ones would
     // be lossy. We also can't handle any lane count other than 2 or 4, since
     // these instructions only support v2i32/v4i32 types; in particular 1- and
     // 3-lane vectors must not be silently treated as v4i32 below.
     return SDValue();
   }
 
   BitVector UndefElements;
   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
   int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
   if (C == -1 || C == 0 || C > 32)
     return SDValue();
 
   SDLoc dl(N);
   bool isSigned = OpOpcode == ISD::SINT_TO_FP;
   SDValue ConvInput = Op.getOperand(0);
   // Widen narrower integer lanes to 32 bits, matching the signedness of the
   // original conversion.
   if (IntBits < FloatBits)
     ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
                             dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
                             ConvInput);
 
   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
     Intrinsic::arm_neon_vcvtfxu2fp;
   // Convert the integer input directly, passing C as the number of fraction
   // bits.
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
                      Op.getValueType(),
                      DAG.getConstant(IntrinsicOpcode, dl, MVT::i32),
                      ConvInput, DAG.getConstant(C, dl, MVT::i32));
 }
 
 /// getVShiftImm - Check whether Op (looking through bitcasts) is a
 /// build_vector usable as the immediate operand of a vector shift, i.e. a
 /// constant splat whose splat size does not exceed ElementBits.  On success
 /// the sign-extended splat value is stored in Cnt and true is returned; on
 /// failure Cnt is left untouched and false is returned.
 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
   // Look through bit_converts.
   while (Op.getOpcode() == ISD::BITCAST)
     Op = Op.getOperand(0);
 
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
   if (!BVN)
     return false;
 
   APInt SplatBits, SplatUndef;
   unsigned SplatBitSize;
   bool HasAnyUndefs;
   if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
                             HasAnyUndefs, ElementBits))
     return false;
   if (SplatBitSize > ElementBits)
     return false;
 
   Cnt = SplatBits.getSExtValue();
   return true;
 }
 
 /// isVShiftLImm - Check whether Op is a valid build_vector immediate for a
 /// vector shift left on type VT, storing the shift amount in Cnt.  The amount
 /// must satisfy:
 ///   0 <= Value < ElementBits for a left shift; or
 ///   0 <= Value <= ElementBits for a long left shift.
 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
   assert(VT.isVector() && "vector shift count is not a vector type");
   const int64_t EltBits = VT.getScalarSizeInBits();
   if (!getVShiftImm(Op, EltBits, Cnt))
     return false;
   if (Cnt < 0)
     return false;
   // A long shift may shift by the full element width; a plain one may not.
   const int64_t MaxCnt = isLong ? EltBits : EltBits - 1;
   return Cnt <= MaxCnt;
 }
 
 /// isVShiftRImm - Check whether Op is a valid build_vector immediate for a
 /// vector shift right on type VT.  For a shift opcode the splat value is
 /// positive; for an intrinsic it must be negative, and Cnt is negated so the
 /// caller always receives a positive amount.  The absolute value must
 /// satisfy:
 ///   1 <= |Value| <= ElementBits for a right shift; or
 ///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                          int64_t &Cnt) {
   assert(VT.isVector() && "vector shift count is not a vector type");
   const int64_t EltBits = VT.getScalarSizeInBits();
   if (!getVShiftImm(Op, EltBits, Cnt))
     return false;
 
   const int64_t MaxCnt = isNarrow ? EltBits/2 : EltBits;
   if (!isIntrinsic)
     return Cnt >= 1 && Cnt <= MaxCnt;
 
   // Intrinsics encode a right shift as a negative count.
   if (Cnt < -MaxCnt || Cnt > -1)
     return false;
   Cnt = -Cnt;
   return true;
 }
 
 /// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
 ///
 /// N is an intrinsic node whose operand 0 holds the intrinsic ID.  NEON
 /// vector shift intrinsics whose shift-count operand is a constant splat
 /// build_vector are rewritten to the corresponding ARMISD immediate-shift
 /// node, with the count passed as an i32 constant.  Returns the new node, or
 /// an empty SDValue when the intrinsic is not handled or its count is not a
 /// usable immediate.  For vqshiftsu, the narrowing shifts and vshiftins, an
 /// unusable count is treated as unreachable rather than returning empty.
 static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
   unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
   switch (IntNo) {
   default:
     // Don't do anything for most intrinsics.
     break;
 
   // Vector shifts: check for immediate versions and lower them.
   // Note: This is done during DAG combining instead of DAG legalizing because
   // the build_vectors for 64-bit vector element shift counts are generally
   // not legal, and it is hard to see their values after they get legalized to
   // loads from a constant pool.
   case Intrinsic::arm_neon_vshifts:
   case Intrinsic::arm_neon_vshiftu:
   case Intrinsic::arm_neon_vrshifts:
   case Intrinsic::arm_neon_vrshiftu:
   case Intrinsic::arm_neon_vrshiftn:
   case Intrinsic::arm_neon_vqshifts:
   case Intrinsic::arm_neon_vqshiftu:
   case Intrinsic::arm_neon_vqshiftsu:
   case Intrinsic::arm_neon_vqshiftns:
   case Intrinsic::arm_neon_vqshiftnu:
   case Intrinsic::arm_neon_vqshiftnsu:
   case Intrinsic::arm_neon_vqrshiftns:
   case Intrinsic::arm_neon_vqrshiftnu:
   case Intrinsic::arm_neon_vqrshiftnsu: {
     // The element width used for range checks comes from the shifted vector
     // (operand 1); the shift count is operand 2.
     EVT VT = N->getOperand(1).getValueType();
     int64_t Cnt;
     unsigned VShiftOpc = 0;
 
     // First pass: validate the shift count for this intrinsic and extract it
     // into Cnt.  isVShiftRImm is called with isIntrinsic=true, so it expects
     // a negative count and hands back its positive magnitude.
     switch (IntNo) {
     case Intrinsic::arm_neon_vshifts:
     case Intrinsic::arm_neon_vshiftu:
       // The plain shifts can go either direction; the opcode is chosen here
       // because it depends on which immediate check succeeded.
       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
         VShiftOpc = ARMISD::VSHL;
         break;
       }
       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
         VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
                      ARMISD::VSHRs : ARMISD::VSHRu);
         break;
       }
       return SDValue();
 
     case Intrinsic::arm_neon_vrshifts:
     case Intrinsic::arm_neon_vrshiftu:
       // Only the right-shift immediate form is matched for rounding shifts.
       if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
         break;
       return SDValue();
 
     case Intrinsic::arm_neon_vqshifts:
     case Intrinsic::arm_neon_vqshiftu:
       // Only the left-shift immediate form is matched for saturating shifts.
       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
         break;
       return SDValue();
 
     case Intrinsic::arm_neon_vqshiftsu:
       // Unlike the cases above, a count that is not a valid left-shift
       // immediate is treated as impossible here.
       if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
         break;
       llvm_unreachable("invalid shift count for vqshlu intrinsic");
 
     case Intrinsic::arm_neon_vrshiftn:
     case Intrinsic::arm_neon_vqshiftns:
     case Intrinsic::arm_neon_vqshiftnu:
     case Intrinsic::arm_neon_vqshiftnsu:
     case Intrinsic::arm_neon_vqrshiftns:
     case Intrinsic::arm_neon_vqrshiftnu:
     case Intrinsic::arm_neon_vqrshiftnsu:
       // Narrowing shifts require an immediate right shift.
       if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
         break;
       llvm_unreachable("invalid shift count for narrowing vector shift "
                        "intrinsic");
 
     default:
       llvm_unreachable("unhandled vector shift");
     }
 
     // Second pass: map the intrinsic to its ARMISD immediate-shift opcode.
     switch (IntNo) {
     case Intrinsic::arm_neon_vshifts:
     case Intrinsic::arm_neon_vshiftu:
       // Opcode already set above.
       break;
     case Intrinsic::arm_neon_vrshifts:
       VShiftOpc = ARMISD::VRSHRs; break;
     case Intrinsic::arm_neon_vrshiftu:
       VShiftOpc = ARMISD::VRSHRu; break;
     case Intrinsic::arm_neon_vrshiftn:
       VShiftOpc = ARMISD::VRSHRN; break;
     case Intrinsic::arm_neon_vqshifts:
       VShiftOpc = ARMISD::VQSHLs; break;
     case Intrinsic::arm_neon_vqshiftu:
       VShiftOpc = ARMISD::VQSHLu; break;
     case Intrinsic::arm_neon_vqshiftsu:
       VShiftOpc = ARMISD::VQSHLsu; break;
     case Intrinsic::arm_neon_vqshiftns:
       VShiftOpc = ARMISD::VQSHRNs; break;
     case Intrinsic::arm_neon_vqshiftnu:
       VShiftOpc = ARMISD::VQSHRNu; break;
     case Intrinsic::arm_neon_vqshiftnsu:
       VShiftOpc = ARMISD::VQSHRNsu; break;
     case Intrinsic::arm_neon_vqrshiftns:
       VShiftOpc = ARMISD::VQRSHRNs; break;
     case Intrinsic::arm_neon_vqrshiftnu:
       VShiftOpc = ARMISD::VQRSHRNu; break;
     case Intrinsic::arm_neon_vqrshiftnsu:
       VShiftOpc = ARMISD::VQRSHRNsu; break;
     }
 
     // Emit the immediate-form shift of operand 1 by Cnt.
     SDLoc dl(N);
     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                        N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
   }
 
   case Intrinsic::arm_neon_vshiftins: {
     // Shift-and-insert: operands 1 and 2 are the two vectors, operand 3 is
     // the shift count.  A valid left-shift count selects VSLI, a valid
     // (negative) right-shift count selects VSRI.
     EVT VT = N->getOperand(1).getValueType();
     int64_t Cnt;
     unsigned VShiftOpc = 0;
 
     if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
       VShiftOpc = ARMISD::VSLI;
     else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
       VShiftOpc = ARMISD::VSRI;
     else {
       llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
     }
 
     SDLoc dl(N);
     return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
                        N->getOperand(1), N->getOperand(2),
                        DAG.getConstant(Cnt, dl, MVT::i32));
   }
 
   case Intrinsic::arm_neon_vqrshifts:
   case Intrinsic::arm_neon_vqrshiftu:
     // No immediate versions of these to check for.
     break;
   }
 
   return SDValue();
 }
 
 /// PerformShiftCombine - Target-specific combine for ISD::SHL, ISD::SRA and
 /// ISD::SRL.  Two things are attempted:
 ///  - a scalar i32 (srl (bswap x), 16) is turned into a rotate when the top
 ///    half of x is known zero (requires v6 ops);
 ///  - a shift of a legal vector type by a constant splat is lowered to the
 ///    immediate form (VSHL / VSHRs / VSHRu).
 /// As with the vector shift intrinsics, the vector case is handled during DAG
 /// combining instead of DAG legalizing because the build_vectors for 64-bit
 /// vector element shift counts are generally not legal, and it is hard to see
 /// their values after they get legalized to loads from a constant pool.
 static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
                                    const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
   const unsigned Opc = N->getOpcode();
 
   if (Opc == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
     // 16-bits of x is zero. This optimizes rev + lsr 16 to rev16.
     SDValue Amt = N->getOperand(1);
     ConstantSDNode *AmtC = dyn_cast<ConstantSDNode>(Amt);
     if (AmtC) {
       SDValue Val = N->getOperand(0);
       const bool IsBSwapBy16 =
           AmtC->getZExtValue() == 16 && Val.getOpcode() == ISD::BSWAP;
       if (IsBSwapBy16 &&
           DAG.MaskedValueIsZero(Val.getOperand(0),
                                 APInt::getHighBitsSet(32, 16)))
         return DAG.getNode(ISD::ROTR, SDLoc(N), VT, Val, Amt);
     }
   }
 
   // Nothing to be done for scalar shifts.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!VT.isVector() || !TLI.isTypeLegal(VT))
     return SDValue();
 
   assert(ST->hasNEON() && "unexpected vector shift");
 
   // Decide whether the shift amount is a usable immediate and, if so, which
   // immediate-shift opcode corresponds to this node.
   int64_t Cnt;
   unsigned VShiftOpc = 0;
   bool HasImmCount = false;
   switch (Opc) {
   default: llvm_unreachable("unexpected shift opcode");
 
   case ISD::SHL:
     HasImmCount = isVShiftLImm(N->getOperand(1), VT, false, Cnt);
     VShiftOpc = ARMISD::VSHL;
     break;
 
   case ISD::SRA:
     HasImmCount = isVShiftRImm(N->getOperand(1), VT, false, false, Cnt);
     VShiftOpc = ARMISD::VSHRs;
     break;
 
   case ISD::SRL:
     HasImmCount = isVShiftRImm(N->getOperand(1), VT, false, false, Cnt);
     VShiftOpc = ARMISD::VSHRu;
     break;
   }
 
   if (!HasImmCount)
     return SDValue();
 
   SDLoc dl(N);
   return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
                      DAG.getConstant(Cnt, dl, MVT::i32));
 }
 
 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.  An extension to i32 of an i8/i16
 /// element extracted at a constant lane from a legal vector becomes a
 /// VGETLANEs (sign extend) or VGETLANEu (zero/any extend).  Returns an empty
 /// SDValue otherwise.
 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
                                     const ARMSubtarget *ST) {
   SDValue Src = N->getOperand(0);
 
   // Check for sign- and zero-extensions of vector extract operations of 8-
   // and 16-bit vector elements.  NEON supports these directly.  They are
   // handled during DAG combining because type legalization will promote them
   // to 32-bit types and it is messy to recognize the operations after that.
   if (!ST->hasNEON() || Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
     return SDValue();
 
   SDValue Vec = Src.getOperand(0);
   SDValue Lane = Src.getOperand(1);
   EVT VT = N->getValueType(0);
   EVT EltVT = Src.getValueType();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   const bool IsNarrowElt = EltVT == MVT::i8 || EltVT == MVT::i16;
   if (VT != MVT::i32 || !IsNarrowElt)
     return SDValue();
   if (!TLI.isTypeLegal(Vec.getValueType()) || !isa<ConstantSDNode>(Lane))
     return SDValue();
 
   unsigned Opc = 0;
   switch (N->getOpcode()) {
   default: llvm_unreachable("unexpected opcode");
   case ISD::SIGN_EXTEND:
     Opc = ARMISD::VGETLANEs;
     break;
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND:
     Opc = ARMISD::VGETLANEu;
     break;
   }
   return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
 }
 
 /// isPowerOf2Constant - If V is a ConstantSDNode whose value is a power of
 /// two, return a pointer to that value (owned by the node); otherwise return
 /// nullptr.
 static const APInt *isPowerOf2Constant(SDValue V) {
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(V)) {
     const APInt &Val = C->getAPIntValue();
     if (Val.isPowerOf2())
       return &Val;
   }
   return nullptr;
 }
 
 /// PerformCMOVToBFICombine - Turn a CMOV that conditionally ORs a constant
 /// into a value, based on a single-bit test, into a chain of BFI nodes.
 /// Returns the last BFI (or an empty SDValue when the pattern does not match
 /// or is not considered profitable).
 SDValue ARMTargetLowering::PerformCMOVToBFICombine(SDNode *CMOV, SelectionDAG &DAG) const {
   // If we have a CMOV, OR and AND combination such as:
   //   if (x & CN)
   //     y |= CM;
   //
   // And:
   //   * CN is a single bit;
   //   * All bits covered by CM are known zero in y
   //
   // Then we can convert this into a sequence of BFI instructions. This will
   // always be a win if CM is a single bit, will always be no worse than the
   // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
   // three bits (due to the extra IT instruction).
 
   SDValue Val0 = CMOV->getOperand(0);
   SDValue Val1 = CMOV->getOperand(1);
   auto *CondNode = cast<ConstantSDNode>(CMOV->getOperand(2));
   auto CondCode = CondNode->getAPIntValue().getLimitedValue();
   SDValue Compare = CMOV->getOperand(4);
 
   // The compare must be against zero.
   if (!isNullConstant(Compare->getOperand(1)))
     return SDValue();
 
   assert(Compare->getOpcode() == ARMISD::CMPZ);
 
   // The compared value must be (X & single-bit constant).
   SDValue Masked = Compare->getOperand(0);
   if (Masked->getOpcode() != ISD::AND)
     return SDValue();
   const APInt *TestBit = isPowerOf2Constant(Masked->getOperand(1));
   if (!TestBit)
     return SDValue();
   SDValue X = Masked->getOperand(0);
 
   if (CondCode == ARMCC::EQ) {
     // This is an "equal to zero" compare; exchange the two CMOV values so the
     // rest of the code only has to reason about "not equal to zero".
     std::swap(Val0, Val1);
   } else {
     assert(CondCode == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
   }
 
   // One value must be (Y | constant) and the other must be Y itself.
   if (Val1->getOpcode() != ISD::OR)
     return SDValue();
 
   ConstantSDNode *OrConst = dyn_cast<ConstantSDNode>(Val1->getOperand(1));
   if (!OrConst)
     return SDValue();
   SDValue Y = Val1->getOperand(0);
 
   if (Val0 != Y)
     return SDValue();
 
   // Now, is it profitable to continue?  Allow up to three set bits on Thumb,
   // two otherwise.
   APInt OrBits = OrConst->getAPIntValue();
   unsigned MaxSetBits = Subtarget->isThumb() ? 3 : 2;
   if (OrBits.countPopulation() > MaxSetBits)
     return SDValue();
 
   // Lastly, every bit that would be ORed in must be known zero in Y.
   KnownBits YKnown;
   DAG.computeKnownBits(Y, YKnown);
   if ((OrBits & YKnown.Zero) != OrBits)
     return SDValue();
 
   // OK, we can do the combine.
   SDValue Result = Y;
   SDLoc dl(X);
   EVT VT = X.getValueType();
   unsigned TestedBitIdx = TestBit->logBase2();
 
   if (TestedBitIdx != 0) {
     // Move the tested bit of X down to bit 0 first.
     X = DAG.getNode(ISD::SRL, dl, VT, X,
                     DAG.getConstant(TestedBitIdx, dl, VT));
   }
 
   // Insert X's low bit into Y at every position that is set in the OR
   // constant.
   for (unsigned Bit = 0, NumActiveBits = OrBits.getActiveBits();
        Bit < NumActiveBits; ++Bit) {
     if (OrBits[Bit] != 0) {
       APInt Mask(VT.getSizeInBits(), 0);
       Mask.setBit(Bit);
       // Confusingly, the operand is an *inverted* mask.
       Result = DAG.getNode(ARMISD::BFI, dl, VT, Result, X,
                            DAG.getConstant(~Mask, dl, VT));
     }
   }
 
   return Result;
 }
 
 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
 /// Folds a branch on "(cmov 0, 1, CC, CPSR, Cmp) & 1 != 0" into a branch
 /// directly on the CMOV's own condition and compare.  Returns an empty
 /// SDValue when the pattern is not present.
 SDValue
 ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const {
   SDValue Cmp = N->getOperand(4);
   // Only CMPZ-based (NE) branches are of interest.
   if (Cmp.getOpcode() != ARMISD::CMPZ)
     return SDValue();
 
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
   SDValue LHS = Cmp.getOperand(0);
   SDValue RHS = Cmp.getOperand(1);
   SDValue Chain = N->getOperand(0);
   SDValue BB = N->getOperand(1);
   SDValue ARMcc = N->getOperand(2);
   ARMCC::CondCodes CC =
     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
 
   // (brcond Chain BB ne CPSR (cmpz (and (cmov 0 1 CC CPSR Cmp) 1) 0))
   // -> (brcond Chain BB CC CPSR Cmp)
   if (CC != ARMCC::NE || LHS.getOpcode() != ISD::AND || !LHS->hasOneUse())
     return SDValue();
 
   SDValue CMov = LHS->getOperand(0);
   if (CMov->getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
     return SDValue();
 
   // True when V is a constant node holding exactly Expected.
   auto IsConstant = [](SDValue V, uint64_t Expected) {
     auto *C = dyn_cast<ConstantSDNode>(V);
     return C && C->getZExtValue() == Expected;
   };
 
   const bool Matches = IsConstant(CMov->getOperand(0), 0) &&
                        IsConstant(CMov->getOperand(1), 1) &&
                        IsConstant(LHS->getOperand(1), 1) &&
                        IsConstant(RHS, 0);
   if (!Matches)
     return SDValue();
 
   // Branch on the CMOV's condition code, CPSR and compare directly.
   return DAG.getNode(ARMISD::BRCOND, dl, VT, Chain, BB, CMov->getOperand(2),
                      CMov->getOperand(3), CMov->getOperand(4));
 }
 
 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
 ///
 /// Operands of N as read below: 0 = value selected when the condition is
 /// false, 1 = value selected when it is true, 2 = ARM condition code,
 /// 3 = CPSR (forwarded unchanged into every rewritten CMOV), 4 = the compare.
 ///
 /// Only CMOVs fed by an ARMISD::CMPZ are considered. The rewrites are tried
 /// in source order; a later rewrite overwrites the Res produced by an earlier
 /// one. Returns the replacement value, or an empty SDValue when nothing
 /// applied.
 SDValue
 ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
   SDValue Cmp = N->getOperand(4);
   if (Cmp.getOpcode() != ARMISD::CMPZ)
     // Only looking at EQ and NE cases.
     return SDValue();
 
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
   SDValue LHS = Cmp.getOperand(0);
   SDValue RHS = Cmp.getOperand(1);
   SDValue FalseVal = N->getOperand(0);
   SDValue TrueVal = N->getOperand(1);
   SDValue ARMcc = N->getOperand(2);
   ARMCC::CondCodes CC =
     (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
 
   // BFI is only available on V6T2+.
   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
     SDValue R = PerformCMOVToBFICombine(N, DAG);
     if (R)
       return R;
   }
 
   // Simplify
   //   mov     r1, r0
   //   cmp     r1, x
   //   mov     r0, y
   //   moveq   r0, x
   // to
   //   cmp     r0, x
   //   movne   r0, y
   //
   //   mov     r1, r0
   //   cmp     r1, x
   //   mov     r0, x
   //   movne   r0, y
   // to
   //   cmp     r0, x
   //   movne   r0, y
   /// FIXME: Turn this into a target neutral optimization?
   SDValue Res;
   if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc,
                       N->getOperand(3), Cmp);
   } else if (CC == ARMCC::EQ && TrueVal == RHS) {
     // This inner ARMcc shadows the outer one; getARMCmp fills it in for the
     // SETNE compare it builds.
     SDValue ARMcc;
     SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
     Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc,
                       N->getOperand(3), NewCmp);
   }
 
   // (cmov F T ne CPSR (cmpz (cmov 0 1 CC CPSR Cmp) 0))
   // -> (cmov F T CC CPSR Cmp)
   // This pattern returns immediately and therefore takes precedence over any
   // Res computed above.
   if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse()) {
     auto *LHS0C = dyn_cast<ConstantSDNode>(LHS->getOperand(0));
     auto *LHS1C = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
     auto *RHSC = dyn_cast<ConstantSDNode>(RHS);
     if ((LHS0C && LHS0C->getZExtValue() == 0) &&
         (LHS1C && LHS1C->getZExtValue() == 1) &&
         (RHSC && RHSC->getZExtValue() == 0)) {
       return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
                          LHS->getOperand(2), LHS->getOperand(3),
                          LHS->getOperand(4));
     }
   }
 
   // NOTE(review): for non-integer types this returns an empty SDValue even if
   // Res was set by the simplification above, so that result is discarded.
   // Confirm this is intended.
   if (!VT.isInteger())
       return SDValue();
 
   // Materialize a boolean comparison for integers so we can avoid branching.
   if (isNullConstant(FalseVal)) {
     if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
       if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
         // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
         // right 5 bits will make that 32 be 1, otherwise it will be 0.
         // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
         SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
         Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
                           DAG.getConstant(5, dl, MVT::i32));
       } else {
         // CMOV 0, 1, ==, (CMPZ x, y) ->
         //     (ADDCARRY (SUB x, y), t:0, t:1)
         // where t = (SUBCARRY 0, (SUB x, y), 0)
         //
         // The SUBCARRY computes 0 - (x - y) and this will give a borrow when
         // x != y. In other words, a carry C == 1 when x == y, C == 0
         // otherwise.
         // The final ADDCARRY computes
         //     x - y + (0 - (x - y)) + C == C
         SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
         SDVTList VTs = DAG.getVTList(VT, MVT::i32);
         // NOTE(review): the node built here is ISD::USUBO with two operands
         // (FalseVal is known to be the zero constant in this branch), whereas
         // the comment above describes it as a SUBCARRY with a zero carry-in.
         SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
         // ISD::SUBCARRY returns a borrow but we want the carry here
         // actually.
         SDValue Carry =
             DAG.getNode(ISD::SUB, dl, MVT::i32,
                         DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
         Res = DAG.getNode(ISD::ADDCARRY, dl, VTs, Sub, Neg, Carry);
       }
     } else if (CC == ARMCC::NE && LHS != RHS &&
                (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
       // This seems pointless but will allow us to combine it further below.
       // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
       SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
       Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
                         N->getOperand(3), Cmp);
     }
   } else if (isNullConstant(TrueVal)) {
     if (CC == ARMCC::EQ && LHS != RHS &&
         (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
       // This seems pointless but will allow us to combine it further below
       // Note that we change == for != as this is the dual for the case above.
       // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUB x, y), z, !=, (CMPZ x, y)
       SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
       Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
                         DAG.getConstant(ARMCC::NE, dl, MVT::i32),
                         N->getOperand(3), Cmp);
     }
   }
 
   // On Thumb1, the DAG above may be further combined if z is a power of 2
   // (z == 2 ^ K).
   // CMOV (SUB x, y), z, !=, (CMPZ x, y) ->
   //       merge t3, t4
   // where t1 = (SUBCARRY (SUB x, y), z, 0)
   //       t2 = (SUBCARRY (SUB x, y), t1:0, t1:1)
   //       t3 = if K != 0 then (SHL t2:0, K) else t2:0
   //       t4 = (SUB 1, t2:1)   [ we want a carry, not a borrow ]
   // The condition below inspects N's own operands (FalseVal/TrueVal), not the
   // Res built above, so it matches a CMOV that already has the
   // (SUB x, y) form when this function is entered.
   const APInt *TrueConst;
   if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
       (FalseVal.getOpcode() == ISD::SUB) && (FalseVal.getOperand(0) == LHS) &&
       (FalseVal.getOperand(1) == RHS) &&
       (TrueConst = isPowerOf2Constant(TrueVal))) {
     SDVTList VTs = DAG.getVTList(VT, MVT::i32);
     unsigned ShiftAmount = TrueConst->logBase2();
     // For z != 1 the sequence is computed with 1 and the result is shifted
     // into place afterwards.
     if (ShiftAmount)
       TrueVal = DAG.getConstant(1, dl, VT);
     // NOTE(review): t1 is built as ISD::USUBO here although the comment above
     // writes it as SUBCARRY with a zero carry-in.
     SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
     Res = DAG.getNode(ISD::SUBCARRY, dl, VTs, FalseVal, Subc, Subc.getValue(1));
     // Make it a carry, not a borrow.
     SDValue Carry = DAG.getNode(
         ISD::SUB, dl, VT, DAG.getConstant(1, dl, MVT::i32), Res.getValue(1));
     Res = DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Res, Carry);
 
     if (ShiftAmount)
       Res = DAG.getNode(ISD::SHL, dl, VT, Res,
                         DAG.getConstant(ShiftAmount, dl, MVT::i32));
   }
 
   if (Res.getNode()) {
     KnownBits Known;
     DAG.computeKnownBits(SDValue(N,0), Known);
     // Capture demanded bits information that would be otherwise lost.
     // Only the three exact known-zero patterns below (all bits above bit 0,
     // above bit 7, above bit 15) are recognised; the AssertZext is created
     // with MVT::i32 rather than VT.
     if (Known.Zero == 0xfffffffe)
       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                         DAG.getValueType(MVT::i1));
     else if (Known.Zero == 0xffffff00)
       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                         DAG.getValueType(MVT::i8));
     else if (Known.Zero == 0xffff0000)
       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
                         DAG.getValueType(MVT::i16));
   }
 
   return Res;
 }
 
 /// Entry point for ARM-specific DAG combines: dispatches on N's opcode to the
 /// matching Perform*Combine helper and returns whatever that helper returns.
 ///
 /// For the ARMISD::SMULW* / SMLAL* nodes no helper is called; instead
 /// SimplifyDemandedBits is invoked on the operands with a mask selecting the
 /// low or high 16 bits. Opcodes not listed, and intrinsics not listed in the
 /// inner switch, produce an empty SDValue.
 SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
   default: break;
   case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
   case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
   case ISD::SUB:        return PerformSUBCombine(N, DCI);
   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
   case ARMISD::ADDC:
   case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI, Subtarget);
   case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI, Subtarget);
   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
   case ISD::STORE:      return PerformSTORECombine(N, DCI);
   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
   case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
   case ARMISD::VDUP: return PerformVDUPCombine(N, DCI);
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
     return PerformVCVTCombine(N, DCI.DAG, Subtarget);
   case ISD::FDIV:
     return PerformVDIVCombine(N, DCI.DAG, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
   case ISD::SHL:
   case ISD::SRA:
   case ISD::SRL:        return PerformShiftCombine(N, DCI.DAG, Subtarget);
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
   case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
   case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
   case ISD::LOAD:       return PerformLOADCombine(N, DCI);
   case ARMISD::VLD1DUP:
   case ARMISD::VLD2DUP:
   case ARMISD::VLD3DUP:
   case ARMISD::VLD4DUP:
     return PerformVLDCombine(N, DCI);
   case ARMISD::BUILD_VECTOR:
     return PerformARMBUILD_VECTORCombine(N, DCI);
   // In the cases below both outcomes of SimplifyDemandedBits end in an empty
   // SDValue being returned from this function. Presumably any replacement is
   // carried out through DCI by SimplifyDemandedBits itself -- TODO confirm.
   case ARMISD::SMULWB: {
     // Operand 1: only the low 16 bits are demanded.
     unsigned BitWidth = N->getValueType(0).getSizeInBits();
     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
     if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
       return SDValue();
     break;
   }
   case ARMISD::SMULWT: {
     // Operand 1: only the high 16 bits are demanded.
     unsigned BitWidth = N->getValueType(0).getSizeInBits();
     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
     if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
       return SDValue();
     break;
   }
   case ARMISD::SMLALBB: {
     // Operands 0 and 1: low 16 bits of each.
     unsigned BitWidth = N->getValueType(0).getSizeInBits();
     APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
       return SDValue();
     break;
   }
   case ARMISD::SMLALBT: {
     // Operand 0: low 16 bits; operand 1: high 16 bits. The mask widths are
     // taken from the operands' own types here.
     unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
     APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
     unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
     APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
     if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
         (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
       return SDValue();
     break;
   }
   case ARMISD::SMLALTB: {
     // Operand 0: high 16 bits; operand 1: low 16 bits.
     unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
     APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
     unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
     APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
     if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
         (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
       return SDValue();
     break;
   }
   case ARMISD::SMLALTT: {
     // Operands 0 and 1: high 16 bits of each.
     unsigned BitWidth = N->getValueType(0).getSizeInBits();
     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
     if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
         (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
       return SDValue();
     break;
   }
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     // Operand 1 holds the intrinsic ID; the listed NEON load/store
     // intrinsics all go through PerformVLDCombine.
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
     case Intrinsic::arm_neon_vld1:
     case Intrinsic::arm_neon_vld1x2:
     case Intrinsic::arm_neon_vld1x3:
     case Intrinsic::arm_neon_vld1x4:
     case Intrinsic::arm_neon_vld2:
     case Intrinsic::arm_neon_vld3:
     case Intrinsic::arm_neon_vld4:
     case Intrinsic::arm_neon_vld2lane:
     case Intrinsic::arm_neon_vld3lane:
     case Intrinsic::arm_neon_vld4lane:
     case Intrinsic::arm_neon_vld2dup:
     case Intrinsic::arm_neon_vld3dup:
     case Intrinsic::arm_neon_vld4dup:
     case Intrinsic::arm_neon_vst1:
     case Intrinsic::arm_neon_vst1x2:
     case Intrinsic::arm_neon_vst1x3:
     case Intrinsic::arm_neon_vst1x4:
     case Intrinsic::arm_neon_vst2:
     case Intrinsic::arm_neon_vst3:
     case Intrinsic::arm_neon_vst4:
     case Intrinsic::arm_neon_vst2lane:
     case Intrinsic::arm_neon_vst3lane:
     case Intrinsic::arm_neon_vst4lane:
       return PerformVLDCombine(N, DCI);
     default: break;
     }
     break;
   }
   return SDValue();
 }
 
 /// Returns true only for f32 loads and f32 stores; every other
 /// opcode/type combination yields false.
 bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
                                                           EVT VT) const {
   if (VT != MVT::f32)
     return false;
   return Opc == ISD::LOAD || Opc == ISD::STORE;
 }
 
 /// Reports whether an access of type VT may be misaligned.
 ///
 /// \param VT   Type being accessed; non-simple types are always rejected.
 /// \param Fast Optional out-parameter, written only when the function
 ///             returns true.
 /// The two unnamed unsigned parameters are ignored.
 bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                        unsigned,
                                                        unsigned,
                                                        bool *Fast) const {
   // What a "weird" type gets converted into is unknown here, so refuse.
   if (!VT.isSimple())
     return false;
 
   // The subtarget flag models the SCTLR.A setting in ARM cpus.
   const bool UnalignedOK = Subtarget->allowsUnalignedMem();
   const auto Ty = VT.getSimpleVT().SimpleTy;
 
   if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
     // Unaligned access can use (for example) LDRB, LDRH, LDR.
     if (!UnalignedOK)
       return false;
     if (Fast)
       *Fast = Subtarget->hasV7Ops();
     return true;
   }
 
   if (Ty == MVT::f64 || Ty == MVT::v2f64) {
     // For any little-endian target with NEON, unaligned ld/st of D and Q
     // (e.g. {D0,D1}) registers is supported through vld1.i8/vst1.i8.
     // A big-endian target may also explicitly support unaligned accesses.
     const bool Supported =
         Subtarget->hasNEON() && (UnalignedOK || Subtarget->isLittle());
     if (!Supported)
       return false;
     if (Fast)
       *Fast = true;
     return true;
   }
 
   return false;
 }
 
 /// Returns true when both alignments satisfy AlignCheck. An alignment of 0
 /// is accepted unconditionally; otherwise it must be a multiple of
 /// AlignCheck.
 static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
                        unsigned AlignCheck) {
   auto Satisfies = [AlignCheck](unsigned Align) {
     return Align == 0 || Align % AlignCheck == 0;
   };
   return Satisfies(SrcAlign) && Satisfies(DstAlign);
 }
 
 /// Picks a NEON-sized type for a memory operation when that is possible,
 /// and otherwise returns MVT::Other so the target-independent logic decides.
 ///
 /// v2f64 (16 bytes) is preferred over f64 (8 bytes). A candidate is chosen
 /// when Size is at least its width and either both alignments satisfy that
 /// width or misaligned accesses of the type are reported as fast.
 EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
                                            unsigned DstAlign, unsigned SrcAlign,
                                            bool IsMemset, bool ZeroMemset,
                                            bool MemcpyStrSrc,
                                            MachineFunction &MF) const {
   const Function &F = MF.getFunction();
 
   // NEON is only considered for non-memset operations or zeroing memsets,
   // and never when the function forbids implicit floating point use.
   const bool ConsiderNEON = (!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
                             !F.hasFnAttribute(Attribute::NoImplicitFloat);
   if (!ConsiderNEON)
     return MVT::Other;
 
   // True when a Bytes-wide access of type Ty is usable for this operation.
   auto Usable = [&](MVT Ty, unsigned Bytes) {
     if (Size < Bytes)
       return false;
     if (memOpAlign(DstAlign, SrcAlign, Bytes))
       return true;
     bool Fast;
     return allowsMisalignedMemoryAccesses(Ty, 0, 1, &Fast) && Fast;
   };
 
   if (Usable(MVT::v2f64, 16))
     return MVT::v2f64;
   if (Usable(MVT::f64, 8))
     return MVT::f64;
 
   // Let the target-independent logic figure it out.
   return MVT::Other;
 }
 
 // A 64-bit integer lives in two registers (high and low halves), so
 // truncating it to 32 bits costs nothing: the low register is used as is.
 // Only the exact i64 -> i32 integer case is reported as free.
 bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
   const bool BothIntegers = SrcTy->isIntegerTy() && DstTy->isIntegerTy();
   if (!BothIntegers)
     return false;
   const unsigned FromBits = SrcTy->getPrimitiveSizeInBits();
   const unsigned ToBits = DstTy->getPrimitiveSizeInBits();
   return FromBits == 64 && ToBits == 32;
 }
 
 /// EVT flavour of isTruncateFree: true only for a scalar integer truncation
 /// from 64 bits to 32 bits.
 bool ARMTargetLowering::isTruncateFree(EVT SrcVT, EVT DstVT) const {
   const bool AnyVector = SrcVT.isVector() || DstVT.isVector();
   if (AnyVector || !SrcVT.isInteger() || !DstVT.isInteger())
     return false;
   const unsigned FromBits = SrcVT.getSizeInBits();
   const unsigned ToBits = DstVT.getSizeInBits();
   return FromBits == 64 && ToBits == 32;
 }
 
 /// Returns true when zero-extending Val to VT2 is free. That is the case
 /// only when Val is a load of i1, i8 or i16 and both types are simple
 /// integer types.
 bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   if (Val.getOpcode() != ISD::LOAD)
     return false;
 
   EVT LoadedVT = Val.getValueType();
   const bool BothSimpleInts = LoadedVT.isSimple() && LoadedVT.isInteger() &&
                               VT2.isSimple() && VT2.isInteger();
   if (!BothSimpleInts)
     return false;
 
   // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
   const auto Ty = LoadedVT.getSimpleVT().SimpleTy;
   return Ty == MVT::i1 || Ty == MVT::i8 || Ty == MVT::i16;
 }
 
 /// Returns true when fneg of VT is free: only for f16 on subtargets with
 /// full FP16 support.
 bool ARMTargetLowering::isFNegFree(EVT VT) const {
   if (!VT.isSimple())
     return false;
 
   // Quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) negate values
   // directly, so fneg is free there and the DAG combiner should not rewrite
   // it into xors and other instructions. For f16 and FullFP16 argument
   // passing some bitcast nodes may be introduced that would otherwise
   // trigger that rewrite; reporting fneg as free avoids it.
   const bool IsHalf = VT.getSimpleVT().SimpleTy == MVT::f16;
   return IsHalf && Subtarget->hasFullFP16();
 }
 
 /// Decides whether forming an extending load for ExtVal is desirable.
 /// Returns false for illegal result types and when the single user is an
 /// ADD, SUB, SHL or ARMISD::VSHL; returns true otherwise.
 bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
   if (!isTypeLegal(ExtVal.getValueType()))
     return false;
 
   // With no users, or with more than one user instruction, the loadext is
   // desirable no matter what. (One instruction may use the value twice.)
   const bool SingleUserInstr =
       !ExtVal->use_empty() &&
       ExtVal->use_begin()->isOnlyUserOf(ExtVal.getNode());
   if (!SingleUserInstr)
     return true;
 
   // Don't create a loadext if the extension can be folded into a wide/long
   // instruction.
   SDNode *User = *ExtVal->use_begin();
   switch (User->getOpcode()) {
   case ISD::ADD:
   case ISD::SUB:
   case ISD::SHL:
   case ARMISD::VSHL:
     return false;
   default:
     return true;
   }
 }
 
 /// Returns true when both types are integers and Ty1 is a legal type;
 /// false otherwise. Ty2 is only checked for being an integer type.
 bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
   const bool BothIntegers = Ty1->isIntegerTy() && Ty2->isIntegerTy();
   if (!BothIntegers || !isTypeLegal(EVT::getEVT(Ty1)))
     return false;
 
   assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
 
   // Provided the caller has no zeroext or signext return parameter,
   // truncating all the way down to i1 is valid.
   return true;
 }
 
 /// Returns the cost of the scaling factor in AM: -1 when the addressing
 /// mode is not legal, 1 for a negative scale on subtargets where
 /// hasFPAO() holds, and 0 otherwise.
 int ARMTargetLowering::getScalingFactorCost(const DataLayout &DL,
                                                 const AddrMode &AM, Type *Ty,
                                                 unsigned AS) const {
   if (!isLegalAddressingMode(DL, AM, Ty, AS))
     return -1;
 
   // Positive offsets execute faster on FPAO subtargets.
   if (Subtarget->hasFPAO() && AM.Scale < 0)
     return 1;
   return 0;
 }
 
 /// Thumb1 offset check: V must be non-negative, a multiple of the access
 /// size (1 for i1/i8, 2 for i16, 4 for i32) and, once divided by that size,
 /// fit in 5 bits. Any other type is rejected.
 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
   if (V < 0)
     return false;
 
   int64_t AccessSize;
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::i1:
   case MVT::i8:
     AccessSize = 1;
     break;
   case MVT::i16:
     AccessSize = 2;
     break;
   case MVT::i32:
     AccessSize = 4;
     break;
   default:
     return false;
   }
 
   // V is known to be non-negative at this point.
   if (V % AccessSize != 0)
     return false;
   const int64_t Scaled = V / AccessSize;
   return Scaled < (1LL << 5);
 }
 
 /// Thumb2 offset check for a load / store of type VT.
 ///
 /// Integer types (i1, i8, i16, i32) accept +imm12 or -imm8. f32 / f64
 /// require VFP2 and accept a multiple of 4 whose magnitude divided by 4
 /// fits in 8 bits. Every other type is rejected.
 ///
 /// \param V         Byte offset to test.
 /// \param VT        Accessed type; must be a simple type.
 /// \param Subtarget Queried for VFP2 availability.
 static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
                                       const ARMSubtarget *Subtarget) {
   const bool isNeg = V < 0;
   // Take the magnitude in unsigned arithmetic: negating INT64_MIN as a
   // signed value overflows, which is undefined behaviour.
   const uint64_t Mag =
       isNeg ? 0 - static_cast<uint64_t>(V) : static_cast<uint64_t>(V);
 
   switch (VT.getSimpleVT().SimpleTy) {
   default: return false;
   case MVT::i1:
   case MVT::i8:
   case MVT::i16:
   case MVT::i32:
     // + imm12 or - imm8
     if (isNeg)
       return Mag < (1ULL << 8);
     return Mag < (1ULL << 12);
   case MVT::f32:
   case MVT::f64:
     // Same as ARM mode. FIXME: NEON?
     if (!Subtarget->hasVFP2())
       return false;
     if ((Mag & 3) != 0)
       return false;
     return (Mag >> 2) < (1ULL << 8);
   }
 }
 
 /// isLegalAddressImmediate - Return true if the integer value can be used
 /// as the offset of the target addressing mode for load / store of the
 /// given type.
 ///
 /// A zero offset is always legal. Non-simple types are rejected. Thumb1 and
 /// Thumb2 subtargets are delegated to their dedicated helpers; the rest of
 /// the function handles ARM mode, where the offset's magnitude must fit in
 /// 12 bits (i1, i8, i32), 8 bits (i16), or be a multiple of 4 that fits in
 /// 8 bits after division by 4 (f32 / f64, VFP2 only).
 static bool isLegalAddressImmediate(int64_t V, EVT VT,
                                     const ARMSubtarget *Subtarget) {
   if (V == 0)
     return true;
 
   if (!VT.isSimple())
     return false;
 
   if (Subtarget->isThumb1Only())
     return isLegalT1AddressImmediate(V, VT);
   else if (Subtarget->isThumb2())
     return isLegalT2AddressImmediate(V, VT, Subtarget);
 
   // ARM mode. Take the magnitude in unsigned arithmetic: negating INT64_MIN
   // as a signed value overflows, which is undefined behaviour.
   const uint64_t Mag =
       V < 0 ? 0 - static_cast<uint64_t>(V) : static_cast<uint64_t>(V);
   switch (VT.getSimpleVT().SimpleTy) {
   default: return false;
   case MVT::i1:
   case MVT::i8:
   case MVT::i32:
     // +- imm12
     return Mag < (1ULL << 12);
   case MVT::i16:
     // +- imm8
     return Mag < (1ULL << 8);
   case MVT::f32:
   case MVT::f64:
     if (!Subtarget->hasVFP2()) // FIXME: NEON?
       return false;
     if ((Mag & 3) != 0)
       return false;
     return (Mag >> 2) < (1ULL << 8);
   }
 }
 
 /// Thumb2 check for an addressing mode with a non-zero scale. Negative
 /// scales and types other than i1/i8/i16/i32/i64/isVoid are rejected.
 bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
                                                       EVT VT) const {
   const int Scale = AM.Scale;
   if (Scale < 0)
     return false;
 
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::i1:
   case MVT::i8:
   case MVT::i16:
   case MVT::i32: {
     if (Scale == 1)
       return true;
     // r + r << imm: drop the low bit, then accept shifts by 1, 2 or 3.
     const int Shifted = Scale & ~1;
     return Shifted == 2 || Shifted == 4 || Shifted == 8;
   }
   case MVT::i64:
     // FIXME: What are we trying to model here? ldrd doesn't have an r + r
     // version in Thumb mode.
     // Accept r + r, and r * 2 without a base register (it can be lowered
     // to r + r).
     return Scale == 1 || (!AM.HasBaseReg && Scale == 2);
   case MVT::isVoid:
     // "void" uses (basically, uses that aren't loads or stores) are
     // allowed because arm can fold a scale into many arithmetic
     // operations. This should be made more precise and revisited later.
 
     // Allow r << imm, but the imm has to be a multiple of two.
     return (Scale & 1) == 0 && isPowerOf2_32(Scale);
   default:
     return false;
   }
 }
 
 /// Thumb1 check for an addressing mode with a non-zero scale. VT is not
 /// consulted.
 bool ARMTargetLowering::isLegalT1ScaledAddressingMode(const AddrMode &AM,
                                                       EVT VT) const {
   // Thumb1 addressing modes have no register scaling. Negative scales are
   // unsupported, and only two positive values are accepted.
   const int Scale = AM.Scale;
   switch (Scale) {
   case 1:
     // No scaling at all.
     return true;
   case 2:
     // Can be lowered to r + r, but only if there is no base register.
     return !AM.HasBaseReg;
   default:
     return false;
   }
 }
 
 /// isLegalAddressingMode - Return true if the addressing mode represented
 /// by AM is legal for this target, for a load/store of the specified type.
 ///
 /// The immediate offset is validated first, global bases are always
 /// rejected, and a scaled register is then checked per instruction set
 /// (Thumb1, Thumb2, or ARM mode in the body below). AS and I are not used.
 bool ARMTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                               const AddrMode &AM, Type *Ty,
                                               unsigned AS, Instruction *I) const {
   EVT VT = getValueType(DL, Ty, true);
   if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
     return false;
 
   // Can never fold addr of global into load/store.
   if (AM.BaseGV)
     return false;
 
   // No scale reg: the mode is "r+i", "r" or "i", all of which are fine.
   if (AM.Scale == 0)
     return true;
 
   // ARM doesn't support any R+R*scale+imm addr modes.
   if (AM.BaseOffs)
     return false;
 
   if (!VT.isSimple())
     return false;
 
   if (Subtarget->isThumb1Only())
     return isLegalT1ScaledAddressingMode(AM, VT);
 
   if (Subtarget->isThumb2())
     return isLegalT2ScaledAddressingMode(AM, VT);
 
   // ARM mode.
   const int Scale = AM.Scale;
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::i1:
   case MVT::i8:
   case MVT::i32: {
     // The sign of the scale does not matter for these types.
     const int Magnitude = Scale < 0 ? -Scale : Scale;
     if (Magnitude == 1)
       return true;
     // r + r << imm
     return isPowerOf2_32(Magnitude & ~1);
   }
   case MVT::i16:
   case MVT::i64:
     // Accept r +/- r, and r * 2 without a base register (it can be lowered
     // to r + r).
     return Scale == 1 || (AM.HasBaseReg && Scale == -1) ||
            (!AM.HasBaseReg && Scale == 2);
   case MVT::isVoid:
     // "void" uses (basically, uses that aren't loads or stores) are
     // allowed because arm can fold a scale into many arithmetic
     // operations. This should be made more precise and revisited later.
 
     // Allow r << imm, but the imm has to be a multiple of two.
     return (Scale & 1) == 0 && isPowerOf2_32(Scale);
   default:
     return false;
   }
 }
 
 /// isLegalICmpImmediate - Return true if the specified immediate is legal
 /// icmp immediate, that is the target has icmp instructions which can compare
 /// a register against the immediate without having to materialize the
 /// immediate into a register.
 ///
 /// Only the low 32 bits of Imm are examined in ARM and Thumb2 modes.
 bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   const uint32_t Value = static_cast<uint32_t>(Imm);
   const uint32_t Negated = 0 - Value;
 
   // ARM and Thumb2 modes can use cmn for negative immediates, so either
   // the value or its negation may be encodable.
   if (!Subtarget->isThumb())
     return ARM_AM::getSOImmVal(Value) != -1 ||
            ARM_AM::getSOImmVal(Negated) != -1;
   if (Subtarget->isThumb2())
     return ARM_AM::getT2SOImmVal(Value) != -1 ||
            ARM_AM::getT2SOImmVal(Negated) != -1;
 
   // Thumb1 doesn't have cmn, and only 8-bit immediates.
   return Imm >= 0 && Imm <= 255;
 }
 
 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
 /// *or sub* immediate, that is the target has add or sub instructions which can
 /// add a register with the immediate without having to materialize the
 /// immediate into a register.
 bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
   // Same encoding for add/sub, just flip the sign. The magnitude is taken
   // in unsigned arithmetic because std::abs(INT64_MIN) is undefined
   // behaviour.
   const uint64_t AbsImm =
       Imm < 0 ? 0 - static_cast<uint64_t>(Imm) : static_cast<uint64_t>(Imm);
 
   // NOTE(review): the encoders take a 32-bit argument, so bits above bit 31
   // of the magnitude are dropped here, exactly as the implicit conversion
   // did before. Confirm callers never pass wider immediates.
   const unsigned Low32 = static_cast<unsigned>(AbsImm);
   if (!Subtarget->isThumb())
     return ARM_AM::getSOImmVal(Low32) != -1;
   if (Subtarget->isThumb2())
     return ARM_AM::getT2SOImmVal(Low32) != -1;
   // Thumb1 only has 8-bit unsigned immediate.
   return AbsImm <= 255;
 }
 
 /// Splits an ARM-mode ADD/SUB address computation into Base and Offset for
 /// an indexed load / store of type VT, and reports through isInc whether
 /// the offset is added. Returns false (leaving the outputs untouched) when
 /// Ptr is neither an ADD nor a SUB, or when VT is not i1, i8, i16 or i32.
 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                       bool isSEXTLoad, SDValue &Base,
                                       SDValue &Offset, bool &isInc,
                                       SelectionDAG &DAG) {
   const unsigned Opc = Ptr->getOpcode();
   if (Opc != ISD::ADD && Opc != ISD::SUB)
     return false;
 
   // If operand 1 is a constant strictly between -Bound and 0, turn the
   // address into a decrement by the positive amount: sets isInc and Offset
   // and returns true. Otherwise does nothing and returns false.
   auto FoldNegativeImm = [&](int Bound) {
     auto *C = dyn_cast<ConstantSDNode>(Ptr->getOperand(1));
     if (!C)
       return false;
     const int Imm = (int)C->getZExtValue();
     if (Imm >= 0 || Imm <= -Bound)
       return false;
     assert(Ptr->getOpcode() == ISD::ADD);
     isInc = false;
     Offset = DAG.getConstant(-Imm, SDLoc(Ptr), C->getValueType(0));
     return true;
   };
 
   const bool IsByteOrBit = VT == MVT::i8 || VT == MVT::i1;
 
   if (VT == MVT::i16 || (IsByteOrBit && isSEXTLoad)) {
     // AddressingMode 3
     Base = Ptr->getOperand(0);
     if (!FoldNegativeImm(256)) {
       isInc = (Opc == ISD::ADD);
       Offset = Ptr->getOperand(1);
     }
     return true;
   }
 
   if (VT != MVT::i32 && !IsByteOrBit) {
     // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
     return false;
   }
 
   // AddressingMode 2
   if (FoldNegativeImm(0x1000)) {
     Base = Ptr->getOperand(0);
     return true;
   }
 
   isInc = (Opc == ISD::ADD);
   // For an ADD whose first operand is a shift, that operand becomes the
   // offset and the second operand the base.
   const bool ShiftIsFirst =
       isInc && ARM_AM::getShiftOpcForNode(Ptr->getOperand(0).getOpcode()) !=
                    ARM_AM::no_shift;
   Base = Ptr->getOperand(ShiftIsFirst ? 1 : 0);
   Offset = Ptr->getOperand(ShiftIsFirst ? 0 : 1);
   return true;
 }
 
 /// Thumb2 counterpart of getARMIndexedAddressParts: succeeds only when Ptr
 /// is an ADD/SUB whose second operand is a non-zero constant with a
 /// magnitude below 0x100. Base is set to operand 0 as soon as the opcode
 /// check passes, even if the function then returns false. VT and
 /// isSEXTLoad are not consulted.
 static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
                                      bool isSEXTLoad, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
   const unsigned Opc = Ptr->getOpcode();
   if (Opc != ISD::ADD && Opc != ISD::SUB)
     return false;
 
   Base = Ptr->getOperand(0);
   auto *C = dyn_cast<ConstantSDNode>(Ptr->getOperand(1));
   if (!C)
     return false;
 
   // 8 bits of magnitude, and zero is not allowed.
   const int Imm = (int)C->getZExtValue();
   if (Imm == 0 || Imm <= -0x100 || Imm >= 0x100)
     return false;
 
   if (Imm < 0) {
     // base + (-imm) becomes a decrement by imm.
     assert(Ptr->getOpcode() == ISD::ADD);
     isInc = false;
     Offset = DAG.getConstant(-Imm, SDLoc(Ptr), C->getValueType(0));
   } else {
     isInc = Opc == ISD::ADD;
     Offset = DAG.getConstant(Imm, SDLoc(Ptr), C->getValueType(0));
   }
   return true;
 }
 
 /// getPreIndexedAddressParts - returns true by value, base pointer and
 /// offset pointer and addressing mode by reference if the node's address
 /// can be legally represented as pre-indexed load / store address.
 ///
 /// Always fails on Thumb1-only subtargets and for nodes that are neither
 /// loads nor stores.
 bool
 ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                              SDValue &Offset,
                                              ISD::MemIndexedMode &AM,
                                              SelectionDAG &DAG) const {
   if (Subtarget->isThumb1Only())
     return false;
 
   // Pull the address and memory type out of the load or store.
   EVT MemVT;
   SDValue Addr;
   bool SignExtLoad = false;
   if (auto *Load = dyn_cast<LoadSDNode>(N)) {
     Addr = Load->getBasePtr();
     MemVT = Load->getMemoryVT();
     SignExtLoad = Load->getExtensionType() == ISD::SEXTLOAD;
   } else if (auto *Store = dyn_cast<StoreSDNode>(N)) {
     Addr = Store->getBasePtr();
     MemVT = Store->getMemoryVT();
   } else {
     return false;
   }
 
   // Let the instruction-set specific helper split the address.
   bool isInc;
   const bool Legal =
       Subtarget->isThumb2()
           ? getT2IndexedAddressParts(Addr.getNode(), MemVT, SignExtLoad, Base,
                                      Offset, isInc, DAG)
           : getARMIndexedAddressParts(Addr.getNode(), MemVT, SignExtLoad,
                                       Base, Offset, isInc, DAG);
   if (!Legal)
     return false;
 
   AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
   return true;
 }
 
 /// getPostIndexedAddressParts - Decide whether the arithmetic node \p Op can
 /// be folded into the load / store \p N as a post-indexed access. Returns true
 /// on success and hands back the base pointer, the offset and the addressing
 /// mode through \p Base, \p Offset and \p AM.
 bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                                    SDValue &Base,
                                                    SDValue &Offset,
                                                    ISD::MemIndexedMode &AM,
                                                    SelectionDAG &DAG) const {
   EVT MemVT;
   SDValue Addr;
   bool SignExtends = false;
   bool PlainAccess; // True for a non-extending load / non-truncating store.
   if (auto *Load = dyn_cast<LoadSDNode>(N)) {
     const auto ExtKind = Load->getExtensionType();
     MemVT = Load->getMemoryVT();
     Addr = Load->getBasePtr();
     SignExtends = ExtKind == ISD::SEXTLOAD;
     PlainAccess = ExtKind == ISD::NON_EXTLOAD;
   } else if (auto *Store = dyn_cast<StoreSDNode>(N)) {
     MemVT = Store->getMemoryVT();
     Addr = Store->getBasePtr();
     PlainAccess = !Store->isTruncatingStore();
   } else {
     return false;
   }
 
   if (Subtarget->isThumb1Only()) {
     // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
     // must be non-extending/truncating, i32, with an offset of 4.
     assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
     if (!PlainAccess || Op->getOpcode() != ISD::ADD)
       return false;
     auto *Step = dyn_cast<ConstantSDNode>(Op->getOperand(1));
     if (!Step || Step->getZExtValue() != 4)
       return false;
 
     Base = Op->getOperand(0);
     Offset = Op->getOperand(1);
     AM = ISD::POST_INC;
     return true;
   }
 
   bool Increments;
   const bool Matched =
       Subtarget->isThumb2()
           ? getT2IndexedAddressParts(Op, MemVT, SignExtends, Base, Offset,
                                      Increments, DAG)
           : getARMIndexedAddressParts(Op, MemVT, SignExtends, Base, Offset,
                                       Increments, DAG);
   if (!Matched)
     return false;
 
   // Post-indexed load / store update the base pointer, so the matched base
   // has to be the access's own address.
   if (Addr != Base) {
     // Swap base ptr and offset to catch more post-index load / store when
     // it's legal. In Thumb2 mode, offset must be an immediate.
     const bool CanSwap = Addr == Offset && Op->getOpcode() == ISD::ADD &&
                          !Subtarget->isThumb2();
     if (!CanSwap)
       return false;
     std::swap(Base, Offset);
   }
 
   AM = Increments ? ISD::POST_INC : ISD::POST_DEC;
   return true;
 }
 
 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                       KnownBits &Known,
                                                       const APInt &DemandedElts,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
   unsigned BitWidth = Known.getBitWidth();
   Known.resetAll();
   switch (Op.getOpcode()) {
   default: break;
   case ARMISD::ADDC:
   case ARMISD::ADDE:
   case ARMISD::SUBC:
   case ARMISD::SUBE:
     // Special cases when we convert a carry to a boolean.
     if (Op.getResNo() == 0) {
       SDValue LHS = Op.getOperand(0);
       SDValue RHS = Op.getOperand(1);
       // (ADDE 0, 0, C) will give us a single bit.
       if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
           isNullConstant(RHS)) {
         Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
         return;
       }
     }
     break;
   case ARMISD::CMOV: {
     // Bits are known zero/one if known on the LHS and RHS.
     DAG.computeKnownBits(Op.getOperand(0), Known, Depth+1);
     if (Known.isUnknown())
       return;
 
     KnownBits KnownRHS;
     DAG.computeKnownBits(Op.getOperand(1), KnownRHS, Depth+1);
     Known.Zero &= KnownRHS.Zero;
     Known.One  &= KnownRHS.One;
     return;
   }
   case ISD::INTRINSIC_W_CHAIN: {
     ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
     Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
     switch (IntID) {
     default: return;
     case Intrinsic::arm_ldaex:
     case Intrinsic::arm_ldrex: {
       EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
       unsigned MemBits = VT.getScalarSizeInBits();
       Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
       return;
     }
     }
   }
   case ARMISD::BFI: {
     // Conservatively, we can recurse down the first operand
     // and just mask out all affected bits.
     DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
 
     // The operand to BFI is already a mask suitable for removing the bits it
     // sets.
     ConstantSDNode *CI = cast<ConstantSDNode>(Op.getOperand(2));
     const APInt &Mask = CI->getAPIntValue();
     Known.Zero &= Mask;
     Known.One &= Mask;
     return;
   }
   }
 }
 
 //===----------------------------------------------------------------------===//
 //                           ARM Inline Assembly Support
 //===----------------------------------------------------------------------===//
 
 /// Recognise the single-statement inline asm "rev $0, $1" whose constraint
 /// string starts with "=l,l" and whose result is a 32-bit integer, and hand it
 /// to IntrinsicLowering::LowerToByteSwap. Returns that call's result, or false
 /// when the asm is not recognised or the subtarget lacks V6 operations.
 bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
   // Looking for "rev" which is V6+.
   if (!Subtarget->hasV6Ops())
     return false;
 
   InlineAsm *Asm = cast<InlineAsm>(CI->getCalledValue());
 
   // Break the asm text into statements; only a lone statement is handled.
   std::string AsmText = Asm->getAsmString();
   SmallVector<StringRef, 4> Statements;
   SplitString(AsmText, Statements, ";\n");
   if (Statements.size() != 1)
     return false;
 
   // Tokenise the statement from its own copy of the text.
   std::string Statement = Statements[0].str();
   SmallVector<StringRef, 4> Tokens;
   SplitString(Statement, Tokens, " \t,");
 
   // rev $0, $1
   const bool IsRev = Tokens.size() == 3 && Tokens[0] == "rev" &&
                      Tokens[1] == "$0" && Tokens[2] == "$1";
   if (!IsRev || Asm->getConstraintString().compare(0, 4, "=l,l") != 0)
     return false;
 
   auto *IntTy = dyn_cast<IntegerType>(CI->getType());
   if (!IntTy || IntTy->getBitWidth() != 32)
     return false;
 
   return IntrinsicLowering::LowerToByteSwap(CI);
 }
 
 /// Pick the concrete constraint ("r" or "w") that stands in for an "X"
 /// constraint on an operand of type \p ConstraintVT.
 const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
   // At this point, we have to lower this constraint to something else, so we
   // lower it to an "r" or "w". However, by doing this we will force the result
   // to be in register, while the X constraint is much more permissive.
   //
   // Although we are correct (we are free to emit anything, without
   // constraints), we might break use cases that would expect us to be more
   // efficient and emit something else.
   if (!Subtarget->hasVFP2())
     return "r";
 
   // Evaluated lazily: the size is only queried for vectors on NEON targets.
   auto IsNEONSizedVector = [&]() {
     if (!ConstraintVT.isVector() || !Subtarget->hasNEON())
       return false;
     return ConstraintVT.getSizeInBits() == 64 ||
            ConstraintVT.getSizeInBits() == 128;
   };
 
   const bool UseW = ConstraintVT.isFloatingPoint() || IsNEONSizedVector();
   return UseW ? "w" : "r";
 }
 
 /// getConstraintType - Classify an inline-asm constraint string for this
 /// target. Anything not recognised here is passed on to
 /// TargetLowering::getConstraintType.
 ARMTargetLowering::ConstraintType
 ARMTargetLowering::getConstraintType(StringRef Constraint) const {
   const size_t Len = Constraint.size();
 
   if (Len == 1) {
     switch (Constraint[0]) {
     case 'l':
     case 'w':
     case 'h':
     case 'x':
     case 't':
       return C_RegisterClass;
     case 'j':
       // Constant for movw.
       return C_Other;
     case 'Q':
       // An address with a single base register. Due to the way we
       // currently handle addresses it is the same as an 'r' memory constraint.
       return C_Memory;
     default:
       break;
     }
   } else if (Len == 2 && Constraint[0] == 'U') {
     // All 'U+' constraints are addresses.
     return C_Memory;
   }
 
   return TargetLowering::getConstraintType(Constraint);
 }
 
 /// Weigh how well the call operand described by \p info matches the single
 /// constraint \p constraint. \p info must already carry the operand and the
 /// currently selected alternative constraint.
 ///
 /// Returns CW_Default when there is no operand value to inspect, CW_Invalid
 /// when an 'l' / 'w' constraint meets an operand of the wrong kind, and the
 /// generic TargetLowering weight for every other constraint letter.
 TargetLowering::ConstraintWeight
 ARMTargetLowering::getSingleConstraintMatchWeight(
     AsmOperandInfo &info, const char *constraint) const {
   // If we don't have a value, we can't do a match,
   // but allow it at the lowest weight.
   Value *OperandVal = info.CallOperandVal;
   if (!OperandVal)
     return CW_Default;
 
   Type *OperandTy = OperandVal->getType();
   switch (*constraint) {
   case 'l':
     if (!OperandTy->isIntegerTy())
       return CW_Invalid;
     return Subtarget->isThumb() ? CW_SpecificReg : CW_Register;
   case 'w':
     return OperandTy->isFloatingPointTy() ? CW_Register : CW_Invalid;
   default:
     return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
   }
 }
 
 using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
 
 /// Map an inline-asm register constraint to a (register, register class)
 /// pair. The single-letter GCC ARM constraints 'l', 'h', 'r', 'w', 'x' and
 /// 't' yield register number 0 together with a class; "{cc}" (compared
 /// case-insensitively) yields CPSR. Everything else, including letters whose
 /// value type fits none of the candidate classes, is delegated to
 /// TargetLowering::getRegForInlineAsmConstraint.
 RCPair ARMTargetLowering::getRegForInlineAsmConstraint(
     const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
   // Choose among a single / double / quad register class by the type of VT.
   // Yields nullptr when VT is MVT::Other or matches none of the three.
   auto PickBySize = [&](bool AcceptI32, const TargetRegisterClass *Single,
                         const TargetRegisterClass *Double,
                         const TargetRegisterClass *Quad)
       -> const TargetRegisterClass * {
     if (VT == MVT::Other)
       return nullptr;
     if (VT == MVT::f32 || (AcceptI32 && VT == MVT::i32))
       return Single;
     if (VT.getSizeInBits() == 64)
       return Double;
     if (VT.getSizeInBits() == 128)
       return Quad;
     return nullptr;
   };
 
   if (Constraint.size() == 1) {
     const TargetRegisterClass *RC = nullptr;
     // GCC ARM Constraint Letters
     switch (Constraint[0]) {
     case 'l': // Low regs or general regs.
       RC = Subtarget->isThumb() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
       break;
     case 'h': // High regs or no regs.
       if (Subtarget->isThumb())
         RC = &ARM::hGPRRegClass;
       break;
     case 'r':
       RC = Subtarget->isThumb1Only() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
       break;
     case 'w':
       RC = PickBySize(/*AcceptI32=*/false, &ARM::SPRRegClass,
                       &ARM::DPRRegClass, &ARM::QPRRegClass);
       break;
     case 'x':
       RC = PickBySize(/*AcceptI32=*/false, &ARM::SPR_8RegClass,
                       &ARM::DPR_8RegClass, &ARM::QPR_8RegClass);
       break;
     case 't':
       RC = PickBySize(/*AcceptI32=*/true, &ARM::SPRRegClass,
                       &ARM::DPR_VFP2RegClass, &ARM::QPR_VFP2RegClass);
       break;
     }
     if (RC)
       return RCPair(0U, RC);
   }
 
   if (StringRef("{cc}").equals_lower(Constraint))
     return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
 
   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 
 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
 /// vector.  If it is invalid, don't add anything to Ops.
 ///
 /// Only the single-letter immediate constraints 'j' and 'I' .. 'O' are
 /// validated here. For those, \p Op must be a constant that fits in 32 bits
 /// and satisfies the letter's range rule for the current subtarget; a valid
 /// value is pushed onto \p Ops as a target constant. Any other single-letter
 /// constraint is forwarded to TargetLowering::LowerAsmOperandForConstraint,
 /// and constraints that are not exactly one character long are ignored.
 void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                      std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
   SDValue Result;
 
   // Currently only support length 1 constraints.
   if (Constraint.length() != 1) return;
 
   char ConstraintLetter = Constraint[0];
   switch (ConstraintLetter) {
   default: break;
   case 'j':
   case 'I': case 'J': case 'K': case 'L':
   case 'M': case 'N': case 'O': {
     ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
     if (!C)
       return;
 
     int64_t CVal64 = C->getSExtValue();
     int CVal = (int) CVal64;
     // None of these constraints allow values larger than 32 bits.  Check
     // that the value fits in an int.
     if (CVal != CVal64)
       return;
 
     // Two's-complement views of the value, computed in unsigned arithmetic so
     // that negating or decrementing INT_MIN is well defined.
     const unsigned UVal = static_cast<unsigned>(CVal);
     const unsigned NegUVal = 0U - UVal;
 
     switch (ConstraintLetter) {
       case 'j':
         // Constant suitable for movw, must be between 0 and
         // 65535.
         if (Subtarget->hasV6T2Ops())
           if (CVal >= 0 && CVal <= 65535)
             break;
         return;
       case 'I':
         if (Subtarget->isThumb1Only()) {
           // This must be a constant between 0 and 255, for ADD
           // immediates.
           if (CVal >= 0 && CVal <= 255)
             break;
         } else if (Subtarget->isThumb2()) {
           // A constant that can be used as an immediate value in a
           // data-processing instruction.
           if (ARM_AM::getT2SOImmVal(CVal) != -1)
             break;
         } else {
           // A constant that can be used as an immediate value in a
           // data-processing instruction.
           if (ARM_AM::getSOImmVal(CVal) != -1)
             break;
         }
         return;
 
       case 'J':
         if (Subtarget->isThumb1Only()) {
           // This must be a constant between -255 and -1, for negated ADD
           // immediates. This can be used in GCC with an "n" modifier that
           // prints the negated value, for use with SUB instructions. It is
           // not useful otherwise but is implemented for compatibility.
           if (CVal >= -255 && CVal <= -1)
             break;
         } else {
           // This must be a constant between -4095 and 4095. It is not clear
           // what this constraint is intended for. Implemented for
           // compatibility with GCC.
           if (CVal >= -4095 && CVal <= 4095)
             break;
         }
         return;
 
       case 'K':
         if (Subtarget->isThumb1Only()) {
           // A 32-bit value where only one byte has a nonzero value. Exclude
           // zero to match GCC. This constraint is used by GCC internally for
           // constants that can be loaded with a move/shift combination.
           // It is not useful otherwise but is implemented for compatibility.
           if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
             break;
         } else if (Subtarget->isThumb2()) {
           // A constant whose bitwise inverse can be used as an immediate
           // value in a data-processing instruction. This can be used in GCC
           // with a "B" modifier that prints the inverted value, for use with
           // BIC and MVN instructions. It is not useful otherwise but is
           // implemented for compatibility.
           if (ARM_AM::getT2SOImmVal(~CVal) != -1)
             break;
         } else {
           // A constant whose bitwise inverse can be used as an immediate
           // value in a data-processing instruction. This can be used in GCC
           // with a "B" modifier that prints the inverted value, for use with
           // BIC and MVN instructions. It is not useful otherwise but is
           // implemented for compatibility.
           if (ARM_AM::getSOImmVal(~CVal) != -1)
             break;
         }
         return;
 
       case 'L':
         if (Subtarget->isThumb1Only()) {
           // This must be a constant between -7 and 7 (both inclusive, as in
           // GCC), for 3-operand ADD/SUB immediate instructions.
           if (CVal >= -7 && CVal <= 7)
             break;
         } else if (Subtarget->isThumb2()) {
           // A constant whose negation can be used as an immediate value in a
           // data-processing instruction. This can be used in GCC with an "n"
           // modifier that prints the negated value, for use with SUB
           // instructions. It is not useful otherwise but is implemented for
           // compatibility.
           if (ARM_AM::getT2SOImmVal(NegUVal) != -1)
             break;
         } else {
           // A constant whose negation can be used as an immediate value in a
           // data-processing instruction. This can be used in GCC with an "n"
           // modifier that prints the negated value, for use with SUB
           // instructions. It is not useful otherwise but is implemented for
           // compatibility.
           if (ARM_AM::getSOImmVal(NegUVal) != -1)
             break;
         }
         return;
 
       case 'M':
         if (Subtarget->isThumb1Only()) {
           // This must be a multiple of 4 between 0 and 1020, for
           // ADD sp + immediate.
           if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
             break;
         } else {
           // A power of two or a constant between 0 and 32.  This is used in
           // GCC for the shift amount on shifted register operands, but it is
           // useful in general for any shift amounts.
           if ((CVal >= 0 && CVal <= 32) || ((UVal & (UVal - 1)) == 0))
             break;
         }
         return;
 
       case 'N':
         if (Subtarget->isThumb()) {  // FIXME thumb2
           // This must be a constant between 0 and 31, for shift amounts.
           if (CVal >= 0 && CVal <= 31)
             break;
         }
         return;
 
       case 'O':
         if (Subtarget->isThumb()) {  // FIXME thumb2
           // This must be a multiple of 4 between -508 and 508, for
           // ADD/SUB sp = sp + immediate.
           if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
             break;
         }
         return;
     }
     Result = DAG.getTargetConstant(CVal, SDLoc(Op), Op.getValueType());
     break;
   }
   }
 
   if (Result.getNode()) {
     Ops.push_back(Result);
     return;
   }
   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
 /// Select the combined divide/remainder runtime call for node \p N at integer
 /// width \p SVT. Signed variants are chosen for SDIVREM / SREM, unsigned ones
 /// for UDIVREM / UREM. Only i8, i16, i32 and i64 are supported.
 static RTLIB::Libcall getDivRemLibcall(
     const SDNode *N, MVT::SimpleValueType SVT) {
   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
          "Unhandled Opcode in getDivRemLibcall");
 
   const unsigned Opcode = N->getOpcode();
   const bool Signed = Opcode == ISD::SDIVREM || Opcode == ISD::SREM;
 
   switch (SVT) {
   case MVT::i8:
     return Signed ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
   case MVT::i16:
     return Signed ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
   case MVT::i32:
     return Signed ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
   case MVT::i64:
     return Signed ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
   default:
     break;
   }
   llvm_unreachable("Unexpected request for libcall!");
 }
 
 /// Build the call argument list for a divide/remainder runtime call from the
 /// operands of \p N. Each argument is flagged sign-extended for the signed
 /// opcodes and zero-extended otherwise. On Windows targets the first two
 /// arguments are exchanged.
 static TargetLowering::ArgListTy getDivRemArgList(
     const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
   assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
           N->getOpcode() == ISD::SREM    || N->getOpcode() == ISD::UREM) &&
          "Unhandled Opcode in getDivRemArgList");
 
   const unsigned Opcode = N->getOpcode();
   const bool Signed = Opcode == ISD::SDIVREM || Opcode == ISD::SREM;
 
   TargetLowering::ArgListTy Args;
   const unsigned NumOps = N->getNumOperands();
   for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
     const SDValue Operand = N->getOperand(Idx);
     TargetLowering::ArgListEntry Arg;
     Arg.Node = Operand;
     Arg.Ty = Operand.getValueType().getTypeForEVT(*Context);
     Arg.IsSExt = Signed;
     Arg.IsZExt = !Signed;
     Args.push_back(Arg);
   }
 
   if (Subtarget->isTargetWindows() && Args.size() >= 2)
     std::swap(Args[0], Args[1]);
   return Args;
 }
 
 /// Custom-lower an SDIVREM / UDIVREM node. With a hardware divider and an i32
 /// result the node becomes divide + multiply + subtract; otherwise it becomes
 /// a call to the matching divrem runtime routine, whose result value is
 /// returned.
 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
   assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
           Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
           Subtarget->isTargetWindows()) &&
          "Register-based DivRem lowering only");
   unsigned Opcode = Op->getOpcode();
   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
          "Invalid opcode for Div/Rem lowering");
   const bool Signed = Opcode == ISD::SDIVREM;
   const EVT ResVT = Op->getValueType(0);
   Type *ResTy = ResVT.getTypeForEVT(*DAG.getContext());
   SDLoc Loc(Op);
 
   // If the target has hardware divide, use divide + multiply + subtract:
   //     div = a / b
   //     rem = a - b * div
   //     return {div, rem}
   // This should be lowered into UDIV/SDIV + MLS later on.
   bool HasHWDivide;
   if (Subtarget->isThumb())
     HasHWDivide = Subtarget->hasDivideInThumbMode();
   else
     HasHWDivide = Subtarget->hasDivideInARMMode();
 
   if (HasHWDivide && ResVT.isSimple() && ResVT.getSimpleVT() == MVT::i32) {
     const SDValue Numerator = Op->getOperand(0);
     const SDValue Denominator = Op->getOperand(1);
     SDValue Quotient = DAG.getNode(Signed ? ISD::SDIV : ISD::UDIV, Loc, ResVT,
                                    Numerator, Denominator);
     SDValue Product =
         DAG.getNode(ISD::MUL, Loc, ResVT, Quotient, Denominator);
     SDValue Remainder =
         DAG.getNode(ISD::SUB, Loc, ResVT, Numerator, Product);
 
     SDValue Parts[2] = {Quotient, Remainder};
     return DAG.getNode(ISD::MERGE_VALUES, Loc, DAG.getVTList(ResVT, ResVT),
                        Parts);
   }
 
   // No usable divider: go through the runtime library instead.
   const RTLIB::Libcall LC =
       getDivRemLibcall(Op.getNode(), ResVT.getSimpleVT().SimpleTy);
   SDValue Chain = DAG.getEntryNode();
 
   TargetLowering::ArgListTy CallArgs =
       getDivRemArgList(Op.getNode(), DAG.getContext(), Subtarget);
 
   SDValue Target = DAG.getExternalSymbol(getLibcallName(LC),
                                          getPointerTy(DAG.getDataLayout()));
 
   // The routine returns a {value, value} pair of the result type.
   Type *PairTy = StructType::get(ResTy, ResTy);
 
   if (Subtarget->isTargetWindows())
     Chain = WinDBZCheckDenominator(DAG, Op.getNode(), Chain);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(Loc);
   CLI.setChain(Chain);
   CLI.setCallee(getLibcallCallingConv(LC), PairTy, Target,
                 std::move(CallArgs));
   CLI.setInRegister();
   CLI.setSExtResult(Signed);
   CLI.setZExtResult(!Signed);
 
   std::pair<SDValue, SDValue> Lowered = LowerCallTo(CLI);
   return Lowered.first;
 }
 
 // Lowers REM using divmod helpers
 // see RTABI section 4.2/4.3
 //
 // The helper is called with the operands of N and returns a {div, rem} pair;
 // only the remainder (second operand of the call's result node) is returned.
 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
   LLVMContext &Ctx = *DAG.getContext();
   const MVT::SimpleValueType SimpleTy =
       N->getValueType(0).getSimpleVT().SimpleTy;
 
   // Element type of the returned pair, matching the width of the result.
   Type *ElementTy;
   switch (SimpleTy) {
   case MVT::i8:
     ElementTy = Type::getInt8Ty(Ctx);
     break;
   case MVT::i16:
     ElementTy = Type::getInt16Ty(Ctx);
     break;
   case MVT::i32:
     ElementTy = Type::getInt32Ty(Ctx);
     break;
   case MVT::i64:
     ElementTy = Type::getInt64Ty(Ctx);
     break;
   default:
     llvm_unreachable("Unexpected request for libcall!");
   }
 
   // Build return types (div and rem)
   Type *PairElements[] = {ElementTy, ElementTy};
   Type *PairTy = StructType::get(Ctx, ArrayRef<Type *>(PairElements));
 
   const RTLIB::Libcall LC = getDivRemLibcall(N, SimpleTy);
   SDValue Chain = DAG.getEntryNode();
   TargetLowering::ArgListTy CallArgs =
       getDivRemArgList(N, DAG.getContext(), Subtarget);
   const bool Signed = N->getOpcode() == ISD::SREM;
   SDValue Target = DAG.getExternalSymbol(getLibcallName(LC),
                                          getPointerTy(DAG.getDataLayout()));
 
   if (Subtarget->isTargetWindows())
     Chain = WinDBZCheckDenominator(DAG, N, Chain);
 
   // Lower call
   CallLoweringInfo CLI(DAG);
   CLI.setChain(Chain);
   CLI.setCallee(CallingConv::ARM_AAPCS, PairTy, Target, std::move(CallArgs));
   CLI.setSExtResult(Signed);
   CLI.setZExtResult(!Signed);
   CLI.setDebugLoc(SDLoc(N));
   std::pair<SDValue, SDValue> Lowered = LowerCallTo(CLI);
 
   // Return second (rem) result operand (first contains div)
   SDNode *ResNode = Lowered.first.getNode();
   assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
   return ResNode->getOperand(1);
 }
 
 /// Custom-lower a dynamic stack allocation; asserts that the target is
 /// Windows. Operand 0 of \p Op is the incoming chain, operand 1 the requested
 /// size and operand 2 the alignment (read only on the no-probe path).
 ///
 /// Returns merged values {new stack pointer, output chain}.
 SDValue
 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
   assert(Subtarget->isTargetWindows() && "unsupported target platform");
   SDLoc DL(Op);
 
   // Get the inputs.
   SDValue Chain = Op.getOperand(0);
   SDValue Size  = Op.getOperand(1);
 
   // Functions marked "no-stack-arg-probe" adjust SP inline instead of going
   // through the WIN__CHKSTK node below.
   if (DAG.getMachineFunction().getFunction().hasFnAttribute(
           "no-stack-arg-probe")) {
     unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
     SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
     Chain = SP.getValue(1);
     // New SP = old SP - Size, then ANDed with -Align when Align is non-zero.
     SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
     if (Align)
       SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
                        DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
     Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
     SDValue Ops[2] = { SP, Chain };
     return DAG.getMergeValues(Ops, DL);
   }
 
   // Size >> 2, i.e. the requested size expressed in units of four.
   SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
                               DAG.getConstant(2, DL, MVT::i32));
 
   // Copy the word count into R4 and glue that copy to the WIN__CHKSTK node.
   SDValue Flag;
   Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Flag);
   Flag = Chain.getValue(1);
 
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Flag);
 
   // Read SP back after the WIN__CHKSTK node; that value is the result.
   SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
   Chain = NewSP.getValue(1);
 
   SDValue Ops[2] = { NewSP, Chain };
   return DAG.getMergeValues(Ops, DL);
 }
 
 /// Custom-lower FP_EXTEND to f64 on single-precision-only FP subtargets by
 /// calling the runtime routine chosen by RTLIB::getFPEXT for the source and
 /// result types. Returns the value produced by that call.
 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
   assert(Op.getValueType() == MVT::f64 && Subtarget->isFPOnlySP() &&
          "Unexpected type for custom-lowering FP_EXTEND");
 
   const EVT DstVT = Op.getValueType();
   SDValue Source = Op.getOperand(0);
   const RTLIB::Libcall LC = RTLIB::getFPEXT(Source.getValueType(), DstVT);
 
   std::pair<SDValue, SDValue> Call =
       makeLibCall(DAG, LC, DstVT, Source, /*isSigned*/ false, SDLoc(Op));
   return Call.first;
 }
 
 /// Custom-lower FP_ROUND from f64 on single-precision-only FP subtargets by
 /// calling the runtime routine chosen by RTLIB::getFPROUND for the source and
 /// result types. Returns the value produced by that call.
 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   assert(Op.getOperand(0).getValueType() == MVT::f64 &&
          Subtarget->isFPOnlySP() &&
          "Unexpected type for custom-lowering FP_ROUND");
 
   const EVT DstVT = Op.getValueType();
   SDValue Source = Op.getOperand(0);
   const RTLIB::Libcall LC = RTLIB::getFPROUND(Source.getValueType(), DstVT);
 
   std::pair<SDValue, SDValue> Call =
       makeLibCall(DAG, LC, DstVT, Source, /*isSigned*/ false, SDLoc(Op));
   return Call.first;
 }
 
 /// Report whether an offset may be folded into the global address \p GA.
 /// Always answers no: the ARM target isn't yet aware of offsets.
 bool ARMTargetLowering::isOffsetFoldingLegal(
     const GlobalAddressSDNode *GA) const {
   return false;
 }
 
 /// Return true when \p v is an inverted bit-field mask: there can be 1's on
 /// either or both "outsides", while all the "inside" bits must be 0's. The
 /// all-ones value is rejected explicitly.
 bool ARM::isBitFieldInvertedMask(unsigned v) {
   const bool AllOnes = v == 0xffffffff;
   // The complement of a valid mask is one shifted run of set bits.
   return !AllOnes && isShiftedMask_32(~v);
 }
 
 /// isFPImmLegal - Returns true if the target can instruction select the
 /// specified FP immediate natively. If false, the legalizer will
 /// materialize the FP immediate as a load from a constant pool.
 ///
 /// Requires VFP3. f16 additionally needs full FP16 support and f64 needs a
 /// subtarget that is not single-precision-only; the immediate is legal when
 /// the matching ARM_AM::getFP*Imm helper returns something other than -1.
 bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   if (!Subtarget->hasVFP3())
     return false;
 
   // Stays -1 for types (or feature combinations) that are not handled.
   int Encoded = -1;
   if (VT == MVT::f16) {
     if (Subtarget->hasFullFP16())
       Encoded = ARM_AM::getFP16Imm(Imm);
   } else if (VT == MVT::f32) {
     Encoded = ARM_AM::getFP32Imm(Imm);
   } else if (VT == MVT::f64) {
     if (!Subtarget->isFPOnlySP())
       Encoded = ARM_AM::getFP64Imm(Imm);
   }
   return Encoded != -1;
 }
 
 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
 /// specified in the intrinsic calls.
 ///
 /// The exclusive load / store intrinsics (ldrex, ldaex, strex, stlex and
 /// their doubleword forms) are described here as well. Fills \p Info and
 /// returns true for a handled intrinsic; returns false otherwise.
 bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                            const CallInst &I,
                                            MachineFunction &MF,
                                            unsigned Intrinsic) const {
   // Conservative memory type for loads: an i64-element vector spanning the
   // entire set of vectors produced by the call.
   auto LoadedMemVT = [&I]() -> EVT {
     const auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
     uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
     return EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
   };
 
   // Conservative memory type for stores: an i64-element vector spanning the
   // run of vector arguments that starts at argument 1.
   auto StoredMemVT = [&I]() -> EVT {
     const auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
     unsigned NumElts = 0;
     for (unsigned Idx = 1, End = I.getNumArgOperands(); Idx < End; ++Idx) {
       Type *Ty = I.getArgOperand(Idx)->getType();
       if (!Ty->isVectorTy())
         break;
       NumElts += DL.getTypeSizeInBits(Ty) / 64;
     }
     return EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
   };
 
   // The final call argument, and its value when it is a constant alignment.
   auto LastArg = [&I]() -> Value * {
     return I.getArgOperand(I.getNumArgOperands() - 1);
   };
   auto AlignFromLastArg = [&]() -> uint64_t {
     return cast<ConstantInt>(LastArg())->getZExtValue();
   };
 
   // Exclusive access through the pointer at PtrArgNo; memory type and
   // alignment come from the pointee type.
   auto DescribeExclusive = [&](unsigned PtrArgNo,
                                MachineMemOperand::Flags Access) {
     const auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
     Value *Pointer = I.getArgOperand(PtrArgNo);
     auto *PtrTy = cast<PointerType>(Pointer->getType());
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(PtrTy->getElementType());
     Info.ptrVal = Pointer;
     Info.offset = 0;
     Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
     Info.flags = Access | MachineMemOperand::MOVolatile;
   };
 
   // Doubleword exclusive access: fixed i64 memory type and alignment of 8.
   auto DescribeExclusiveDouble = [&](unsigned PtrArgNo,
                                      MachineMemOperand::Flags Access) {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::i64;
     Info.ptrVal = I.getArgOperand(PtrArgNo);
     Info.offset = 0;
     Info.align = 8;
     Info.flags = Access | MachineMemOperand::MOVolatile;
   };
 
   switch (Intrinsic) {
   case Intrinsic::arm_neon_vld1:
   case Intrinsic::arm_neon_vld2:
   case Intrinsic::arm_neon_vld3:
   case Intrinsic::arm_neon_vld4:
   case Intrinsic::arm_neon_vld2lane:
   case Intrinsic::arm_neon_vld3lane:
   case Intrinsic::arm_neon_vld4lane:
   case Intrinsic::arm_neon_vld2dup:
   case Intrinsic::arm_neon_vld3dup:
   case Intrinsic::arm_neon_vld4dup:
     // Pointer is the first argument, alignment the last.
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = LoadedMemVT();
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
     Info.align = AlignFromLastArg();
     // volatile loads with NEON intrinsics not supported
     Info.flags = MachineMemOperand::MOLoad;
     return true;
 
   case Intrinsic::arm_neon_vld1x2:
   case Intrinsic::arm_neon_vld1x3:
   case Intrinsic::arm_neon_vld1x4:
     // Pointer is the last argument; no alignment is recorded.
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = LoadedMemVT();
     Info.ptrVal = LastArg();
     Info.offset = 0;
     Info.align = 0;
     // volatile loads with NEON intrinsics not supported
     Info.flags = MachineMemOperand::MOLoad;
     return true;
 
   case Intrinsic::arm_neon_vst1:
   case Intrinsic::arm_neon_vst2:
   case Intrinsic::arm_neon_vst3:
   case Intrinsic::arm_neon_vst4:
   case Intrinsic::arm_neon_vst2lane:
   case Intrinsic::arm_neon_vst3lane:
   case Intrinsic::arm_neon_vst4lane:
     // Pointer is the first argument, alignment the last.
     Info.opc = ISD::INTRINSIC_VOID;
     Info.memVT = StoredMemVT();
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
     Info.align = AlignFromLastArg();
     // volatile stores with NEON intrinsics not supported
     Info.flags = MachineMemOperand::MOStore;
     return true;
 
   case Intrinsic::arm_neon_vst1x2:
   case Intrinsic::arm_neon_vst1x3:
   case Intrinsic::arm_neon_vst1x4:
     // Pointer is the first argument; no alignment is recorded.
     Info.opc = ISD::INTRINSIC_VOID;
     Info.memVT = StoredMemVT();
     Info.ptrVal = I.getArgOperand(0);
     Info.offset = 0;
     Info.align = 0;
     // volatile stores with NEON intrinsics not supported
     Info.flags = MachineMemOperand::MOStore;
     return true;
 
   case Intrinsic::arm_ldaex:
   case Intrinsic::arm_ldrex:
     DescribeExclusive(0, MachineMemOperand::MOLoad);
     return true;
 
   case Intrinsic::arm_stlex:
   case Intrinsic::arm_strex:
     DescribeExclusive(1, MachineMemOperand::MOStore);
     return true;
 
   case Intrinsic::arm_stlexd:
   case Intrinsic::arm_strexd:
     DescribeExclusiveDouble(2, MachineMemOperand::MOStore);
     return true;
 
   case Intrinsic::arm_ldaexd:
   case Intrinsic::arm_ldrexd:
     DescribeExclusiveDouble(0, MachineMemOperand::MOLoad);
     return true;
 
   default:
     return false;
   }
 }
 
 /// Decide whether a load of a constant should be replaced by materializing
 /// the constant itself as an immediate.
 ///
 /// \param Imm The constant value; it is not inspected by this implementation.
 /// \param Ty  The type of the constant; asserted to be an integer type.
 /// \returns true when \p Ty has a non-zero primitive width of at most 32 bits.
 bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                           Type *Ty) const {
   assert(Ty->isIntegerTy());
 
   const unsigned Width = Ty->getPrimitiveSizeInBits();
   return Width != 0 && Width <= 32;
 }
 
 /// Report whether extracting a subvector of type \p ResVT starting at element
 /// \p Index is considered cheap.
 ///
 /// The extract must be legal or custom for \p ResVT, and \p Index must be
 /// either 0 or exactly the number of elements of \p ResVT. \p SrcVT is not
 /// consulted.
 bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                                 unsigned Index) const {
   if (isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) {
     if (Index == 0)
       return true;
     return Index == ResVT.getVectorNumElements();
   }
 
   return false;
 }
 
 /// Emit a data memory barrier at the builder's current insertion point.
 ///
 /// \param Builder IR builder positioned where the barrier should go.
 /// \param Domain  Requested barrier option; overridden with ARM_MB::SY on
 ///                M-class subtargets.
 /// \returns the call instruction that implements the barrier (either the
 ///          arm_dmb intrinsic or, as a fallback, the arm_mcr intrinsic).
 Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder,
                                         ARM_MB::MemBOpt Domain) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
 
   // Common case: the subtarget has a real DMB instruction.
   if (Subtarget->hasDataBarrier()) {
     Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb);
     // Only a full system barrier exists in the M-class architectures.
     if (Subtarget->isMClass())
       Domain = ARM_MB::SY;
     Constant *CDomain = Builder.getInt32(Domain);
     return Builder.CreateCall(DMB, CDomain);
   }
 
   // The target has no DMB, so see what fallback we can use.
   // Thumb1 and pre-v6 ARM mode use a libcall instead of barriers for their
   // atomic accesses and should never get here.
   if (!Subtarget->hasV6Ops() || Subtarget->isThumb()) {
     llvm_unreachable("makeDMB on a target so old that it has no barriers");
   }
 
   // Some ARMv6 cpus can support data barriers with an mcr instruction.
   Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr);
   Value *BarrierArgs[6] = {Builder.getInt32(15), Builder.getInt32(0),
                            Builder.getInt32(0),  Builder.getInt32(7),
                            Builder.getInt32(10), Builder.getInt32(5)};
   return Builder.CreateCall(MCR, BarrierArgs);
 }
 
 // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
 //
 // Emit the barrier, if any, that must precede the atomic instruction Inst
 // for ordering Ord. Returns the barrier instruction, or nullptr when no
 // leading barrier is needed.
 Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder,
                                                  Instruction *Inst,
                                                  AtomicOrdering Ord) const {
   switch (Ord) {
   case AtomicOrdering::Monotonic:
   case AtomicOrdering::Acquire:
     return nullptr; // Nothing to do
   case AtomicOrdering::NotAtomic:
   case AtomicOrdering::Unordered:
     llvm_unreachable("Invalid fence: unordered/non-atomic");
   case AtomicOrdering::SequentiallyConsistent:
     // A seq_cst operation only needs a leading barrier when it stores.
     if (!Inst->hasAtomicStore())
       return nullptr; // Nothing to do
     LLVM_FALLTHROUGH;
   case AtomicOrdering::Release:
   case AtomicOrdering::AcquireRelease: {
     // FIXME: add a comment with a link to documentation justifying this.
     const ARM_MB::MemBOpt Domain =
         Subtarget->preferISHSTBarriers() ? ARM_MB::ISHST : ARM_MB::ISH;
     return makeDMB(Builder, Domain);
   }
   }
   llvm_unreachable("Unknown fence ordering in emitLeadingFence");
 }
 
 /// Emit the barrier, if any, that must follow an atomic instruction with
 /// ordering \p Ord.
 ///
 /// \returns an ISH barrier for acquire, acq_rel and seq_cst orderings, or
 ///          nullptr for monotonic and release. \p Inst is not consulted.
 Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder,
                                                   Instruction *Inst,
                                                   AtomicOrdering Ord) const {
   switch (Ord) {
   case AtomicOrdering::Acquire:
   case AtomicOrdering::AcquireRelease:
   case AtomicOrdering::SequentiallyConsistent:
     return makeDMB(Builder, ARM_MB::ISH);
   case AtomicOrdering::Monotonic:
   case AtomicOrdering::Release:
     return nullptr; // Nothing to do
   case AtomicOrdering::NotAtomic:
   case AtomicOrdering::Unordered:
     llvm_unreachable("Invalid fence: unordered/not-atomic");
   }
   llvm_unreachable("Unknown fence ordering in emitTrailingFence");
 }
 
 // Stores narrower than 64 bits are already atomic, and anything wider is
 // left to the default libcall path. Only 64-bit stores are expanded in IR,
 // and not on Cortex M, which has no ldrexd/strexd.
 bool ARMTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   Type *StoredTy = SI->getValueOperand()->getType();
   const unsigned Bits = StoredTy->getPrimitiveSizeInBits();
   if (Bits != 64)
     return false;
   return !Subtarget->isMClass();
 }
 
 // Loads narrower than 64 bits are already atomic, and anything wider is left
 // to the default libcall path. Only 64-bit loads are expanded (to a
 // load-linked), and not on Cortex M, which has no ldrexd/strexd.
 // FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
 // guarantee, see DDI0406C ARM architecture reference manual,
 // sections A8.8.72-74 LDRD)
 TargetLowering::AtomicExpansionKind
 ARMTargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   const unsigned Bits = LI->getType()->getPrimitiveSizeInBits();
   if (Bits == 64 && !Subtarget->isMClass())
     return AtomicExpansionKind::LLOnly;
   return AtomicExpansionKind::None;
 }
 
 // For the real atomic operations, we have ldrex/strex up to 32 bits,
 // and up to 64 bits on the non-M profiles. LL/SC expansion is only chosen
 // when the subtarget is not Thumb or has the v8-M baseline ops.
 TargetLowering::AtomicExpansionKind
 ARMTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   const unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
   const bool CanUseLLSC =
       !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
   const unsigned MaxBits = Subtarget->isMClass() ? 32U : 64U;
 
   if (CanUseLLSC && Bits <= MaxBits)
     return AtomicExpansionKind::LLSC;
   return AtomicExpansionKind::None;
 }
 
 /// Decide whether a cmpxchg should be expanded in IR.
 ///
 /// \returns true only when optimizing (opt level != 0) and the subtarget is
 ///          either not Thumb or has the v8-M baseline ops. \p AI is not
 ///          inspected.
 bool ARMTargetLowering::shouldExpandAtomicCmpXchgInIR(
     AtomicCmpXchgInst *AI) const {
   // At -O0, fast-regalloc cannot cope with the live vregs necessary to
   // implement cmpxchg without spilling. If the address being exchanged is also
   // on the stack and close enough to the spill slot, this can lead to a
   // situation where the monitor always gets cleared and the atomic operation
   // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
   const bool CanExpand =
       !Subtarget->isThumb() || Subtarget->hasV8MBaselineOps();
   if (getTargetMachine().getOptLevel() == 0)
     return false;
   return CanExpand;
 }
 
 /// Report whether fences should be inserted around atomic instructions.
 ///
 /// The answer is the InsertFencesForAtomic value visible in this scope; the
 /// instruction \p I is not inspected.
 bool ARMTargetLowering::shouldInsertFencesForAtomic(
     const Instruction *I) const {
   return InsertFencesForAtomic;
 }
 
 // This has so far only been implemented for MachO, so the answer is simply
 // whether the subtarget reports a MachO target.
 bool ARMTargetLowering::useLoadStackGuardNode() const {
   return Subtarget->isTargetMachO();
 }
 
 /// Decide whether a store of an element extracted from a vector of type
 /// \p VectorTy at index \p Idx can be combined into a single operation.
 ///
 /// \param VectorTy Type of the vector being extracted from.
 /// \param Idx      The extract index; must be a ConstantInt to combine.
 /// \param Cost     Set to 0 when the function returns true; left untouched
 ///                 otherwise.
 /// \returns true for non-FP vectors of exactly 64 or 128 bits on NEON
 ///          subtargets with a constant index.
 bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                                   unsigned &Cost) const {
   // If we do not have NEON, vector types are not natively supported.
   if (!Subtarget->hasNEON())
     return false;
 
   // Floating point values and vector values map to the same register file.
   // Therefore, although we could do a store extract of a vector type, this is
   // better to leave at float as we have more freedom in the addressing mode for
   // those.
   if (VectorTy->isFPOrFPVectorTy())
     return false;
 
   // If the index is unknown at compile time, this is very expensive to lower
   // and it is not possible to combine the store with the extract.
   if (!isa<ConstantInt>(Idx))
     return false;
 
   assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
 
   // We can do a store + vector extract on any vector that fits perfectly in a D
   // or Q register.
   switch (cast<VectorType>(VectorTy)->getBitWidth()) {
   case 64:
   case 128:
     Cost = 0;
     return true;
   default:
     return false;
   }
 }
 
 /// Speculating cttz is reported as cheap exactly when the subtarget has the
 /// v6T2 ops.
 bool ARMTargetLowering::isCheapToSpeculateCttz() const {
   return Subtarget->hasV6T2Ops();
 }
 
 /// Speculating ctlz is reported as cheap exactly when the subtarget has the
 /// v6T2 ops.
 bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
   return Subtarget->hasV6T2Ops();
 }
 
 /// Emit a load-linked (ldrex/ldaex family) of the value pointed to by
 /// \p Addr at the builder's insertion point.
 ///
 /// \param Builder IR builder used to create the instructions.
 /// \param Addr    Pointer to load from; its pointee type is the result type.
 /// \param Ord     Ordering; acquire-or-stronger selects the ldaex variants.
 /// \returns the loaded value, converted to the pointee type of \p Addr.
 Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                          AtomicOrdering Ord) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
   const bool IsAcquire = isAcquireOrStronger(Ord);
 
   // Everything that is not 64 bits wide goes through the overloaded
   // ldrex/ldaex intrinsic and is then truncated or bitcast to the pointee type.
   if (ValTy->getPrimitiveSizeInBits() != 64) {
     Type *Tys[] = { Addr->getType() };
     Function *Ldrex = Intrinsic::getDeclaration(
         M, IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex, Tys);
     Value *Loaded = Builder.CreateCall(Ldrex, Addr);
     return Builder.CreateTruncOrBitCast(Loaded, ValTy);
   }
 
   // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
   // intrinsic must return {i32, i32} and we have to recombine them into a
   // single i64 here.
   Function *Ldrexd = Intrinsic::getDeclaration(
       M, IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd);
 
   Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
   Value *LoHi = Builder.CreateCall(Ldrexd, Addr, "lohi");
 
   Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
   Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
   // On big-endian subtargets the two halves come back in the other order.
   if (!Subtarget->isLittle())
     std::swap(Lo, Hi);
   Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
   Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
   Value *HiShifted = Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32));
   return Builder.CreateOr(Lo, HiShifted, "val64");
 }
 
 /// Emit a call to the arm_clrex intrinsic at the builder's insertion point.
 /// Nothing is emitted when the subtarget lacks the v7 ops.
 void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
     IRBuilder<> &Builder) const {
   if (Subtarget->hasV7Ops()) {
     Module *M = Builder.GetInsertBlock()->getParent()->getParent();
     Function *Clrex = Intrinsic::getDeclaration(M, Intrinsic::arm_clrex);
     Builder.CreateCall(Clrex);
   }
 }
 
 /// Emit a store-conditional (strex/stlex family) of \p Val to \p Addr at the
 /// builder's insertion point.
 ///
 /// \param Builder IR builder used to create the instructions.
 /// \param Val     Value to store.
 /// \param Addr    Pointer to store to.
 /// \param Ord     Ordering; release-or-stronger selects the stlex variants.
 /// \returns the call to the store-conditional intrinsic.
 Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                                Value *Addr,
                                                AtomicOrdering Ord) const {
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   const bool IsRelease = isReleaseOrStronger(Ord);
 
   // Everything that is not 64 bits wide is zero-extended or bitcast to the
   // intrinsic's first parameter type and stored via strex/stlex.
   if (Val->getType()->getPrimitiveSizeInBits() != 64) {
     Type *Tys[] = { Addr->getType() };
     Function *Strex = Intrinsic::getDeclaration(
         M, IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex, Tys);
     Value *Widened = Builder.CreateZExtOrBitCast(
         Val, Strex->getFunctionType()->getParamType(0));
     return Builder.CreateCall(Strex, {Widened, Addr});
   }
 
   // Since the intrinsics must have legal type, the i64 intrinsics take two
   // parameters: "i32, i32". We must marshal Val into the appropriate form
   // before the call.
   Function *Strexd = Intrinsic::getDeclaration(
       M, IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd);
   Type *Int32Ty = Type::getInt32Ty(M->getContext());
 
   Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
   Value *UpperBits = Builder.CreateLShr(Val, 32);
   Value *Hi = Builder.CreateTrunc(UpperBits, Int32Ty, "hi");
   // On big-endian subtargets the two halves are passed in the other order.
   if (!Subtarget->isLittle())
     std::swap(Lo, Hi);
   Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
   return Builder.CreateCall(Strexd, {Lo, Hi, Addr});
 }
 
 /// Compute how many interleaved accesses are generated when lowering an
 /// access of type \p VecTy: the vector's size in bits divided by 128,
 /// rounded up.
 unsigned
 ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
                                              const DataLayout &DL) const {
   const auto SizeInBits = DL.getTypeSizeInBits(VecTy);
   return (SizeInBits + 127) / 128;
 }
 
 /// Check whether \p VecTy can be used as the per-lane vector type of an
 /// interleaved access.
 ///
 /// \returns true when the element type is not half, there are at least two
 ///          elements, each element is 8, 16 or 32 bits, and the whole vector
 ///          is 64 bits or a multiple of 128 bits.
 bool ARMTargetLowering::isLegalInterleavedAccessType(
     VectorType *VecTy, const DataLayout &DL) const {
 
   Type *EltTy = VecTy->getElementType();
   const unsigned TotalBits = DL.getTypeSizeInBits(VecTy);
   const unsigned EltBits = DL.getTypeSizeInBits(EltTy);
 
   // Ensure the vector doesn't have f16 elements. Even though we could do an
   // i16 vldN, we can't hold the f16 vectors and will end up converting via
   // f32.
   if (EltTy->isHalfTy())
     return false;
 
   // Ensure the number of vector elements is greater than 1.
   if (VecTy->getNumElements() < 2)
     return false;
 
   // Ensure the element type is legal.
   switch (EltBits) {
   case 8:
   case 16:
   case 32:
     break;
   default:
     return false;
   }
 
   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
   // 128 will be split into multiple interleaved accesses.
   return TotalBits == 64 || TotalBits % 128 == 0;
 }
 
 /// Lower an interleaved load into a vldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
 ///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
 ///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
 ///
 ///      Into:
 ///        %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
 ///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
 ///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
 bool ARMTargetLowering::lowerInterleavedLoad(
     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
   assert(!Shuffles.empty() && "Empty shufflevector input");
   assert(Shuffles.size() == Indices.size() &&
          "Unmatched number of shufflevectors and indices");
 
   // All shuffles are assumed to produce the same type; the first one is used
   // as the representative per-lane vector type.
   VectorType *VecTy = Shuffles[0]->getType();
   Type *EltTy = VecTy->getVectorElementType();
 
   const DataLayout &DL = LI->getModule()->getDataLayout();
 
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
     return false;
 
   unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
 
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   if (EltTy->isPointerTy())
     VecTy =
         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
 
   IRBuilder<> Builder(LI);
 
   // The base address of the load.
   Value *BaseAddr = LI->getPointerOperand();
 
   if (NumLoads > 1) {
     // If we're going to generate more than one load, reset the sub-vector type
     // to something legal.
     VecTy = VectorType::get(VecTy->getVectorElementType(),
                             VecTy->getVectorNumElements() / NumLoads);
 
     // We will compute the pointer operand of each load from the original base
     // address using GEPs. Cast the base address to a pointer to the scalar
     // element type.
     BaseAddr = Builder.CreateBitCast(
         BaseAddr, VecTy->getVectorElementType()->getPointerTo(
                       LI->getPointerAddressSpace()));
   }
 
   assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
 
   Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
   Type *Tys[] = {VecTy, Int8Ptr};
   // Indexed by Factor - 2: factors 2, 3 and 4 select vld2, vld3 and vld4.
   static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                             Intrinsic::arm_neon_vld3,
                                             Intrinsic::arm_neon_vld4};
   Function *VldnFunc =
       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
 
   // Holds sub-vectors extracted from the load intrinsic return values. The
   // sub-vectors are associated with the shufflevector instructions they will
   // replace.
   DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
 
   for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
     // If we're generating more than one load, compute the base address of
     // subsequent loads as an offset from the previous.
     if (LoadCount > 0)
       BaseAddr = Builder.CreateConstGEP1_32(
           BaseAddr, VecTy->getVectorNumElements() * Factor);
 
     // Operands of the vldN call: the i8* address followed by the alignment of
     // the original load.
     SmallVector<Value *, 2> Ops;
     Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
     Ops.push_back(Builder.getInt32(LI->getAlignment()));
 
     CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
 
     // Replace uses of each shufflevector with the corresponding vector loaded
     // by ldN.
     for (unsigned i = 0; i < Shuffles.size(); i++) {
       ShuffleVectorInst *SV = Shuffles[i];
       unsigned Index = Indices[i];
 
       Value *SubVec = Builder.CreateExtractValue(VldN, Index);
 
       // Convert the integer vector to pointer vector if the element is pointer.
       if (EltTy->isPointerTy())
         SubVec = Builder.CreateIntToPtr(
             SubVec, VectorType::get(SV->getType()->getVectorElementType(),
                                     VecTy->getVectorNumElements()));
 
       SubVecs[SV].push_back(SubVec);
     }
   }
 
   // Replace uses of the shufflevector instructions with the sub-vectors
   // returned by the load intrinsic. If a shufflevector instruction is
   // associated with more than one sub-vector, those sub-vectors will be
   // concatenated into a single wide vector.
   // Note that only the uses are rewritten here; neither the shufflevectors nor
   // the original load are erased by this function.
   for (ShuffleVectorInst *SVI : Shuffles) {
     auto &SubVec = SubVecs[SVI];
     auto *WideVec =
         SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
     SVI->replaceAllUsesWith(WideVec);
   }
 
   return true;
 }
 
 /// Lower an interleaved store into a vstN intrinsic.
 ///
 /// E.g. Lower an interleaved store (Factor = 3):
 ///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
 ///                                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
 ///
 ///      Into:
 ///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
 ///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
 ///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
 ///
 /// Note that the new shufflevectors will be removed and we'll only generate one
 /// vst3 instruction in CodeGen.
 ///
 /// Example for a more general valid mask (Factor 3). Lower:
 ///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
 ///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
 ///        store <12 x i32> %i.vec, <12 x i32>* %ptr
 ///
 ///      Into:
 ///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
 ///        call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
 bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                               ShuffleVectorInst *SVI,
                                               unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
 
   VectorType *VecTy = SVI->getType();
   assert(VecTy->getVectorNumElements() % Factor == 0 &&
          "Invalid interleaved store");
 
   // Each of the Factor lanes contributes LaneLen elements to the wide vector.
   unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
   Type *EltTy = VecTy->getVectorElementType();
   VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
 
   const DataLayout &DL = SI->getModule()->getDataLayout();
 
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
   if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
     return false;
 
   unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
 
   Value *Op0 = SVI->getOperand(0);
   Value *Op1 = SVI->getOperand(1);
   IRBuilder<> Builder(SI);
 
   // StN intrinsics don't support pointer vectors as arguments. Convert pointer
   // vectors to integer vectors.
   if (EltTy->isPointerTy()) {
     Type *IntTy = DL.getIntPtrType(EltTy);
 
     // Convert to the corresponding integer vector.
     Type *IntVecTy =
         VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
     Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
     Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
 
     SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
   // The base address of the store.
   Value *BaseAddr = SI->getPointerOperand();
 
   if (NumStores > 1) {
     // If we're going to generate more than one store, reset the lane length
     // and sub-vector type to something legal.
     LaneLen /= NumStores;
     SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
 
     // We will compute the pointer operand of each store from the original base
     // address using GEPs. Cast the base address to a pointer to the scalar
     // element type.
     BaseAddr = Builder.CreateBitCast(
         BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
                       SI->getPointerAddressSpace()));
   }
 
   assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
 
   auto Mask = SVI->getShuffleMask();
 
   Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
   Type *Tys[] = {Int8Ptr, SubVecTy};
   // Indexed by Factor - 2: factors 2, 3 and 4 select vst2, vst3 and vst4.
   static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                              Intrinsic::arm_neon_vst3,
                                              Intrinsic::arm_neon_vst4};
 
   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
     // If we are generating more than one store, we compute the base address of
     // subsequent stores as an offset from the previous.
     if (StoreCount > 0)
       BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
 
     SmallVector<Value *, 6> Ops;
     Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
 
     Function *VstNFunc =
         Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
 
     // Split the shufflevector operands into sub vectors for the new vstN call.
     for (unsigned i = 0; i < Factor; i++) {
       // Mask position of the first element of lane i within this store's
       // chunk of the interleaved mask.
       unsigned IdxI = StoreCount * LaneLen * Factor + i;
       if (Mask[IdxI] >= 0) {
         Ops.push_back(Builder.CreateShuffleVector(
             Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
       } else {
         // The first element of this lane is undef. Find the first defined
         // element of the same lane within this chunk and derive the start of
         // the sequential mask from it. Element j of lane i lives at mask
         // position IdxI + j * Factor and holds the value StartMask + j.
         unsigned StartMask = 0;
         for (unsigned j = 1; j < LaneLen; j++) {
           unsigned IdxJ = IdxI + j * Factor;
           if (Mask[IdxJ] >= 0) {
             StartMask = Mask[IdxJ] - j;
             break;
           }
         }
         // Note: If all elements in a chunk are undefs, StartMask=0!
         // Note: Filling undef gaps with random elements is ok, since
         // those elements were being written anyway (with undefs).
         // In the case of all undefs we're defaulting to using elems from 0
         // Note: StartMask cannot be negative, it's checked in
         // isReInterleaveMask
         Ops.push_back(Builder.CreateShuffleVector(
             Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
       }
     }
 
     // The last operand of the vstN call is the alignment of the original
     // store.
     Ops.push_back(Builder.getInt32(SI->getAlignment()));
     Builder.CreateCall(VstNFunc, Ops);
   }
   return true;
 }
 
 /// Base element kind of a homogeneous aggregate, as tracked by
 /// isHomogeneousAggregate.
 enum HABaseType {
   HA_UNKNOWN = 0, // No base element kind has been determined yet.
   HA_FLOAT = 1,   // float
   HA_DOUBLE = 2,  // double
   HA_VECT64 = 3,  // 64-bit vector
   HA_VECT128 = 4  // 128-bit vector
 };
 
 /// Recursively classify \p Ty as a homogeneous aggregate.
 ///
 /// \param Ty      Type to inspect; structs and arrays are walked recursively.
 /// \param Base    In/out: the base element kind seen so far. It is set on the
 ///                first float, double, or 64/128-bit vector encountered, and
 ///                any later element of a different kind causes a false result.
 /// \param Members In/out: number of base elements counted. Struct and array
 ///                cases add to the incoming value; scalar and vector cases
 ///                overwrite it with 1.
 /// \returns true when every leaf matches \p Base and, for non-vector types,
 ///          the resulting member count is between 1 and 4.
 static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
                                    uint64_t &Members) {
   if (auto *ST = dyn_cast<StructType>(Ty)) {
     const unsigned NumFields = ST->getNumElements();
     for (unsigned Field = 0; Field != NumFields; ++Field) {
       uint64_t FieldMembers = 0;
       if (!isHomogeneousAggregate(ST->getElementType(Field), Base,
                                   FieldMembers))
         return false;
       Members += FieldMembers;
     }
   } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
     uint64_t EltMembers = 0;
     if (!isHomogeneousAggregate(AT->getElementType(), Base, EltMembers))
       return false;
     Members += EltMembers * AT->getNumElements();
   } else if (Ty->isFloatTy() || Ty->isDoubleTy()) {
     // A scalar FP leaf must agree with whatever base kind was seen before.
     const HABaseType Kind = Ty->isFloatTy() ? HA_FLOAT : HA_DOUBLE;
     if (Base != HA_UNKNOWN && Base != Kind)
       return false;
     Members = 1;
     Base = Kind;
   } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
     Members = 1;
     const unsigned Bits = VT->getBitWidth();
     switch (Base) {
     case HA_FLOAT:
     case HA_DOUBLE:
       return false;
     case HA_VECT64:
       return Bits == 64;
     case HA_VECT128:
       return Bits == 128;
     case HA_UNKNOWN:
       // First leaf seen: only 64- and 128-bit vectors can start an aggregate.
       if (Bits == 64) {
         Base = HA_VECT64;
         return true;
       }
       if (Bits == 128) {
         Base = HA_VECT128;
         return true;
       }
       return false;
     }
   }
 
   return Members > 0 && Members <= 4;
 }
 
 /// Return the alignment to use for an argument of type \p ArgTy under the
 /// current calling convention: the ABI alignment for non-vector types, and
 /// the smaller of the ABI alignment and the stack alignment for vectors.
 unsigned
 ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
                                                  DataLayout DL) const {
   const unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
   if (!ArgTy->isVectorTy())
     return ABIAlign;
 
   // Avoid over-aligning vector parameters. It would require realigning the
   // stack and waste space for no real benefit.
   return std::min(ABIAlign, DL.getStackAlignment());
 }
 
 /// Return true if a type is an AAPCS-VFP homogeneous aggregate or an array
 /// of integers (e.g. [N x i32] or [N x i64]). This allows front-ends to skip
 /// emitting padding when passing according to AAPCS rules. Always false when
 /// the effective calling convention is not ARM_AAPCS_VFP.
 bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
     Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
   const bool IsAAPCSVFP = getEffectiveCallingConv(CallConv, isVarArg) ==
                           CallingConv::ARM_AAPCS_VFP;
   if (!IsAAPCSVFP)
     return false;
 
   HABaseType Base = HA_UNKNOWN;
   uint64_t Members = 0;
   const bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
   LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
 
   if (IsHA)
     return true;
   return Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
 }
 
 /// Return the register that carries the exception pointer: R0, or no
 /// register at all when the subtarget uses SjLj exception handling.
 /// \p PersonalityFn is not consulted.
 unsigned ARMTargetLowering::getExceptionPointerRegister(
     const Constant *PersonalityFn) const {
   // Platforms which do not use SjLj EH may return values in these registers
   // via the personality function.
   if (Subtarget->useSjLjEH())
     return ARM::NoRegister;
   return ARM::R0;
 }
 
 /// Return the register that carries the exception selector: R1, or no
 /// register at all when the subtarget uses SjLj exception handling.
 /// \p PersonalityFn is not consulted.
 unsigned ARMTargetLowering::getExceptionSelectorRegister(
     const Constant *PersonalityFn) const {
   // Platforms which do not use SjLj EH may return values in these registers
   // via the personality function.
   if (Subtarget->useSjLjEH())
     return ARM::NoRegister;
   return ARM::R1;
 }
 
 /// Mark the function that contains \p Entry as using split callee-saved
 /// registers by setting IsSplitCSR in its ARMFunctionInfo.
 void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
   Entry->getParent()->getInfo<ARMFunctionInfo>()->setIsSplitCSR(true);
 }
 
 /// For every callee-saved register that is handled via copies, insert a copy
 /// into a fresh virtual register at the start of \p Entry and a copy back
 /// into the physical register before the first terminator of each block in
 /// \p Exits. Does nothing when the register info reports no such registers.
 void ARMTargetLowering::insertCopiesSplitCSR(
     MachineBasicBlock *Entry,
     const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
   const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const MCPhysReg *CSRList =
       TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
   if (CSRList == nullptr)
     return;
 
   const TargetInstrInfo *TII = Subtarget->getInstrInfo();
   MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
   MachineBasicBlock::iterator InsertPt = Entry->begin();
 
   // The list is terminated by a zero entry.
   for (const MCPhysReg *Cur = CSRList; *Cur != 0; ++Cur) {
     const MCPhysReg PhysReg = *Cur;
 
     const TargetRegisterClass *RC = nullptr;
     if (ARM::GPRRegClass.contains(PhysReg))
       RC = &ARM::GPRRegClass;
     else if (ARM::DPRRegClass.contains(PhysReg))
       RC = &ARM::DPRRegClass;
     else
       llvm_unreachable("Unexpected register class in CSRsViaCopy!");
 
     unsigned SavedVR = MRI->createVirtualRegister(RC);
     // Create copy from CSR to a virtual register.
     // FIXME: this currently does not emit CFI pseudo-instructions, it works
     // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
     // nounwind. If we want to generalize this later, we may need to emit
     // CFI pseudo-instructions.
     assert(Entry->getParent()->getFunction().hasFnAttribute(
                Attribute::NoUnwind) &&
            "Function should be nounwind in insertCopiesSplitCSR!");
     Entry->addLiveIn(PhysReg);
     BuildMI(*Entry, InsertPt, DebugLoc(), TII->get(TargetOpcode::COPY), SavedVR)
         .addReg(PhysReg);
 
     // Insert the copy-back instructions right before the terminator.
     for (MachineBasicBlock *ExitMBB : Exits) {
       BuildMI(*ExitMBB, ExitMBB->getFirstTerminator(), DebugLoc(),
               TII->get(TargetOpcode::COPY), PhysReg)
           .addReg(SavedVR);
     }
   }
 }
 
 /// Finish lowering of \p MF: compute the maximum call frame size on the
 /// function's frame info, then run the base-class finalization.
 void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
   MF.getFrameInfo().computeMaxCallFrameSize(MF);
   TargetLoweringBase::finalizeLowering(MF);
 }
Index: head/contrib/llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- head/contrib/llvm/lib/Target/ARM/ARMISelLowering.h	(revision 344055)
+++ head/contrib/llvm/lib/Target/ARM/ARMISelLowering.h	(revision 344056)
@@ -1,809 +1,812 @@
 //===- ARMISelLowering.h - ARM DAG Lowering Interface -----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file defines the interfaces that ARM uses to lower LLVM code into a
 // selection DAG.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
 #define LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H
 
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/MachineValueType.h"
 #include <utility>
 
 namespace llvm {
 
 class ARMSubtarget;
 class DataLayout;
 class FastISel;
 class FunctionLoweringInfo;
 class GlobalValue;
 class InstrItineraryData;
 class Instruction;
 class MachineBasicBlock;
 class MachineInstr;
 class SelectionDAG;
 class TargetLibraryInfo;
 class TargetMachine;
 class TargetRegisterInfo;
 class VectorType;
 
   // ARMISD - Opcodes for the ARM-specific SelectionDAG nodes. The numbering
   // continues from ISD::BUILTIN_OP_END and is restarted at
   // ISD::FIRST_TARGET_MEMORY_OPCODE for the NEON load/store nodes at the end
   // of the enum, so the relative order of the enumerators is significant.
   namespace ARMISD {
 
     // ARM Specific DAG Nodes
     enum NodeType : unsigned {
       // Start the numbering where the builtin ops and target ops leave off.
       FIRST_NUMBER = ISD::BUILTIN_OP_END,
 
       Wrapper,      // Wrapper - A wrapper node for TargetConstantPool,
                     // TargetExternalSymbol, and TargetGlobalAddress.
       WrapperPIC,   // WrapperPIC - A wrapper node for TargetGlobalAddress in
                     // PIC mode.
       WrapperJT,    // WrapperJT - A wrapper node for TargetJumpTable
 
       // Add pseudo op to model memcpy for struct byval.
       COPY_STRUCT_BYVAL,
 
       CALL,         // Function call.
       CALL_PRED,    // Function call that's predicable.
       CALL_NOLINK,  // Function call with branch not branch-and-link.
       BRCOND,       // Conditional branch.
       BR_JT,        // Jumptable branch.
       BR2_JT,       // Jumptable branch (2 level - jumptable entry is a jump).
       RET_FLAG,     // Return with a flag operand.
       INTRET_FLAG,  // Interrupt return with an LR-offset and a flag operand.
 
       PIC_ADD,      // Add with a PC operand and a PIC label.
 
       CMP,          // ARM compare instructions.
       CMN,          // ARM CMN instructions.
       CMPZ,         // ARM compare that sets only Z flag.
       CMPFP,        // ARM VFP compare instruction, sets FPSCR.
       CMPFPw0,      // ARM VFP compare against zero instruction, sets FPSCR.
       FMSTAT,       // ARM fmstat instruction.
 
       CMOV,         // ARM conditional move instructions.
 
       SSAT,         // Signed saturation
       USAT,         // Unsigned saturation
 
       // NOTE(review): undocumented in this header; presumably a conditional
       // branch driven by a 64-bit comparison - confirm against its uses in
       // ARMISelLowering.cpp.
       BCC_i64,
 
       SRL_FLAG,     // V,Flag = srl_flag X -> srl X, 1 + save carry out.
       SRA_FLAG,     // V,Flag = sra_flag X -> sra X, 1 + save carry out.
       RRX,          // V = RRX X, Flag     -> srl X, 1 + shift in carry flag.
 
       ADDC,         // Add with carry
       ADDE,         // Add using carry
       SUBC,         // Sub with carry
       SUBE,         // Sub using carry
 
       VMOVRRD,      // double to two gprs.
       VMOVDRR,      // Two gprs to double.
       VMOVSR,       // move gpr to single, used for f32 literal constructed in a gpr
 
       EH_SJLJ_SETJMP,         // SjLj exception handling setjmp.
       EH_SJLJ_LONGJMP,        // SjLj exception handling longjmp.
       EH_SJLJ_SETUP_DISPATCH, // SjLj exception handling setup_dispatch.
 
       TC_RETURN,    // Tail call return pseudo.
 
       // NOTE(review): undocumented in this header; presumably reads the
       // thread pointer for TLS accesses - confirm against its uses.
       THREAD_POINTER,
 
       DYN_ALLOC,    // Dynamic allocation on the stack.
 
       MEMBARRIER_MCR, // Memory barrier (MCR)
 
       PRELOAD,      // Preload
 
       WIN__CHKSTK,  // Windows' __chkstk call to do stack probing.
       WIN__DBZCHK,  // Windows' divide by zero check
 
       VCEQ,         // Vector compare equal.
       VCEQZ,        // Vector compare equal to zero.
       VCGE,         // Vector compare greater than or equal.
       VCGEZ,        // Vector compare greater than or equal to zero.
       VCLEZ,        // Vector compare less than or equal to zero.
       VCGEU,        // Vector compare unsigned greater than or equal.
       VCGT,         // Vector compare greater than.
       VCGTZ,        // Vector compare greater than zero.
       VCLTZ,        // Vector compare less than zero.
       VCGTU,        // Vector compare unsigned greater than.
       VTST,         // Vector test bits.
 
       // Vector shift by immediate:
       VSHL,         // ...left
       VSHRs,        // ...right (signed)
       VSHRu,        // ...right (unsigned)
 
       // Vector rounding shift by immediate:
       VRSHRs,       // ...right (signed)
       VRSHRu,       // ...right (unsigned)
       VRSHRN,       // ...right narrow
 
       // Vector saturating shift by immediate:
       VQSHLs,       // ...left (signed)
       VQSHLu,       // ...left (unsigned)
       VQSHLsu,      // ...left (signed to unsigned)
       VQSHRNs,      // ...right narrow (signed)
       VQSHRNu,      // ...right narrow (unsigned)
       VQSHRNsu,     // ...right narrow (signed to unsigned)
 
       // Vector saturating rounding shift by immediate:
       VQRSHRNs,     // ...right narrow (signed)
       VQRSHRNu,     // ...right narrow (unsigned)
       VQRSHRNsu,    // ...right narrow (signed to unsigned)
 
       // Vector shift and insert:
       VSLI,         // ...left
       VSRI,         // ...right
 
       // Vector get lane (VMOV scalar to ARM core register)
       // (These are used for 8- and 16-bit element types only.)
       VGETLANEu,    // zero-extend vector extract element
       VGETLANEs,    // sign-extend vector extract element
 
       // Vector move immediate and move negated immediate:
       VMOVIMM,
       VMVNIMM,
 
       // Vector move f32 immediate:
       VMOVFPIMM,
 
       // Move H <-> R, clearing top 16 bits
       VMOVrh,
       VMOVhr,
 
       // Vector duplicate:
       VDUP,
       VDUPLANE,
 
       // Vector shuffles:
       VEXT,         // extract
       VREV64,       // reverse elements within 64-bit doublewords
       VREV32,       // reverse elements within 32-bit words
       VREV16,       // reverse elements within 16-bit halfwords
       VZIP,         // zip (interleave)
       VUZP,         // unzip (deinterleave)
       VTRN,         // transpose
       VTBL1,        // 1-register shuffle with mask
       VTBL2,        // 2-register shuffle with mask
 
       // Vector multiply long:
       VMULLs,       // ...signed
       VMULLu,       // ...unsigned
 
       SMULWB,       // Signed multiply word by half word, bottom
       SMULWT,       // Signed multiply word by half word, top
       UMLAL,        // 64bit Unsigned Accumulate Multiply
       SMLAL,        // 64bit Signed Accumulate Multiply
       UMAAL,        // 64-bit Unsigned Accumulate Accumulate Multiply
       SMLALBB,      // 64-bit signed accumulate multiply bottom, bottom 16
       SMLALBT,      // 64-bit signed accumulate multiply bottom, top 16
       SMLALTB,      // 64-bit signed accumulate multiply top, bottom 16
       SMLALTT,      // 64-bit signed accumulate multiply top, top 16
       SMLALD,       // Signed multiply accumulate long dual
       SMLALDX,      // Signed multiply accumulate long dual exchange
       SMLSLD,       // Signed multiply subtract long dual
       SMLSLDX,      // Signed multiply subtract long dual exchange
       SMMLAR,       // Signed multiply long, round and add
       SMMLSR,       // Signed multiply long, subtract and round
 
       // Operands of the standard BUILD_VECTOR node are not legalized, which
       // is fine if BUILD_VECTORs are always lowered to shuffles or other
       // operations, but for ARM some BUILD_VECTORs are legal as-is and their
       // operands need to be legalized.  Define an ARM-specific version of
       // BUILD_VECTOR for this purpose.
       BUILD_VECTOR,
 
       // Bit-field insert
       BFI,
 
       // Vector OR with immediate
       VORRIMM,
       // Vector AND with NOT of immediate
       VBICIMM,
 
       // Vector bitwise select
       VBSL,
 
       // Pseudo-instruction representing a memory copy using ldm/stm
       // instructions.
       MEMCPY,
 
       // The numbering is restarted at ISD::FIRST_TARGET_MEMORY_OPCODE here, so
       // every enumerator from VLD1DUP onwards has a value at or above it.
       // NOTE(review): presumably this is what marks these nodes as
       // memory-accessing for the generic DAG code - confirm in ISDOpcodes.h.
       // Vector load N-element structure to all lanes:
       VLD1DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
       VLD2DUP,
       VLD3DUP,
       VLD4DUP,
 
       // NEON loads with post-increment base updates:
       VLD1_UPD,
       VLD2_UPD,
       VLD3_UPD,
       VLD4_UPD,
       VLD2LN_UPD,
       VLD3LN_UPD,
       VLD4LN_UPD,
       VLD1DUP_UPD,
       VLD2DUP_UPD,
       VLD3DUP_UPD,
       VLD4DUP_UPD,
 
       // NEON stores with post-increment base updates:
       VST1_UPD,
       VST2_UPD,
       VST3_UPD,
       VST4_UPD,
       VST2LN_UPD,
       VST3LN_UPD,
       VST4LN_UPD
     };
 
   } // end namespace ARMISD
 
   /// Define some predicates that are used for node matching.
   namespace ARM {
 
     // Declared here and defined out of line.
     // NOTE(review): presumably reports whether \p v is the inverse of a
     // contiguous bit-field mask - confirm against the definition.
     bool isBitFieldInvertedMask(unsigned v);
 
   } // end namespace ARM
 
   //===--------------------------------------------------------------------===//
   //  ARMTargetLowering - ARM Implementation of the TargetLowering interface
 
   class ARMTargetLowering : public TargetLowering {
   public:
     explicit ARMTargetLowering(const TargetMachine &TM,
                                const ARMSubtarget &STI);
 
     unsigned getJumpTableEncoding() const override;
     bool useSoftFloat() const override;
 
     SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
     /// ReplaceNodeResults - Replace the results of node with an illegal result
     /// type with new values built out of custom code.
     void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
                             SelectionDAG &DAG) const override;
 
     const char *getTargetNodeName(unsigned Opcode) const override;
 
     /// Report whether selects of the given \p Kind are supported. Every kind
     /// is accepted except ScalarCondVectorVal.
     bool isSelectSupported(SelectSupportKind Kind) const override {
       // ARM does not support scalar condition selects on vectors.
       if (Kind == ScalarCondVectorVal)
         return false;
       return true;
     }
 
     bool isReadOnly(const GlobalValue *GV) const;
 
     /// getSetCCResultType - Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
 
     MachineBasicBlock *
     EmitInstrWithCustomInserter(MachineInstr &MI,
                                 MachineBasicBlock *MBB) const override;
 
     void AdjustInstrPostInstrSelection(MachineInstr &MI,
                                        SDNode *Node) const override;
 
     SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
     SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
     SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
     bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;
 
     /// allowsMisalignedMemoryAccesses - Returns true if the target allows
     /// unaligned memory accesses of the specified type. Returns whether it
     /// is "fast" by reference in the second argument.
     bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
                                         unsigned Align,
                                         bool *Fast) const override;
 
     EVT getOptimalMemOpType(uint64_t Size,
                             unsigned DstAlign, unsigned SrcAlign,
                             bool IsMemset, bool ZeroMemset,
                             bool MemcpyStrSrc,
                             MachineFunction &MF) const override;
 
     bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
     bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
     bool isZExtFree(SDValue Val, EVT VT2) const override;
     bool isFNegFree(EVT VT) const override;
 
     bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
 
     bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
 
 
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
     bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
                                Type *Ty, unsigned AS,
                                Instruction *I = nullptr) const override;
 
     /// getScalingFactorCost - Return the cost of the scaling used in
     /// addressing mode represented by AM.
     /// If the AM is supported, the return value must be >= 0.
     /// If the AM is not supported, the return value must be negative.
     int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS) const override;
 
     bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
 
     /// Returns true if the addressing mode represented by AM is legal
     /// for the Thumb1 target, for a load/store of the specified type.
     bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
 
     /// isLegalICmpImmediate - Return true if the specified immediate is legal
     /// icmp immediate, that is the target has icmp instructions which can
     /// compare a register against the immediate without having to materialize
     /// the immediate into a register.
     bool isLegalICmpImmediate(int64_t Imm) const override;
 
     /// isLegalAddImmediate - Return true if the specified immediate is legal
     /// add immediate, that is the target has add instructions which can
     /// add a register and the immediate without having to materialize
     /// the immediate into a register.
     bool isLegalAddImmediate(int64_t Imm) const override;
 
     /// getPreIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if the node's address
     /// can be legally represented as pre-indexed load / store address.
     bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
                                    ISD::MemIndexedMode &AM,
                                    SelectionDAG &DAG) const override;
 
     /// getPostIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if this node can be
     /// combined with a load / store to form a post-indexed load / store.
     bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
                                     SDValue &Offset, ISD::MemIndexedMode &AM,
                                     SelectionDAG &DAG) const override;
 
     void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
                                        const APInt &DemandedElts,
                                        const SelectionDAG &DAG,
                                        unsigned Depth) const override;
 
 
     bool ExpandInlineAsm(CallInst *CI) const override;
 
     ConstraintType getConstraintType(StringRef Constraint) const override;
 
     /// Examine constraint string and operand type and determine a weight value.
     /// The operand object must already have been set up with the operand type.
     ConstraintWeight getSingleConstraintMatchWeight(
       AsmOperandInfo &info, const char *constraint) const override;
 
     std::pair<unsigned, const TargetRegisterClass *>
     getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                  StringRef Constraint, MVT VT) const override;
 
     const char *LowerXConstraint(EVT ConstraintVT) const override;
 
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
     /// true it means one of the asm constraint of the inline asm instruction
     /// being processed is 'm'.
     void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
                                       std::vector<SDValue> &Ops,
                                       SelectionDAG &DAG) const override;
 
     /// Translate a memory-constraint string into its InlineAsm constraint ID.
     /// Handles "Q", "o" and the two-character codes Um, Un, Uq, Us, Ut, Uv and
     /// Uy; every other string is forwarded to the generic TargetLowering
     /// implementation.
     unsigned
     getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
       if (ConstraintCode == "Q")
         return InlineAsm::Constraint_Q;
       if (ConstraintCode == "o")
         return InlineAsm::Constraint_o;
       // The remaining ARM codes are exactly two characters, led by 'U'.
       if (ConstraintCode.size() == 2 && ConstraintCode[0] == 'U') {
         switch (ConstraintCode[1]) {
         case 'm':
           return InlineAsm::Constraint_Um;
         case 'n':
           return InlineAsm::Constraint_Un;
         case 'q':
           return InlineAsm::Constraint_Uq;
         case 's':
           return InlineAsm::Constraint_Us;
         case 't':
           return InlineAsm::Constraint_Ut;
         case 'v':
           return InlineAsm::Constraint_Uv;
         case 'y':
           return InlineAsm::Constraint_Uy;
         default:
           // Unrecognized suffix: let the generic handler decide below.
           break;
         }
       }
       return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
     }
 
     /// Accessor for the ARMSubtarget pointer held by this lowering object.
     const ARMSubtarget *getSubtarget() const { return Subtarget; }
 
     /// getRegClassFor - Return the register class that should be used for the
     /// specified value type.
     const TargetRegisterClass *getRegClassFor(MVT VT) const override;
 
     /// Report whether casting a pointer from address space \p SrcAS to
     /// \p DestAS is a no-op. Neither argument is inspected: addrspacecasts
     /// are always noops here.
     bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
       return true;
     }
 
     bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize,
                                 unsigned &PrefAlign) const override;
 
     /// createFastISel - This method returns a target specific FastISel object,
     /// or null if the target does not support "fast" ISel.
     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                              const TargetLibraryInfo *libInfo) const override;
 
     Sched::Preference getSchedulingPreference(SDNode *N) const override;
 
     bool
     isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const override;
     bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
 
     /// isFPImmLegal - Returns true if the target can instruction select the
     /// specified FP immediate natively. If false, the legalizer will
     /// materialize the FP immediate as a load from a constant pool.
     bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
 
     bool getTgtMemIntrinsic(IntrinsicInfo &Info,
                             const CallInst &I,
                             MachineFunction &MF,
                             unsigned Intrinsic) const override;
 
     /// Returns true if it is beneficial to convert a load of a constant
     /// to just the constant itself.
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                            Type *Ty) const override;
 
     /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
     /// with this index.
     bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
                                  unsigned Index) const override;
 
     /// Returns true if an argument of type Ty needs to be passed in a
     /// contiguous block of registers in calling convention CallConv.
     bool functionArgumentNeedsConsecutiveRegisters(
         Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;
 
     /// If a physical register, this returns the register that receives the
     /// exception address on entry to an EH pad.
     unsigned
     getExceptionPointerRegister(const Constant *PersonalityFn) const override;
 
     /// If a physical register, this returns the register that receives the
     /// exception typeid on entry to a landing pad.
     unsigned
     getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
 
     Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const;
     Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                           AtomicOrdering Ord) const override;
     Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
                                 Value *Addr, AtomicOrdering Ord) const override;
 
     void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override;
 
     Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst,
                                   AtomicOrdering Ord) const override;
     Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst,
                                    AtomicOrdering Ord) const override;
 
     /// Largest interleave factor this target reports as supported: 4.
     unsigned getMaxSupportedInterleaveFactor() const override {
       return 4;
     }
 
     bool lowerInterleavedLoad(LoadInst *LI,
                               ArrayRef<ShuffleVectorInst *> Shuffles,
                               ArrayRef<unsigned> Indices,
                               unsigned Factor) const override;
     bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                                unsigned Factor) const override;
 
     bool shouldInsertFencesForAtomic(const Instruction *I) const override;
     TargetLoweringBase::AtomicExpansionKind
     shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
     bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
     TargetLoweringBase::AtomicExpansionKind
     shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
     bool shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
 
     bool useLoadStackGuardNode() const override;
 
     bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                    unsigned &Cost) const override;
 
     /// Decide whether stores may be merged into a single store of type
     /// \p MemVT. Only the bit width of \p MemVT is consulted; the address
     /// space and the DAG are not inspected.
     bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
                           const SelectionDAG &DAG) const override {
       // Do not merge to larger than i32.
       const unsigned MaxMergedBits = 32;
       return MemVT.getSizeInBits() <= MaxMergedBits;
     }
 
     bool isCheapToSpeculateCttz() const override;
     bool isCheapToSpeculateCtlz() const override;
 
     /// Returns true exactly when \p VT is a scalar integer type.
     bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
       const bool IsScalarInt = VT.isScalarInteger();
       return IsScalarInt;
     }
 
     /// Swift error support is unconditionally reported as available.
     bool supportSwiftError() const override { return true; }
 
     /// Returns the HasStandaloneRem flag stored in this object; \p VT is not
     /// consulted.
     bool hasStandaloneRem(EVT VT) const override { return HasStandaloneRem; }
 
     CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const;
     CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const;
 
     /// Returns true if \p VecTy is a legal interleaved access type. This
     /// function checks the vector element type and the overall width of the
     /// vector.
     bool isLegalInterleavedAccessType(VectorType *VecTy,
                                       const DataLayout &DL) const;
 
     /// Returns the number of interleaved accesses that will be generated when
     /// lowering accesses of the given type.
     unsigned getNumInterleavedAccesses(VectorType *VecTy,
                                        const DataLayout &DL) const;
 
     void finalizeLowering(MachineFunction &MF) const override;
 
     /// Return the correct alignment for the current calling convention.
     unsigned getABIAlignmentForCallingConv(Type *ArgTy,
                                            DataLayout DL) const override;
 
+    bool isDesirableToCommuteWithShift(const SDNode *N,
+                                       CombineLevel Level) const override;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,
                             MVT VT) const override;
 
   private:
     /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
     /// make the right decision when generating code for different targets.
     const ARMSubtarget *Subtarget;
 
     const TargetRegisterInfo *RegInfo;
 
     const InstrItineraryData *Itins;
 
     /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
     unsigned ARMPCLabelIndex;
 
     // TODO: remove this, and have shouldInsertFencesForAtomic do the proper
     // check.
     bool InsertFencesForAtomic;
 
     bool HasStandaloneRem = true;
 
     void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
     void addDRTypeForNEON(MVT VT);
     void addQRTypeForNEON(MVT VT);
     std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const;
 
     using RegsToPassVector = SmallVector<std::pair<unsigned, SDValue>, 8>;
 
     void PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG, SDValue Chain,
                           SDValue &Arg, RegsToPassVector &RegsToPass,
                           CCValAssign &VA, CCValAssign &NextVA,
                           SDValue &StackPtr,
                           SmallVectorImpl<SDValue> &MemOpChains,
                           ISD::ArgFlagsTy Flags) const;
     SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                  SDValue &Root, SelectionDAG &DAG,
                                  const SDLoc &dl) const;
 
     CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC,
                                             bool isVarArg) const;
     CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
                                   bool isVarArg) const;
     SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
                              const SDLoc &dl, SelectionDAG &DAG,
                              const CCValAssign &VA,
                              ISD::ArgFlagsTy Flags) const;
     SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                     const ARMSubtarget *Subtarget) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                             SelectionDAG &DAG) const;
     SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                  SelectionDAG &DAG,
                                  TLSModel::Model model) const;
     SDValue LowerGlobalTLSAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalTLSAddressWindows(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSignedALUO(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUnsignedALUO(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                             const ARMSubtarget *ST) const;
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *ST) const;
     SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
     void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,
                            SmallVectorImpl<SDValue> &Results) const;
     SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed,
                                    SDValue &Chain) const;
     SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const;
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
 
     unsigned getRegisterByName(const char* RegName, EVT VT,
                                SelectionDAG &DAG) const override;
 
     /// Reports whether a fused multiply-add is faster than a separate fmul
     /// and fadd pair. When this returns true, fmuladd intrinsics are expanded
     /// to FMAs; otherwise they are expanded to fmul + fadd.
     ///
     /// Always false for ARM: both fused and unfused multiply-add operations
     /// are supported and an fmul/fadd pair is already lowered to the unfused
     /// form, so it is unclear that there would be a gain, or that the gain
     /// would be worth the risk of correctness bugs.
     bool isFMAFasterThanFMulAndFAdd(EVT VT) const override {
       return false;
     }
 
     SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             const SDLoc &dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                             SDValue ThisVal) const;
 
     /// Split CSR handling is offered only for functions that use the
     /// CXX_FAST_TLS calling convention and also carry the NoUnwind attribute.
     bool supportSplitCSR(MachineFunction *MF) const override {
       const auto &Fn = MF->getFunction();
       if (Fn.getCallingConv() != CallingConv::CXX_FAST_TLS)
         return false;
       return Fn.hasFnAttribute(Attribute::NoUnwind);
     }
 
     void initializeSplitCSR(MachineBasicBlock *Entry) const override;
     void insertCopiesSplitCSR(
       MachineBasicBlock *Entry,
       const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
 
     SDValue
     LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                          const SmallVectorImpl<ISD::InputArg> &Ins,
                          const SDLoc &dl, SelectionDAG &DAG,
                          SmallVectorImpl<SDValue> &InVals) const override;
 
     int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &dl,
                        SDValue &Chain, const Value *OrigArg,
                        unsigned InRegsParamRecordIdx, int ArgOffset,
                        unsigned ArgSize) const;
 
     /// Const helper declared without a body here; \p ForceMutable defaults to
     /// false and \p Chain may be updated through its non-const reference.
     /// NOTE(review): presumably saves the remaining argument registers for a
     /// variadic function -- confirm in the definition.
     void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
                               const SDLoc &dl, SDValue &Chain,
                               unsigned ArgOffset, unsigned TotalArgRegsSaveSize,
                               bool ForceMutable = false) const;
 
     /// Override declared without a body here. Both \p CLI and \p InVals are
     /// non-const references, so either may be modified by the callee.
     /// NOTE(review): presumably returns the chain of the lowered call and
     /// stores its results in \p InVals -- confirm against the base class.
     SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
                       SmallVectorImpl<SDValue> &InVals) const override;
 
     /// HandleByVal - Target-specific cleanup for ByVal support.
     /// The parameters are unnamed in this declaration: a CCState pointer, an
     /// unsigned taken by non-const reference (so the callee may update it) and
     /// an unsigned taken by value.
     /// NOTE(review): presumably (State, Size, Align) as in the overridden
     /// base-class hook -- confirm.
     void HandleByVal(CCState *, unsigned &, unsigned) const override;
 
     /// Decide whether the described call qualifies for tail call
     /// optimization. A target that wants to perform tail call optimization is
     /// expected to implement this check.
     bool IsEligibleForTailCallOptimization(
         SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
         bool isCalleeStructRet, bool isCallerStructRet,
         const SmallVectorImpl<ISD::OutputArg> &Outs,
         const SmallVectorImpl<SDValue> &OutVals,
         const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
 
     /// Override declared without a body here; \p Outs is read-only.
     /// NOTE(review): presumably reports whether the return values described
     /// by \p Outs can be lowered directly under \p CallConv -- confirm against
     /// the base class.
     bool CanLowerReturn(CallingConv::ID CallConv,
                         MachineFunction &MF, bool isVarArg,
                         const SmallVectorImpl<ISD::OutputArg> &Outs,
                         LLVMContext &Context) const override;
 
     /// Override declared without a body here; \p Outs and \p OutVals are both
     /// read-only.
     /// NOTE(review): presumably \p OutVals holds one value per entry of
     /// \p Outs and the result is the chain of the return node -- confirm
     /// against the base class.
     SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                         const SmallVectorImpl<ISD::OutputArg> &Outs,
                         const SmallVectorImpl<SDValue> &OutVals,
                         const SDLoc &dl, SelectionDAG &DAG) const override;
 
     /// Override declared without a body here. \p Chain is a non-const
     /// reference and may be written by the callee.
     /// NOTE(review): by its name, true when the result of \p N feeds only a
     /// return -- confirm against the base class.
     bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
 
     /// Override declared without a body here; takes the IR call \p CI as a
     /// pointer-to-const.
     /// NOTE(review): presumably a pre-check that the call could become a tail
     /// call -- confirm against the base class.
     bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
 
     /// Const helper declared without a body here. Mind the operand order:
     /// \p FalseVal comes before \p TrueVal.
     /// NOTE(review): presumably builds a conditional-move node of type \p VT
     /// selecting on \p ARMcc / \p Cmp -- confirm in the definition.
     SDValue getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal, SDValue TrueVal,
                     SDValue ARMcc, SDValue CCR, SDValue Cmp,
                     SelectionDAG &DAG) const;
     /// Const helper declared without a body here. \p ARMcc is a non-const
     /// reference, so it is presumably an output describing the ARM condition
     /// that corresponds to \p CC -- TODO confirm in the definition.
     SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                       SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
     /// Const helper declared without a body here.
     /// NOTE(review): presumably emits a VFP floating-point compare of \p LHS
     /// and \p RHS, with \p InvalidOnQNaN choosing the variant that signals on
     /// quiet NaNs -- confirm in the definition.
     SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
                       const SDLoc &dl, bool InvalidOnQNaN) const;
     /// Const helper declared without a body here.
     /// NOTE(review): by its name it produces a second copy of the compare
     /// node \p Cmp -- confirm in the definition.
     SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
 
     /// Const helper declared without a body here.
     /// NOTE(review): presumably a special-case lowering for conditional
     /// branches on VFP compares; what it returns when no optimization applies
     /// is not visible here -- confirm in the definition.
     SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
 
     /// Const helper declared without a body here; works on instruction \p MI
     /// with block \p MBB and is given the dispatch block \p DispatchBB.
     /// NOTE(review): \p FI is presumably a frame index used by the SjLj
     /// exception-handling setup -- confirm in the definition.
     void SetupEntryBlockForSjLj(MachineInstr &MI, MachineBasicBlock *MBB,
                                 MachineBasicBlock *DispatchBB, int FI) const;
 
     /// Const helper declared without a body here.
     /// NOTE(review): presumably builds the SjLj exception dispatch block for
     /// \p MI in \p MBB -- confirm in the definition.
     void EmitSjLjDispatchBlock(MachineInstr &MI, MachineBasicBlock *MBB) const;
 
     /// Const helper declared without a body here; returns a bool.
     /// NOTE(review): presumably true when \p MI was rewritten to a
     /// flag-setting add/sub form -- confirm in the definition.
     bool RemapAddSubWithFlags(MachineInstr &MI, MachineBasicBlock *BB) const;
 
     /// Const helper declared without a body here; returns a MachineBasicBlock
     /// pointer.
     /// NOTE(review): presumably expands a struct-byval copy pseudo-instruction
     /// \p MI and returns the block where insertion continues -- confirm in
     /// the definition.
     MachineBasicBlock *EmitStructByval(MachineInstr &MI,
                                        MachineBasicBlock *MBB) const;
 
     /// Const helper declared without a body here; returns a MachineBasicBlock
     /// pointer.
     /// NOTE(review): presumably expands a stack-probe (__chkstk)
     /// pseudo-instruction -- confirm in the definition.
     MachineBasicBlock *EmitLowered__chkstk(MachineInstr &MI,
                                            MachineBasicBlock *MBB) const;
     /// Const helper declared without a body here; returns a MachineBasicBlock
     /// pointer.
     /// NOTE(review): presumably expands a divide-by-zero check
     /// pseudo-instruction -- confirm in the definition.
     MachineBasicBlock *EmitLowered__dbzchk(MachineInstr &MI,
                                            MachineBasicBlock *MBB) const;
   };
 
   /// Three-way classification with the enumerator values spelled out; they
   /// are the same 0, 1, 2 the compiler assigns implicitly.
   /// NOTE(review): going by the names, this tags a NEON modified immediate
   /// as a VMOV form, a VMVN form, or anything else -- confirm at the use
   /// sites.
   enum NEONModImmType {
     VMOVModImm = 0,
     VMVNModImm = 1,
     OtherModImm = 2
   };
 
   namespace ARM {
 
     /// Factory declared without a body here; receives \p funcInfo by
     /// reference and \p libInfo as a pointer-to-const and returns a FastISel
     /// pointer.
     /// NOTE(review): ownership of the returned object and whether it may be
     /// null are not visible here -- confirm at the definition.
     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                              const TargetLibraryInfo *libInfo);
 
   } // end namespace ARM
 
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_ARM_ARMISELLOWERING_H