Index: head/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
===================================================================
--- head/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp	(revision 280864)
+++ head/contrib/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp	(revision 280865)
@@ -1,537 +1,572 @@
//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a printer that converts from our internal representation
// of machine-dependent LLVM code to the AArch64 assembly language.
//
//===----------------------------------------------------------------------===//

+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
#include "AArch64.h"
#include "AArch64MCInstLower.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "InstPrinter/AArch64InstPrinter.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;

#define DEBUG_TYPE "asm-printer"

namespace {

class AArch64AsmPrinter : public AsmPrinter {
  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
  /// make the right decision when printing asm code for different targets.
  const AArch64Subtarget *Subtarget;

  AArch64MCInstLower MCInstLowering;
  StackMaps SM;

public:
  AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
      : AsmPrinter(TM, Streamer),
        Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
        MCInstLowering(OutContext, *this), SM(*this), AArch64FI(nullptr),
        LOHLabelCounter(0) {}

  const char *getPassName() const override {
    return "AArch64 Assembly Printer";
  }

  /// \brief Wrapper for MCInstLowering.lowerOperand() for the
  /// tblgen'erated pseudo lowering.
  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
    return MCInstLowering.lowerOperand(MO, MCOp);
  }

  void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
                     const MachineInstr &MI);
  void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
                       const MachineInstr &MI);

  /// \brief tblgen'erated driver function for lowering simple MI->MC
  /// pseudo instructions.
  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
                                   const MachineInstr *MI);

  void EmitInstruction(const MachineInstr *MI) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AsmPrinter::getAnalysisUsage(AU);
    AU.setPreservesAll();
  }

  bool runOnMachineFunction(MachineFunction &F) override {
    AArch64FI = F.getInfo<AArch64FunctionInfo>();
    return AsmPrinter::runOnMachineFunction(F);
  }

private:
  MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
  void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
  bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
  bool printAsmRegInClass(const MachineOperand &MO,
                          const TargetRegisterClass *RC, bool isVector,
                          raw_ostream &O);

  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
                       unsigned AsmVariant, const char *ExtraCode,
                       raw_ostream &O) override;
  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
                             unsigned AsmVariant, const char *ExtraCode,
                             raw_ostream &O) override;

  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);

  void EmitFunctionBodyEnd() override;

  MCSymbol *GetCPISymbol(unsigned CPID) const override;
  void EmitEndOfAsmFile(Module &M) override;
  AArch64FunctionInfo *AArch64FI;

  /// \brief Emit the LOHs contained in AArch64FI.
  void EmitLOHs();

  typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
  MInstToMCSymbol LOHInstToLabel;
  unsigned LOHLabelCounter;
};

} // end of anonymous namespace

//===----------------------------------------------------------------------===//

void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
  if (Subtarget->isTargetMachO()) {
    // Funny Darwin hack: This flag tells the linker that no global symbols
    // contain code that falls through to other global symbols (e.g. the
    // obvious implementation of multiple entry points). If this doesn't
    // occur, the linker can safely perform dead code stripping. Since LLVM
    // never generates code that does this, it is always safe to set.
    OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
    SM.serializeToStackMapSection();
  }

  // Emit a .data.rel section containing any stubs that were created.
  if (Subtarget->isTargetELF()) {
    const TargetLoweringObjectFileELF &TLOFELF =
        static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());

    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();

    // Output stubs for external and common global variables.
    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
    if (!Stubs.empty()) {
      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
      const DataLayout *TD = TM.getSubtargetImpl()->getDataLayout();

      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
        OutStreamer.EmitLabel(Stubs[i].first);
        OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
                                    TD->getPointerSize(0));
      }
      Stubs.clear();
    }
  }
}

MachineLocation
AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
  MachineLocation Location;
  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
  // Frame address. Currently handles register +- offset only.
  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
    Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
  else {
    DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
  }
  return Location;
}

void AArch64AsmPrinter::EmitLOHs() {
  SmallVector<MCSymbol *, 3> MCArgs;

  for (const auto &D : AArch64FI->getLOHContainer()) {
    for (const MachineInstr *MI : D.getArgs()) {
      MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI);
      assert(LabelIt != LOHInstToLabel.end() &&
             "Label hasn't been inserted for LOH related instruction");
      MCArgs.push_back(LabelIt->second);
    }
    OutStreamer.EmitLOHDirective(D.getKind(), MCArgs);
    MCArgs.clear();
  }
}

void AArch64AsmPrinter::EmitFunctionBodyEnd() {
  if (!AArch64FI->getLOHRelated().empty())
    EmitLOHs();
}

/// GetCPISymbol - Return the symbol for the specified constant pool entry.
MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const {
  // Darwin uses a linker-private symbol name for constant-pools (to
  // avoid addends on the relocation?), ELF has no such concept and
  // uses a normal private symbol.
  if (getDataLayout().getLinkerPrivateGlobalPrefix()[0])
    return OutContext.GetOrCreateSymbol(
        Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
        Twine(getFunctionNumber()) + "_" + Twine(CPID));

  return OutContext.GetOrCreateSymbol(
      Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" +
      Twine(getFunctionNumber()) + "_" + Twine(CPID));
}

void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
                                     raw_ostream &O) {
  const MachineOperand &MO = MI->getOperand(OpNum);
  switch (MO.getType()) {
  default:
    llvm_unreachable("<unknown operand type>");
  case MachineOperand::MO_Register: {
    unsigned Reg = MO.getReg();
    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
    assert(!MO.getSubReg() && "Subregs should be eliminated!");
    O << AArch64InstPrinter::getRegisterName(Reg);
    break;
  }
  case MachineOperand::MO_Immediate: {
    int64_t Imm = MO.getImm();
    O << '#' << Imm;
    break;
  }
  }
}

bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
                                          raw_ostream &O) {
  unsigned Reg = MO.getReg();
  switch (Mode) {
  default:
    return true; // Unknown mode.
  case 'w':
    Reg = getWRegFromXReg(Reg);
    break;
  case 'x':
    Reg = getXRegFromWReg(Reg);
    break;
  }

  O << AArch64InstPrinter::getRegisterName(Reg);
  return false;
}

// Prints the register in MO using class RC using the offset in the
// new register class. This should not be used for cross class
// printing.
bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
                                           const TargetRegisterClass *RC,
                                           bool isVector, raw_ostream &O) {
  assert(MO.isReg() && "Should only get here with a register!");
  const AArch64RegisterInfo *RI = static_cast<const AArch64RegisterInfo *>(
      TM.getSubtargetImpl()->getRegisterInfo());
  unsigned Reg = MO.getReg();
  unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
  assert(RI->regsOverlap(RegToPrint, Reg));
  O << AArch64InstPrinter::getRegisterName(
           RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName);
  return false;
}

bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
                                        unsigned AsmVariant,
                                        const char *ExtraCode, raw_ostream &O) {
  const MachineOperand &MO = MI->getOperand(OpNum);

  // First try the generic code, which knows about modifiers like 'c' and 'n'.
  if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
    return false;

  // Does this asm operand have a single letter operand modifier?
  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0)
      return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    default:
      return true; // Unknown modifier.
    case 'w': // Print W register
    case 'x': // Print X register
      if (MO.isReg())
        return printAsmMRegister(MO, ExtraCode[0], O);
      if (MO.isImm() && MO.getImm() == 0) {
        unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR;
        O << AArch64InstPrinter::getRegisterName(Reg);
        return false;
      }
      printOperand(MI, OpNum, O);
      return false;
    case 'b': // Print B register.
    case 'h': // Print H register.
    case 's': // Print S register.
    case 'd': // Print D register.
    case 'q': // Print Q register.
      if (MO.isReg()) {
        const TargetRegisterClass *RC;
        switch (ExtraCode[0]) {
        case 'b':
          RC = &AArch64::FPR8RegClass;
          break;
        case 'h':
          RC = &AArch64::FPR16RegClass;
          break;
        case 's':
          RC = &AArch64::FPR32RegClass;
          break;
        case 'd':
          RC = &AArch64::FPR64RegClass;
          break;
        case 'q':
          RC = &AArch64::FPR128RegClass;
          break;
        default:
          return true;
        }
        return printAsmRegInClass(MO, RC, false /* vector */, O);
      }
      printOperand(MI, OpNum, O);
      return false;
    }
  }

  // According to ARM, we should emit x and v registers unless we have a
  // modifier.
  if (MO.isReg()) {
    unsigned Reg = MO.getReg();

    // If this is a w or x register, print an x register.
    if (AArch64::GPR32allRegClass.contains(Reg) ||
        AArch64::GPR64allRegClass.contains(Reg))
      return printAsmMRegister(MO, 'x', O);

    // If this is a b, h, s, d, or q register, print it as a v register.
    return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */,
                              O);
  }

  printOperand(MI, OpNum, O);
  return false;
}

bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
                                              unsigned OpNum,
                                              unsigned AsmVariant,
                                              const char *ExtraCode,
                                              raw_ostream &O) {
  if (ExtraCode && ExtraCode[0])
    return true; // Unknown modifier.

  const MachineOperand &MO = MI->getOperand(OpNum);
  assert(MO.isReg() && "unexpected inline asm memory operand");
  O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]";
  return false;
}
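The two PrintAsm hooks above implement GCC-style inline-asm operand modifiers. As a quick illustration (a hypothetical user-level sketch, not part of this change, assuming an AArch64 compiler that supports these modifiers), this is how the single-letter ExtraCode values reach PrintAsmOperand from C++ source:

// Hypothetical inline-asm user; %x1/%w1 select the 64- and 32-bit names of
// the same GPR, which is exactly the printAsmMRegister 'x'/'w' path above.
long demo_gpr_modifiers(long v) {
  long out;
  asm("add %x0, %x1, %w1, sxtw" : "=r"(out) : "r"(v));
  return out;
}

// 'd' (like 'b'/'h'/'s'/'q') re-prints an FP/SIMD register in another
// register class, via the printAsmRegInClass path above.
double demo_fpr_modifier(double a) {
  double r;
  asm("fabs %d0, %d1" : "=w"(r) : "w"(a));
  return r;
}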
void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
                                               raw_ostream &OS) {
  unsigned NOps = MI->getNumOperands();
  assert(NOps == 4);
  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
  // cast away const; DIetc do not take const operands for some reason.
  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps - 1).getMetadata()));
  OS << V.getName();
  OS << " <- ";
  // Frame address. Currently handles register +- offset only.
  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
  OS << '[';
  printOperand(MI, 0, OS);
  OS << '+';
  printOperand(MI, 1, OS);
  OS << ']';
  OS << "+";
  printOperand(MI, NOps - 2, OS);
}

void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
                                      const MachineInstr &MI) {
  unsigned NumNOPBytes = MI.getOperand(1).getImm();

  SM.recordStackMap(MI);
  assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");

  // Scan ahead to trim the shadow.
  const MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::const_iterator MII(MI);
  ++MII;
  while (NumNOPBytes > 0) {
    if (MII == MBB.end() || MII->isCall() ||
        MII->getOpcode() == AArch64::DBG_VALUE ||
        MII->getOpcode() == TargetOpcode::PATCHPOINT ||
        MII->getOpcode() == TargetOpcode::STACKMAP)
      break;
    ++MII;
    NumNOPBytes -= 4;
  }

  // Emit nops.
  for (unsigned i = 0; i < NumNOPBytes; i += 4)
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
}

// Lower a patchpoint of the form:
// [<def>], <id>, <numBytes>, <target>, <numArgs>
void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
                                        const MachineInstr &MI) {
  SM.recordPatchPoint(MI);

  PatchPointOpers Opers(&MI);

  int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
  unsigned EncodedBytes = 0;
  if (CallTarget) {
    assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
           "High 16 bits of call target should be zero.");
    unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
    EncodedBytes = 16;
    // Materialize the jump address:
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi)
                                    .addReg(ScratchReg)
                                    .addImm((CallTarget >> 32) & 0xFFFF)
                                    .addImm(32));
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
                                    .addReg(ScratchReg)
                                    .addReg(ScratchReg)
                                    .addImm((CallTarget >> 16) & 0xFFFF)
                                    .addImm(16));
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
                                    .addReg(ScratchReg)
                                    .addReg(ScratchReg)
                                    .addImm(CallTarget & 0xFFFF)
                                    .addImm(0));
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg));
  }
  // Emit padding.
  unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
  assert(NumBytes >= EncodedBytes &&
         "Patchpoint can't request size less than the length of a call.");
  assert((NumBytes - EncodedBytes) % 4 == 0 &&
         "Invalid number of NOP bytes requested!");
  for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
}
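The MOVZ/MOVK chain above materializes a 48-bit call target 16 bits at a time (shifts 32, 16, 0). For readers who want to sanity-check the chunking, here is a self-contained sketch of the same arithmetic, with a hypothetical helper name:

#include <cassert>
#include <cstdint>

// Hypothetical helper mirroring LowerPATCHPOINT's materialization: MOVZ
// installs bits [47:32], and each MOVK patches 16 bits at its shift amount.
uint64_t materialize48(uint64_t CallTarget) {
  assert((CallTarget & 0xFFFFFFFFFFFFULL) == CallTarget &&
         "High 16 bits of call target should be zero.");
  uint64_t Reg = ((CallTarget >> 32) & 0xFFFF) << 32; // MOVZ reg, #hi, lsl #32
  Reg = (Reg & ~(0xFFFFULL << 16)) |
        (((CallTarget >> 16) & 0xFFFF) << 16);        // MOVK reg, #mid, lsl #16
  Reg = (Reg & ~0xFFFFULL) | (CallTarget & 0xFFFF);   // MOVK reg, #lo
  return Reg; // Equal to CallTarget for any 48-bit value.
}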
// Simple pseudo-instructions have their lowering (with expansion to real
// instructions) auto-generated.
#include "AArch64GenMCPseudoLowering.inc"

void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
  // Do any auto-generated pseudo lowerings.
  if (emitPseudoExpansionLowering(OutStreamer, MI))
    return;

  if (AArch64FI->getLOHRelated().count(MI)) {
    // Generate a label for LOH related instruction
    MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++);
    // Associate the instruction with the label
    LOHInstToLabel[MI] = LOHLabel;
    OutStreamer.EmitLabel(LOHLabel);
  }

  // Do any manual lowerings.
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::DBG_VALUE: {
    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
      SmallString<128> TmpStr;
      raw_svector_ostream OS(TmpStr);
      PrintDebugValueComment(MI, OS);
      OutStreamer.EmitRawText(StringRef(OS.str()));
    }
    return;
  }

  // Tail calls use pseudo instructions so they have the proper code-gen
  // attributes (isCall, isReturn, etc.). We lower them to the real
  // instruction here.
  case AArch64::TCRETURNri: {
    MCInst TmpInst;
    TmpInst.setOpcode(AArch64::BR);
    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
    EmitToStreamer(OutStreamer, TmpInst);
    return;
  }
  case AArch64::TCRETURNdi: {
    MCOperand Dest;
    MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
    MCInst TmpInst;
    TmpInst.setOpcode(AArch64::B);
    TmpInst.addOperand(Dest);
    EmitToStreamer(OutStreamer, TmpInst);
    return;
  }
-  case AArch64::TLSDESC_BLR: {
-    MCOperand Callee, Sym;
-    MCInstLowering.lowerOperand(MI->getOperand(0), Callee);
-    MCInstLowering.lowerOperand(MI->getOperand(1), Sym);
+  case AArch64::TLSDESC_CALLSEQ: {
+    /// lower this to:
+    ///    adrp  x0, :tlsdesc:var
+    ///    ldr   x1, [x0, #:tlsdesc_lo12:var]
+    ///    add   x0, x0, #:tlsdesc_lo12:var
+    ///    .tlsdesccall var
+    ///    blr   x1
+    ///    (TPIDR_EL0 offset now in x0)
+    const MachineOperand &MO_Sym = MI->getOperand(0);
+    MachineOperand MO_TLSDESC_LO12(MO_Sym), MO_TLSDESC(MO_Sym);
+    MCOperand Sym, SymTLSDescLo12, SymTLSDesc;
+    MO_TLSDESC_LO12.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGEOFF |
+                                   AArch64II::MO_NC);
+    MO_TLSDESC.setTargetFlags(AArch64II::MO_TLS | AArch64II::MO_PAGE);
+    MCInstLowering.lowerOperand(MO_Sym, Sym);
+    MCInstLowering.lowerOperand(MO_TLSDESC_LO12, SymTLSDescLo12);
+    MCInstLowering.lowerOperand(MO_TLSDESC, SymTLSDesc);

-    // First emit a relocation-annotation. This expands to no code, but requests
+    MCInst Adrp;
+    Adrp.setOpcode(AArch64::ADRP);
+    Adrp.addOperand(MCOperand::CreateReg(AArch64::X0));
+    Adrp.addOperand(SymTLSDesc);
+    EmitToStreamer(OutStreamer, Adrp);
+
+    MCInst Ldr;
+    Ldr.setOpcode(AArch64::LDRXui);
+    Ldr.addOperand(MCOperand::CreateReg(AArch64::X1));
+    Ldr.addOperand(MCOperand::CreateReg(AArch64::X0));
+    Ldr.addOperand(SymTLSDescLo12);
+    Ldr.addOperand(MCOperand::CreateImm(0));
+    EmitToStreamer(OutStreamer, Ldr);
+
+    MCInst Add;
+    Add.setOpcode(AArch64::ADDXri);
+    Add.addOperand(MCOperand::CreateReg(AArch64::X0));
+    Add.addOperand(MCOperand::CreateReg(AArch64::X0));
+    Add.addOperand(SymTLSDescLo12);
+    Add.addOperand(MCOperand::CreateImm(AArch64_AM::getShiftValue(0)));
+    EmitToStreamer(OutStreamer, Add);
+
+    // Emit a relocation-annotation. This expands to no code, but requests
    // the following instruction gets an R_AARCH64_TLSDESC_CALL.
    MCInst TLSDescCall;
    TLSDescCall.setOpcode(AArch64::TLSDESCCALL);
    TLSDescCall.addOperand(Sym);
    EmitToStreamer(OutStreamer, TLSDescCall);

-    // Other than that it's just a normal indirect call to the function loaded
-    // from the descriptor.
-    MCInst BLR;
-    BLR.setOpcode(AArch64::BLR);
-    BLR.addOperand(Callee);
-    EmitToStreamer(OutStreamer, BLR);
+    MCInst Blr;
+    Blr.setOpcode(AArch64::BLR);
+    Blr.addOperand(MCOperand::CreateReg(AArch64::X1));
+    EmitToStreamer(OutStreamer, Blr);

    return;
  }

  case TargetOpcode::STACKMAP:
    return LowerSTACKMAP(OutStreamer, SM, *MI);

  case TargetOpcode::PATCHPOINT:
    return LowerPATCHPOINT(OutStreamer, SM, *MI);
  }

  // Finally, do the automated lowerings for everything else.
  MCInst TmpInst;
  MCInstLowering.Lower(MI, TmpInst);
  EmitToStreamer(OutStreamer, TmpInst);
}
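For context, the new TLSDESC_CALLSEQ expansion corresponds to a general-dynamic TLS access like the one below (an illustrative sketch; the variable name is invented, and only R_AARCH64_TLSDESC_CALL is named because the code above requests it explicitly):

// Hypothetical translation unit, compiled for an ELF target with -fPIC.
thread_local int tls_counter;

int bump() {
  // Per the lowering comment above, the access below emits:
  //   adrp/ldr/add carrying the tlsdesc page and lo12 relocations,
  //   .tlsdesccall tagging the blr with R_AARCH64_TLSDESC_CALL,
  // and the descriptor call returns the TPIDR_EL0-relative offset of
  // tls_counter in x0.
  return ++tls_counter;
}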
// Force static initialization.
extern "C" void LLVMInitializeAArch64AsmPrinter() {
  RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
  RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
  RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64Target);
}
Index: head/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
===================================================================
--- head/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp	(revision 280864)
+++ head/contrib/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp	(revision 280865)
@@ -1,147 +1,147 @@
//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Local-dynamic access to thread-local variables proceeds in three stages.
//
// 1. The offset of this Module's thread-local area from TPIDR_EL0 is
//    calculated in much the same way as a general-dynamic TLS-descriptor
//    access against the special symbol _TLS_MODULE_BASE_.
// 2. The variable's offset from _TLS_MODULE_BASE_ is calculated using
//    instructions with "dtprel" modifiers.
// 3. These two are added, together with TPIDR_EL0, to obtain the variable's
//    true address.
//
// This is only better than general-dynamic access to the variable if two or
// more of the first stage TLS-descriptor calculations can be combined. This
// pass looks through a function and performs such combinations.
//
//===----------------------------------------------------------------------===//
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

namespace {
struct LDTLSCleanup : public MachineFunctionPass {
  static char ID;
  LDTLSCleanup() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
    if (AFI->getNumLocalDynamicTLSAccesses() < 2) {
      // No point folding accesses if there aren't at least two.
      return false;
    }

    MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
    return VisitNode(DT->getRootNode(), 0);
  }

  // Visit the dominator subtree rooted at Node in pre-order.
  // If TLSBaseAddrReg is non-null, then use that to replace any
  // TLS_base_addr instructions. Otherwise, create the register
  // when the first such instruction is seen, and then use it
  // as we encounter more instructions.
  bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
    MachineBasicBlock *BB = Node->getBlock();
    bool Changed = false;

    // Traverse the current block.
    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
         ++I) {
      switch (I->getOpcode()) {
-      case AArch64::TLSDESC_BLR:
+      case AArch64::TLSDESC_CALLSEQ:
        // Make sure it's a local dynamic access.
-        if (!I->getOperand(1).isSymbol() ||
-            strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
+        if (!I->getOperand(0).isSymbol() ||
+            strcmp(I->getOperand(0).getSymbolName(), "_TLS_MODULE_BASE_"))
          break;

        if (TLSBaseAddrReg)
          I = replaceTLSBaseAddrCall(I, TLSBaseAddrReg);
        else
          I = setRegister(I, &TLSBaseAddrReg);
        Changed = true;
        break;
      default:
        break;
      }
    }

    // Visit the children of this block in the dominator tree.
    for (MachineDomTreeNode *N : *Node) {
      Changed |= VisitNode(N, TLSBaseAddrReg);
    }

    return Changed;
  }

  // Replace the TLS_base_addr instruction I with a copy from
  // TLSBaseAddrReg, returning the new instruction.
  MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I,
                                       unsigned TLSBaseAddrReg) {
    MachineFunction *MF = I->getParent()->getParent();
    const AArch64TargetMachine *TM =
        static_cast<const AArch64TargetMachine *>(&MF->getTarget());
    const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();

    // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
    // code sequence assumes the address will be.
    MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
                                 TII->get(TargetOpcode::COPY),
                                 AArch64::X0).addReg(TLSBaseAddrReg);

    // Erase the TLS_base_addr instruction.
    I->eraseFromParent();

    return Copy;
  }

  // Create a virtual register in *TLSBaseAddrReg, and populate it by
  // inserting a copy instruction after I. Returns the new instruction.
  MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
    MachineFunction *MF = I->getParent()->getParent();
    const AArch64TargetMachine *TM =
        static_cast<const AArch64TargetMachine *>(&MF->getTarget());
    const AArch64InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();

    // Create a virtual register for the TLS base address.
    MachineRegisterInfo &RegInfo = MF->getRegInfo();
    *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass);

    // Insert a copy from X0 to TLSBaseAddrReg for later.
    MachineInstr *Next = I->getNextNode();
    MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
                                 TII->get(TargetOpcode::COPY),
                                 *TLSBaseAddrReg).addReg(AArch64::X0);

    return Copy;
  }

  const char *getPassName() const override {
    return "Local Dynamic TLS Access Clean-up";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};
}

char LDTLSCleanup::ID = 0;
FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() {
  return new LDTLSCleanup();
}
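To see what LDTLSCleanup buys, consider a sketch like the following (hypothetical code; local-dynamic generation additionally requires the aarch64-elf-ldtls-generation flag introduced below, since it is off by default in this revision):

// Two local-dynamic TLS accesses in one function: stage 1 (the TLSDESC
// call against _TLS_MODULE_BASE_) is identical for both, so the pass
// replaces the second call sequence with a COPY from the cached register.
static thread_local int a;
static thread_local int b;

int sum_both() {
  return a + b; // stages 2/3: dtprel offsets added to the shared base.
}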
Index: head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp	(revision 280864)
+++ head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp	(revision 280865)
@@ -1,8878 +1,8841 @@
//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");

namespace {
enum AlignMode { StrictAlign, NoStrictAlign };
}

static cl::opt<AlignMode>
Align(cl::desc("Load/store alignment support"),
      cl::Hidden, cl::init(NoStrictAlign),
      cl::values(
          clEnumValN(StrictAlign,   "aarch64-strict-align",
                     "Disallow all unaligned memory accesses"),
          clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
                     "Allow unaligned memory accesses"),
          clEnumValEnd));

// Place holder until extr generation is tested fully.
static cl::opt<bool>
EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
                            cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
                            cl::init(true));

static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
-                          cl::desc("Allow AArch64 SLI/SRI formation"),
-                          cl::init(false));
+                           cl::desc("Allow AArch64 SLI/SRI formation"),
+                           cl::init(false));
+
+// FIXME: The necessary dtprel relocations don't seem to be supported
+// well in the GNU bfd and gold linkers at the moment. Therefore, by
+// default, for now, fall back to GeneralDynamic code generation.
+cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
+    "aarch64-elf-ldtls-generation", cl::Hidden,
+    cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
+    cl::init(false));
+
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
    : TargetLowering(TM) {
  Subtarget = &TM.getSubtarget<AArch64Subtarget>();

  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
  }

  // Compute derived properties from the register classes
  computeRegisterProperties();

  // Provide all sorts of operation actions
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, Custom);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, Custom);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, Custom);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
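Most of these f128 Custom entries end up as soft-float libcalls. A one-line illustration (an assumption for AArch64 ELF targets, where long double is IEEE binary128):

// Hypothetical example: the Custom-lowered f128 FADD becomes a library call
// (e.g. the compiler-rt routine __addtf3) rather than a hardware FP op.
long double quad_add(long double a, long double b) {
  return a + b;
}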
  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Exception handling.
  // FIXME: These are guesses. Has this been defined yet?
  setExceptionPointerRegister(AArch64::X0);
  setExceptionSelectorRegister(AArch64::X1);

  // Constant pool entries
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
  setOperationAction(ISD::ADDC, MVT::i32, Custom);
  setOperationAction(ISD::ADDE, MVT::i32, Custom);
  setOperationAction(ISD::SUBC, MVT::i32, Custom);
  setOperationAction(ISD::SUBE, MVT::i32, Custom);
  setOperationAction(ISD::ADDC, MVT::i64, Custom);
  setOperationAction(ISD::ADDE, MVT::i64, Custom);
  setOperationAction(ISD::SUBC, MVT::i64, Custom);
  setOperationAction(ISD::SUBE, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  // Expand the undefined-at-zero variants of cttz/ctlz to their defined-at-zero
  // counterparts, which AArch64 supports directly.
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);

  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i64, Custom);

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

  // f16 is storage-only, so we promote operations to f32 if we know this is
  // valid, and ignore them otherwise. The operations not mentioned here will
  // fail to select, but this is not a major problem as no source language
  // should be emitting native f16 operations yet.
  setOperationAction(ISD::FADD, MVT::f16, Promote);
  setOperationAction(ISD::FDIV, MVT::f16, Promote);
  setOperationAction(ISD::FMUL, MVT::f16, Promote);
  setOperationAction(ISD::FSUB, MVT::f16, Promote);

  // v4f16 is also a storage-only type, so promote it to v4f32 when that is
  // known to be safe.
  setOperationAction(ISD::FADD, MVT::v4f16, Promote);
  setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
  setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
  setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
  setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
  setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
  AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
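In practice, "storage-only" means a scalar half-precision operation is widened, computed in f32, and narrowed again. A minimal sketch, assuming Clang's __fp16 storage type:

// With FADD on f16 marked Promote, this add is legalized roughly as:
//   fcvt s0, h0 ; fcvt s1, h1 ; fadd s0, s0, s1 ; fcvt h0, s0
__fp16 half_add(__fp16 a, __fp16 b) {
  return a + b;
}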
  // Expand all other v4f16 operations.
  // FIXME: We could generate better code by promoting some operations to
  // a pair of v4f32s
  setOperationAction(ISD::FABS, MVT::v4f16, Expand);
  setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
  setOperationAction(ISD::FMA, MVT::v4f16, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
  setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
  setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
  setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
  setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
  setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);

  // v8f16 is also a storage-only type, so expand it.
  setOperationAction(ISD::FABS, MVT::v8f16, Expand);
  setOperationAction(ISD::FADD, MVT::v8f16, Expand);
  setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
  setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
  setOperationAction(ISD::FMA, MVT::v8f16, Expand);
  setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
  setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
  setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
  setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
  setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
  setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
  setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
  setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);

  // AArch64 has implementations of a lot of rounding-like FP operations.
  static MVT RoundingTypes[] = {MVT::f32, MVT::f64};
  for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
    MVT Ty = RoundingTypes[I];
    setOperationAction(ISD::FFLOOR, Ty, Legal);
    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
    setOperationAction(ISD::FCEIL, Ty, Legal);
    setOperationAction(ISD::FRINT, Ty, Legal);
    setOperationAction(ISD::FTRUNC, Ty, Legal);
    setOperationAction(ISD::FROUND, Ty, Legal);
  }

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  if (Subtarget->isTargetMachO()) {
    // For iOS, we don't want the normal expansion of a libcall to sincos.
    // We want to issue a libcall to __sincos_stret to avoid memory traffic.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }

  // Make floating-point constants legal for the large code model, so they
  // don't become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
  }

  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);

  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::CONCAT_VECTORS);
  setTargetDAGCombine(ISD::STORE);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::VSELECT);

  setTargetDAGCombine(ISD::INTRINSIC_VOID);
  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);

  // Enable TBZ/TBNZ
  MaskAndBranchFoldingIsLegal = true;

  setMinFunctionAlignment(2);

  RequireStrictAlign = (Align == StrictAlign);

  setHasExtractBitsInsn(true);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    setOperationAction(ISD::FABS, MVT::v1f64, Expand);
    setOperationAction(ISD::FADD, MVT::v1f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
    setOperationAction(ISD::FMA, MVT::v1f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
    setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
    setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
    setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);

    setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

    setOperationAction(ISD::MUL, MVT::v1i64, Expand);

    // AArch64 doesn't have a direct vector ->f32 conversion instruction for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);

      for (MVT InnerVT : MVT::vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64};
    for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
      MVT Ty = RoundingVecTypes[I];
      setOperationAction(ISD::FFLOOR, Ty, Legal);
      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
      setOperationAction(ISD::FCEIL, Ty, Legal);
      setOperationAction(ISD::FRINT, Ty, Legal);
      setOperationAction(ISD::FTRUNC, Ty, Legal);
      setOperationAction(ISD::FROUND, Ty, Legal);
    }
  }

  // Prefer likely predicted branches to selects on out-of-order cores.
  if (Subtarget->isCortexA57())
    PredictableSelectIsExpensive = true;
}

void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);

    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);

    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
  }

  // Mark vector float intrinsics as expand.
  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
    setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
  }

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);

  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
  for (MVT InnerVT : MVT::all_valuetypes())
    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);

  // CNT supports only B element sizes.
  if (VT != MVT::v8i8 && VT != MVT::v16i8)
    setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);

  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);

  setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);

  if (Subtarget->isLittleEndian()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
      setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
    }
  }
}

void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR64RegClass);
  addTypeForNEON(VT, MVT::v2i32);
}

void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR128RegClass);
  addTypeForNEON(VT, MVT::v4i32);
}

EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}

/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in the
/// KnownZero/KnownOne bitsets.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
    const SelectionDAG &DAG, unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    break;
  case AArch64ISD::CSEL: {
    APInt KnownZero2, KnownOne2;
    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
    KnownZero &= KnownZero2;
    KnownOne &= KnownOne2;
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      unsigned BitWidth = KnownOne.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarType().getSizeInBits();
      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      unsigned BitWidth = KnownZero.getBitWidth();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
        assert(BitWidth >= 8 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
        KnownZero |= Mask;
      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
        assert(BitWidth >= 16 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
        KnownZero |= Mask;
      }
      break;
    } break;
    }
  }
  }
}
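The umaxv/uminv case marks every bit above the element width as known zero. The mask it builds can be restated in plain C++ (a standalone sketch, not LLVM API code):

#include <cassert>
#include <cstdint>

// Restates APInt::getHighBitsSet(32, 32 - 8) for a umaxv/uminv over i8
// lanes: the i32 result can only have its low 8 bits set.
uint32_t knownZeroMaskForByteReduction() {
  const unsigned BitWidth = 32; // result type is i32
  const unsigned ElemBits = 8;  // v8i8 / v16i8 elements
  uint32_t Mask = ~uint32_t(0) << ElemBits; // bits [31:8]
  assert(Mask == 0xFFFFFF00u);
  return Mask;
}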
MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
  return MVT::i64;
}

unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
  // FIXME: On AArch64, this depends on the type.
  // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(),
  // and the offset has to be a multiple of the related size in bytes.
  return 4095;
}

FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                      const TargetLibraryInfo *libInfo) const {
  return AArch64::createFastISel(funcInfo, libInfo);
}

const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default:
    return nullptr;
  case AArch64ISD::CALL: return "AArch64ISD::CALL";
  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
-  case AArch64ISD::TLSDESC_CALL: return "AArch64ISD::TLSDESC_CALL";
+  case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
  case AArch64ISD::ADC: return "AArch64ISD::ADC";
  case AArch64ISD::SBC: return "AArch64ISD::SBC";
  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
  case AArch64ISD::FMIN: return "AArch64ISD::FMIN";
  case AArch64ISD::FMAX: return "AArch64ISD::FMAX";
  case AArch64ISD::DUP: return "AArch64ISD::DUP";
  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
  case AArch64ISD::BICi: return "AArch64ISD::BICi";
  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
  case AArch64ISD::BSL: return "AArch64ISD::BSL";
  case AArch64ISD::NEG: return "AArch64ISD::NEG";
  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
  case AArch64ISD::REV16: return "AArch64ISD::REV16";
  case AArch64ISD::REV32: return "AArch64ISD::REV32";
  case AArch64ISD::REV64: return "AArch64ISD::REV64";
  case AArch64ISD::EXT: return "AArch64ISD::EXT";
  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
  case AArch64ISD::NOT: return "AArch64ISD::NOT";
  case AArch64ISD::BIT: return "AArch64ISD::BIT";
  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
  }
}

MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
                                    MachineBasicBlock *MBB) const {
  // We materialise the F128CSEL pseudo-instruction as some control flow and a
  // phi node:
  // OrigBB:
  //     [... previous instrs leading to comparison ...]
// b.ne TrueBB // b EndBB // TrueBB: // ; Fallthrough // EndBB: // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB] const TargetInstrInfo *TII = getTargetMachine().getSubtargetImpl()->getInstrInfo(); MachineFunction *MF = MBB->getParent(); const BasicBlock *LLVM_BB = MBB->getBasicBlock(); DebugLoc DL = MI->getDebugLoc(); MachineFunction::iterator It = MBB; ++It; unsigned DestReg = MI->getOperand(0).getReg(); unsigned IfTrueReg = MI->getOperand(1).getReg(); unsigned IfFalseReg = MI->getOperand(2).getReg(); unsigned CondCode = MI->getOperand(3).getImm(); bool NZCVKilled = MI->getOperand(4).isKill(); MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB); MF->insert(It, TrueBB); MF->insert(It, EndBB); // Transfer rest of current basic-block to EndBB EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)), MBB->end()); EndBB->transferSuccessorsAndUpdatePHIs(MBB); BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB); BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB); MBB->addSuccessor(TrueBB); MBB->addSuccessor(EndBB); // TrueBB falls through to the end. TrueBB->addSuccessor(EndBB); if (!NZCVKilled) { TrueBB->addLiveIn(AArch64::NZCV); EndBB->addLiveIn(AArch64::NZCV); } BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg) .addReg(IfTrueReg) .addMBB(TrueBB) .addReg(IfFalseReg) .addMBB(MBB); MI->eraseFromParent(); return EndBB; } MachineBasicBlock * AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *BB) const { switch (MI->getOpcode()) { default: #ifndef NDEBUG MI->dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); } } //===----------------------------------------------------------------------===// // AArch64 Lowering private implementation. //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Lowering Code //===----------------------------------------------------------------------===// /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 /// CC static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) { switch (CC) { default: llvm_unreachable("Unknown condition code!"); case ISD::SETNE: return AArch64CC::NE; case ISD::SETEQ: return AArch64CC::EQ; case ISD::SETGT: return AArch64CC::GT; case ISD::SETGE: return AArch64CC::GE; case ISD::SETLT: return AArch64CC::LT; case ISD::SETLE: return AArch64CC::LE; case ISD::SETUGT: return AArch64CC::HI; case ISD::SETUGE: return AArch64CC::HS; case ISD::SETULT: return AArch64CC::LO; case ISD::SETULE: return AArch64CC::LS; } } /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC. 
static void changeFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2) { CondCode2 = AArch64CC::AL; switch (CC) { default: llvm_unreachable("Unknown FP condition!"); case ISD::SETEQ: case ISD::SETOEQ: CondCode = AArch64CC::EQ; break; case ISD::SETGT: case ISD::SETOGT: CondCode = AArch64CC::GT; break; case ISD::SETGE: case ISD::SETOGE: CondCode = AArch64CC::GE; break; case ISD::SETOLT: CondCode = AArch64CC::MI; break; case ISD::SETOLE: CondCode = AArch64CC::LS; break; case ISD::SETONE: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GT; break; case ISD::SETO: CondCode = AArch64CC::VC; break; case ISD::SETUO: CondCode = AArch64CC::VS; break; case ISD::SETUEQ: CondCode = AArch64CC::EQ; CondCode2 = AArch64CC::VS; break; case ISD::SETUGT: CondCode = AArch64CC::HI; break; case ISD::SETUGE: CondCode = AArch64CC::PL; break; case ISD::SETLT: case ISD::SETULT: CondCode = AArch64CC::LT; break; case ISD::SETLE: case ISD::SETULE: CondCode = AArch64CC::LE; break; case ISD::SETNE: case ISD::SETUNE: CondCode = AArch64CC::NE; break; } } /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 /// CC usable with the vector instructions. Fewer operations are available /// without a real NZCV register, so we have to use less efficient combinations /// to get the same effect. static void changeVectorFPCCToAArch64CC(ISD::CondCode CC, AArch64CC::CondCode &CondCode, AArch64CC::CondCode &CondCode2, bool &Invert) { Invert = false; switch (CC) { default: // Mostly the scalar mappings work fine. changeFPCCToAArch64CC(CC, CondCode, CondCode2); break; case ISD::SETUO: Invert = true; // Fallthrough case ISD::SETO: CondCode = AArch64CC::MI; CondCode2 = AArch64CC::GE; break; case ISD::SETUEQ: case ISD::SETULT: case ISD::SETULE: case ISD::SETUGT: case ISD::SETUGE: // All of the compare-mask comparisons are ordered, but we can switch // between the two by a double inversion. E.g. ULE == !OGT. Invert = true; changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); break; } } static bool isLegalArithImmed(uint64_t C) { // Matches AArch64DAGToDAGISel::SelectArithImmed(). return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0); } static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDLoc dl, SelectionDAG &DAG) { EVT VT = LHS.getValueType(); if (VT.isFloatingPoint()) return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS); // The CMP instruction is just an alias for SUBS, and representing it as // SUBS means that it's possible to get CSE with subtract operations. // A later phase can perform the optimization of setting the destination // register to WZR/XZR if it ends up being unused. unsigned Opcode = AArch64ISD::SUBS; if (RHS.getOpcode() == ISD::SUB && isa(RHS.getOperand(0)) && cast(RHS.getOperand(0))->getZExtValue() == 0 && (CC == ISD::SETEQ || CC == ISD::SETNE)) { // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags // can be set differently by this operation. It comes down to whether // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then // everything is fine. If not then the optimization is wrong. Thus general // comparisons are only valid if op2 != 0. // So, finally, the only LLVM-native comparisons that don't mention C and V // are SETEQ and SETNE. They're the only ones we can safely use CMN for in // the absence of information about op2. 
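// For example, "(seteq %a, (sub 0, %b))" becomes "(AArch64ISD::ADDS %a, %b)"
// below and selects to "cmn w0, w1" (an alias of "adds wzr, w0, w1"),
// avoiding a separate negation before the compare.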
Opcode = AArch64ISD::ADDS; RHS = RHS.getOperand(1); } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) && cast<ConstantSDNode>(RHS)->getZExtValue() == 0 && !isUnsignedIntSetCC(CC)) { // Similarly, (CMP (and X, Y), 0) can be implemented with a TST // (a.k.a. ANDS) except that the flags are only guaranteed to work for one // of the signed comparisons. Opcode = AArch64ISD::ANDS; RHS = LHS.getOperand(1); LHS = LHS.getOperand(0); } return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS) .getValue(1); } static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) { SDValue Cmp; AArch64CC::CondCode AArch64CC; if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) { EVT VT = RHS.getValueType(); uint64_t C = RHSC->getZExtValue(); if (!isLegalArithImmed(C)) { // Constant does not fit, try adjusting it by one? switch (CC) { default: break; case ISD::SETLT: case ISD::SETGE: if ((VT == MVT::i32 && C != 0x80000000 && isLegalArithImmed((uint32_t)(C - 1))) || (VT == MVT::i64 && C != 0x80000000ULL && isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; RHS = DAG.getConstant(C, VT); } break; case ISD::SETULT: case ISD::SETUGE: if ((VT == MVT::i32 && C != 0 && isLegalArithImmed((uint32_t)(C - 1))) || (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; RHS = DAG.getConstant(C, VT); } break; case ISD::SETLE: case ISD::SETGT: if ((VT == MVT::i32 && C != INT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || (VT == MVT::i64 && C != INT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; RHS = DAG.getConstant(C, VT); } break; case ISD::SETULE: case ISD::SETUGT: if ((VT == MVT::i32 && C != UINT32_MAX && isLegalArithImmed((uint32_t)(C + 1))) || (VT == MVT::i64 && C != UINT64_MAX && isLegalArithImmed(C + 1ULL))) { CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; RHS = DAG.getConstant(C, VT); } break; } } } // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. // For the i8 operand, the largest immediate is 255, so this can be easily // encoded in the compare instruction. For the i16 operand, however, the // largest immediate cannot be encoded in the compare. // Therefore, use a sign extending load and cmn to avoid materializing the -1 // constant. For example, // movz w1, #65535 // ldrh w0, [x0, #0] // cmp w0, w1 // ==> // ldrsh w0, [x0, #0] // cmn w0, #1 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS) // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure // both the LHS and RHS are truly zero extended and to make sure the // transformation is profitable.
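// For example, for "(seteq (zextload i16, %p), 65535)": 65535 is not a legal
// arithmetic immediate, but as an i16 it is -1 and -(-1) == 1 is, so the code
// below rewrites the LHS with SIGN_EXTEND_INREG and compares against -1,
// selecting to the ldrsh/cmn sequence shown above.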
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa(RHS)) { if ((cast(RHS)->getZExtValue() >> 16 == 0) && isa(LHS)) { if (cast(LHS)->getExtensionType() == ISD::ZEXTLOAD && cast(LHS)->getMemoryVT() == MVT::i16 && LHS.getNode()->hasNUsesOfValue(1, 0)) { int16_t ValueofRHS = cast(RHS)->getZExtValue(); if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, DAG.getValueType(MVT::i16)); Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, RHS.getValueType()), CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); return Cmp; } } } } Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC = changeIntCCToAArch64CC(CC); AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); return Cmp; } static std::pair getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && "Unsupported value type"); SDValue Value, Overflow; SDLoc DL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); unsigned Opc = 0; switch (Op.getOpcode()) { default: llvm_unreachable("Unknown overflow instruction!"); case ISD::SADDO: Opc = AArch64ISD::ADDS; CC = AArch64CC::VS; break; case ISD::UADDO: Opc = AArch64ISD::ADDS; CC = AArch64CC::HS; break; case ISD::SSUBO: Opc = AArch64ISD::SUBS; CC = AArch64CC::VS; break; case ISD::USUBO: Opc = AArch64ISD::SUBS; CC = AArch64CC::LO; break; // Multiply needs a little bit extra work. case ISD::SMULO: case ISD::UMULO: { CC = AArch64CC::NE; bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false; if (Op.getValueType() == MVT::i32) { unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; // For a 32 bit multiply with overflow check we want the instruction // selector to generate a widening multiply (SMADDL/UMADDL). For that we // need to generate the following pattern: // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, DAG.getConstant(0, MVT::i64)); // On AArch64 the upper 32 bits are always zero extended for a 32 bit // operation. We need to clear out the upper 32 bits, because we used a // widening multiply that wrote all 64 bits. In the end this should be a // noop. Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); if (IsSigned) { // The signed overflow check requires more than just a simple check for // any bit set in the upper 32 bits of the result. These bits could be // just the sign bits of a negative number. To perform the overflow // check we have to arithmetic shift right the 32nd bit of the result by // 31 bits. Then we compare the result to the upper 32 bits. SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, DAG.getConstant(32, MVT::i64)); UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, DAG.getConstant(31, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) .getValue(1); } else { // The overflow check for unsigned multiply is easy. We only need to // check if any of the upper 32 bits are set. 
This can be done with a // CMP (shifted register). For that we need to generate the following // pattern: // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)) SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, DAG.getConstant(32, MVT::i64)); SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), UpperBits).getValue(1); } break; } assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); // For the 64 bit multiply Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); if (IsSigned) { SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, DAG.getConstant(63, MVT::i64)); // It is important that LowerBits is last, otherwise the arithmetic // shift will not be folded into the compare (SUBS). SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) .getValue(1); } else { SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), UpperBits).getValue(1); } break; } } // switch (...) if (Opc) { SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); // Emit the AArch64 operation with overflow check. Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); Overflow = Value.getValue(1); } return std::make_pair(Value, Overflow); } SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const { SmallVector<SDValue, 2> Ops; for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) Ops.push_back(Op.getOperand(i)); return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, SDLoc(Op)).first; } static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { SDValue Sel = Op.getOperand(0); SDValue Other = Op.getOperand(1); // If neither operand is a SELECT_CC, give up. if (Sel.getOpcode() != ISD::SELECT_CC) std::swap(Sel, Other); if (Sel.getOpcode() != ISD::SELECT_CC) return Op; // The folding we want to perform is: // (xor x, (select_cc a, b, cc, 0, -1) ) // --> // (csel x, (xor x, -1), cc ...) // // The latter will get matched to a CSINV instruction. ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); SDValue LHS = Sel.getOperand(0); SDValue RHS = Sel.getOperand(1); SDValue TVal = Sel.getOperand(2); SDValue FVal = Sel.getOperand(3); SDLoc dl(Sel); // FIXME: This could be generalized to non-integer comparisons. if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return Op; ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); // If the values aren't constants, this isn't the pattern we're looking for. if (!CFVal || !CTVal) return Op; // We can commute the SELECT_CC by inverting the condition. This // might be needed to make this fit into a CSINV pattern. if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } // If the constants line up, perform the transform!
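// For example, "(xor x, (select_cc a, b, eq, 0, -1))" lowers to:
//   cmp   a, b
//   csinv res, x, x, eq    ; res = (a == b) ? x : ~x
// i.e. a single CSINV instead of a select followed by an eor.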
if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); FVal = Other; TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, DAG.getConstant(-1ULL, Other.getValueType())); return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, CCVal, Cmp); } return Op; } static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDVTList VTs = DAG.getVTList(VT, MVT::i32); unsigned Opc; bool ExtraOp = false; switch (Op.getOpcode()) { default: llvm_unreachable("Invalid code"); case ISD::ADDC: Opc = AArch64ISD::ADDS; break; case ISD::SUBC: Opc = AArch64ISD::SUBS; break; case ISD::ADDE: Opc = AArch64ISD::ADCS; ExtraOp = true; break; case ISD::SUBE: Opc = AArch64ISD::SBCS; ExtraOp = true; break; } if (!ExtraOp) return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { // Let legalize expand this if it isn't a legal type yet. if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) return SDValue(); AArch64CC::CondCode CC; // The actual operation that sets the overflow or carry flag. SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); // We use 0 and 1 as false and true values. SDValue TVal = DAG.getConstant(1, MVT::i32); SDValue FVal = DAG.getConstant(0, MVT::i32); // We use an inverted condition, because the conditional select is inverted // too. This will allow it to be selected to a single instruction: // CSINC Wd, WZR, WZR, invert(cond). SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32); Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, CCVal, Overflow); SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow); } // Prefetch operands are: // 1: Address to prefetch // 2: bool isWrite // 3: int locality (0 = no locality ... 3 = extreme locality) // 4: bool isDataCache static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); bool IsStream = !Locality; // When the locality number is set if (Locality) { // The front-end should have filtered out the out-of-range values assert(Locality <= 3 && "Prefetch locality out-of-range"); // The locality degree is the opposite of the cache speed. // Put the number the other way around. // The encoding starts at 0 for level 1 Locality = 3 - Locality; } // Build the mask value encoding the expected behavior.
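// For example, a read prefetch of data with locality 3 encodes as
//   (0 << 4) | (0 << 3) | ((3 - 3) << 1) | 0 == 0b00000 (PLDL1KEEP),
// while a write prefetch with locality 0 (streaming) encodes as
//   (1 << 4) | (0 << 3) | (0 << 1) | 1 == 0b10001 (PSTL1STRM).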
unsigned PrfOp = (IsWrite << 4) | // Load/Store bit (!IsData << 3) | // IsDataCache bit (Locality << 1) | // Cache level bits (unsigned)IsStream; // Stream bit return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); RTLIB::Libcall LC; LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); return LowerF128Call(Op, DAG, LC); } SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { if (Op.getOperand(0).getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); // FP_ROUND node has a second operand indicating whether it is known to be // precise. That doesn't take part in the LibCall so we can't directly use // LowerF128Call. SDValue SrcVal = Op.getOperand(0); return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, /*isSigned*/ false, SDLoc(Op)).first; } static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. EVT InVT = Op.getOperand(0).getValueType(); EVT VT = Op.getValueType(); if (VT.getSizeInBits() < InVT.getSizeInBits()) { SDLoc dl(Op); SDValue Cv = DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), Op.getOperand(0)); return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); } if (VT.getSizeInBits() > InVT.getSizeInBits()) { SDLoc dl(Op); MVT ExtVT = MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), VT.getVectorNumElements()); SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); return DAG.getNode(Op.getOpcode(), dl, VT, Ext); } // Type changing conversions are illegal. return Op; } SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { if (Op.getOperand(0).getValueType().isVector()) return LowerVectorFP_TO_INT(Op, DAG); if (Op.getOperand(0).getValueType() != MVT::f128) { // It's legal except when f128 is involved return Op; } RTLIB::Libcall LC; if (Op.getOpcode() == ISD::FP_TO_SINT) LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); SmallVector Ops; for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) Ops.push_back(Op.getOperand(i)); return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, SDLoc(Op)).first; } static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. // Any additional optimization in this function should be recorded // in the cost tables. EVT VT = Op.getValueType(); SDLoc dl(Op); SDValue In = Op.getOperand(0); EVT InVT = In.getValueType(); if (VT.getSizeInBits() < InVT.getSizeInBits()) { MVT CastVT = MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), InVT.getVectorNumElements()); In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0)); } if (VT.getSizeInBits() > InVT.getSizeInBits()) { unsigned CastOpc = Op.getOpcode() == ISD::SINT_TO_FP ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; EVT CastVT = VT.changeVectorElementTypeToInteger(); In = DAG.getNode(CastOpc, dl, CastVT, In); return DAG.getNode(Op.getOpcode(), dl, VT, In); } return Op; } SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVectorINT_TO_FP(Op, DAG); // i128 conversions are libcalls. if (Op.getOperand(0).getValueType() == MVT::i128) return SDValue(); // Other conversions are legal, unless it's to the completely software-based // fp128. if (Op.getValueType() != MVT::f128) return Op; RTLIB::Libcall LC; if (Op.getOpcode() == ISD::SINT_TO_FP) LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); else LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); return LowerF128Call(Op, DAG, LC); } SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { // For iOS, we want to call an alternative entry point: __sincos_stret, // which returns the values in two S / D registers. SDLoc dl(Op); SDValue Arg = Op.getOperand(0); EVT ArgVT = Arg.getValueType(); Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); ArgListTy Args; ArgListEntry Entry; Entry.Node = Arg; Entry.Ty = ArgTy; Entry.isSExt = false; Entry.isZExt = false; Args.push_back(Entry); const char *LibcallName = (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); std::pair CallResult = LowerCallTo(CLI); return CallResult.first; } static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { if (Op.getValueType() != MVT::f16) return SDValue(); assert(Op.getOperand(0).getValueType() == MVT::i16); SDLoc DL(Op); Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); return SDValue( DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, DAG.getTargetConstant(AArch64::hsub, MVT::i32)), 0); } static EVT getExtensionTo64Bits(const EVT &OrigVT) { if (OrigVT.getSizeInBits() >= 64) return OrigVT; assert(OrigVT.isSimple() && "Expecting a simple value type"); MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; switch (OrigSimpleTy) { default: llvm_unreachable("Unexpected Vector Type"); case MVT::v2i8: case MVT::v2i16: return MVT::v2i32; case MVT::v4i8: return MVT::v4i16; } } static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode) { // The vector originally had a size of OrigTy. It was then extended to ExtTy. // We expect the ExtTy to be 128-bits total. If the OrigTy is less than // 64-bits we need to insert a new extension so that it will be 64-bits. assert(ExtTy.is128BitVector() && "Unexpected extension size"); if (OrigTy.getSizeInBits() >= 64) return N; // Must extend size to at least 64 bits to be used as an operand for VMULL. 
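// For example, a v4i8 value that was sign-extended to v4i32 is re-extended
// to v4i16 here, so the widening multiply can use the 64-bit operand form,
// e.g. "smull v0.4s, v1.4h, v2.4h".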
EVT NewVT = getExtensionTo64Bits(OrigTy); return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); } static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, bool isSigned) { EVT VT = N->getValueType(0); if (N->getOpcode() != ISD::BUILD_VECTOR) return false; for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { SDNode *Elt = N->getOperand(i).getNode(); if (ConstantSDNode *C = dyn_cast(Elt)) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); unsigned HalfSize = EltSize / 2; if (isSigned) { if (!isIntN(HalfSize, C->getSExtValue())) return false; } else { if (!isUIntN(HalfSize, C->getZExtValue())) return false; } continue; } return false; } return true; } static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, N->getOperand(0)->getValueType(0), N->getValueType(0), N->getOpcode()); assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); EVT VT = N->getValueType(0); unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; unsigned NumElts = VT.getVectorNumElements(); MVT TruncVT = MVT::getIntegerVT(EltSize); SmallVector Ops; for (unsigned i = 0; i != NumElts; ++i) { ConstantSDNode *C = cast(N->getOperand(i)); const APInt &CInt = C->getAPIntValue(); // Element types smaller than 32 bits are not legal, so use i32 elements. // The values are implicitly truncated so sext vs. zext doesn't matter. Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); } return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::getVectorVT(TruncVT, NumElts), Ops); } static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::SIGN_EXTEND) return true; if (isExtendedBUILD_VECTOR(N, DAG, true)) return true; return false; } static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { if (N->getOpcode() == ISD::ZERO_EXTEND) return true; if (isExtendedBUILD_VECTOR(N, DAG, false)) return true; return false; } static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isSignExtended(N0, DAG) && isSignExtended(N1, DAG); } return false; } static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { unsigned Opcode = N->getOpcode(); if (Opcode == ISD::ADD || Opcode == ISD::SUB) { SDNode *N0 = N->getOperand(0).getNode(); SDNode *N1 = N->getOperand(1).getNode(); return N0->hasOneUse() && N1->hasOneUse() && isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); } return false; } static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { // Multiplications are only custom-lowered for 128-bit vectors so that // VMULL can be detected. Otherwise v2i64 multiplications are not legal. EVT VT = Op.getValueType(); assert(VT.is128BitVector() && VT.isInteger() && "unexpected type for custom-lowering ISD::MUL"); SDNode *N0 = Op.getOperand(0).getNode(); SDNode *N1 = Op.getOperand(1).getNode(); unsigned NewOpc = 0; bool isMLA = false; bool isN0SExt = isSignExtended(N0, DAG); bool isN1SExt = isSignExtended(N1, DAG); if (isN0SExt && isN1SExt) NewOpc = AArch64ISD::SMULL; else { bool isN0ZExt = isZeroExtended(N0, DAG); bool isN1ZExt = isZeroExtended(N1, DAG); if (isN0ZExt && isN1ZExt) NewOpc = AArch64ISD::UMULL; else if (isN1SExt || isN1ZExt) { // Look for (s/zext A + s/zext B) * (s/zext C). 
We want to turn these // into (s/zext A * s/zext C) + (s/zext B * s/zext C) if (isN1SExt && isAddSubSExt(N0, DAG)) { NewOpc = AArch64ISD::SMULL; isMLA = true; } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { NewOpc = AArch64ISD::UMULL; isMLA = true; } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { std::swap(N0, N1); NewOpc = AArch64ISD::UMULL; isMLA = true; } } if (!NewOpc) { if (VT == MVT::v2i64) // Fall through to expand this. It is not legal. return SDValue(); else // Other vector multiplications are legal. return Op; } } // Legalize to a S/UMULL instruction SDLoc DL(Op); SDValue Op0; SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); if (!isMLA) { Op0 = skipExtensionForVectorMULL(N0, DAG); assert(Op0.getValueType().is64BitVector() && Op1.getValueType().is64BitVector() && "unexpected types for extended operands to VMULL"); return DAG.getNode(NewOpc, DL, VT, Op0, Op1); } // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during // isel lowering to take advantage of no-stall back to back s/umul + s/umla. // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); EVT Op1VT = Op1.getValueType(); return DAG.getNode(N0->getOpcode(), DL, VT, DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), DAG.getNode(NewOpc, DL, VT, DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } SDValue AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: llvm_unreachable("unimplemented operand"); return SDValue(); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::BR_CC: return LowerBR_CC(Op, DAG); case ISD::SELECT: return LowerSELECT(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::JumpTable: return LowerJumpTable(Op, DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::ADDC: case ISD::ADDE: case ISD::SUBC: case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: case ISD::USUBO: case ISD::SMULO: case ISD::UMULO: return LowerXALUO(Op, DAG); case ISD::FADD: return LowerF128Call(Op, DAG, RTLIB::ADD_F128); case ISD::FSUB: return LowerF128Call(Op, DAG, RTLIB::SUB_F128); case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128); case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128); case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG); case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerVectorSRA_SRL_SHL(Op, DAG); case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG); case 
ISD::SRL_PARTS: case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG); case ISD::CTPOP: return LowerCTPOP(Op, DAG); case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::AND: return LowerVectorAND(Op, DAG); case ISD::OR: return LowerVectorOR(Op, DAG); case ISD::XOR: return LowerXOR(Op, DAG); case ISD::PREFETCH: return LowerPREFETCH(Op, DAG); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG); case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); case ISD::FSINCOS: return LowerFSINCOS(Op, DAG); case ISD::MUL: return LowerMUL(Op, DAG); } } /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { return 2; } //===----------------------------------------------------------------------===// // Calling Convention Implementation //===----------------------------------------------------------------------===// #include "AArch64GenCallingConv.inc" /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const { switch (CC) { default: llvm_unreachable("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; case CallingConv::GHC: return CC_AArch64_GHC; case CallingConv::C: case CallingConv::Fast: if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; } } SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); // Assign locations to all of the incoming arguments. SmallVector ArgLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); // At this point, Ins[].VT may already be promoted to i32. To correctly // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here // we use a special version of AnalyzeFormalArguments to pass in ValVT and // LocVT. unsigned NumArgs = Ins.size(); Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); unsigned CurArgIdx = 0; for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Ins[i].VT; std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx); CurArgIdx = Ins[i].OrigArgIndex; // Get type of the original argument. EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) ValVT = MVT::i8; else if (ActualMVT == MVT::i16) ValVT = MVT::i16; CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } assert(ArgLocs.size() == Ins.size()); SmallVector ArgValues; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (Ins[i].Flags.isByVal()) { // Byval is used for HFAs in the PCS, but the system should work in a // non-compliant manner for larger structs. 
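// For example, a 12-byte byval argument is rounded up below to
// NumRegs = (12 + 7) / 8 == 2 eight-byte slots, creating a 16-byte fixed
// object whose address becomes the formal argument's value.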
EVT PtrTy = getPointerTy(); int Size = Ins[i].Flags.getByValSize(); unsigned NumRegs = (Size + 7) / 8; // FIXME: This works on big-endian for composite byvals, which are the common // case. It should also work for fundamental types too. unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); InVals.push_back(FrameIdxN); continue; } if (VA.isRegLoc()) { // Arguments stored in registers. EVT RegVT = VA.getLocVT(); SDValue ArgValue; const TargetRegisterClass *RC; if (RegVT == MVT::i32) RC = &AArch64::GPR32RegClass; else if (RegVT == MVT::i64) RC = &AArch64::GPR64RegClass; else if (RegVT == MVT::f16) RC = &AArch64::FPR16RegClass; else if (RegVT == MVT::f32) RC = &AArch64::FPR32RegClass; else if (RegVT == MVT::f64 || RegVT.is64BitVector()) RC = &AArch64::FPR64RegClass; else if (RegVT == MVT::f128 || RegVT.is128BitVector()) RC = &AArch64::FPR128RegClass; else llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); // Transform the arguments in physical registers into virtual ones. unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); // If this is an 8, 16 or 32-bit value, it is really passed promoted // to 64 bits. Insert an assert[sz]ext to capture this, then // truncate to the right size. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); break; case CCValAssign::AExt: case CCValAssign::SExt: case CCValAssign::ZExt: // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt // nodes after our lowering. assert(RegVT == Ins[i].VT && "incorrect register location selected"); break; } InVals.push_back(ArgValue); } else { // VA.isRegLoc() assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); unsigned ArgOffset = VA.getLocMemOffset(); unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; uint32_t BEAlign = 0; if (!Subtarget->isLittleEndian() && ArgSize < 8 && !Ins[i].Flags.isInConsecutiveRegs()) BEAlign = 8 - ArgSize; int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); // Create load nodes to retrieve arguments from the stack. SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); SDValue ArgValue; // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; MVT MemVT = VA.getValVT(); switch (VA.getLocInfo()) { default: break; case CCValAssign::BCvt: MemVT = VA.getLocVT(); break; case CCValAssign::SExt: ExtType = ISD::SEXTLOAD; break; case CCValAssign::ZExt: ExtType = ISD::ZEXTLOAD; break; case CCValAssign::AExt: ExtType = ISD::EXTLOAD; break; } ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, MachinePointerInfo::getFixedStack(FI), MemVT, false, false, false, 0); InVals.push_back(ArgValue); } } // varargs if (isVarArg) { if (!Subtarget->isTargetDarwin()) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. saveVarArgRegisters(CCInfo, DAG, DL, Chain); } AArch64FunctionInfo *AFI = MF.getInfo(); // This will point to the next argument passed via stack. unsigned StackOffset = CCInfo.getNextStackOffset(); // We currently pass all varargs at 8-byte alignment. 
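// For example, a next-stack-offset of 12 becomes (12 + 7) & ~7 == 16 below,
// so the va_list area starts at the next 8-byte boundary.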
StackOffset = ((StackOffset + 7) & ~7); AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); } AArch64FunctionInfo *FuncInfo = MF.getInfo(); unsigned StackArgSize = CCInfo.getNextStackOffset(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { // This is a non-standard ABI so by fiat I say we're allowed to make full // use of the stack area to be popped, which must be aligned to 16 bytes in // any case: StackArgSize = RoundUpToAlignment(StackArgSize, 16); // If we're expected to restore the stack (e.g. fastcc) then we'll be adding // a multiple of 16. FuncInfo->setArgumentStackToRestore(StackArgSize); // This realignment carries over to the available bytes below. Our own // callers will guarantee the space is free by giving an aligned value to // CALLSEQ_START. } // Even if we're not expected to free up the space, it's useful to know how // much is there while considering tail calls (because we can reuse it). FuncInfo->setBytesInStackArgArea(StackArgSize); return Chain; } void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, SDValue &Chain) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); SmallVector MemOps; static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3, AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7 }; static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, MachinePointerInfo::getStack(i * 8), false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, DAG.getConstant(8, getPointerTy())); } } FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); if (Subtarget->hasFPARMv8()) { static const MCPhysReg FPRArgRegs[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); int FPRIdx = 0; if (FPRSaveSize != 0) { FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN, MachinePointerInfo::getStack(i * 16), false, false, 0); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, DAG.getConstant(16, getPointerTy())); } } FuncInfo->setVarArgsFPRIndex(FPRIdx); FuncInfo->setVarArgsFPRSize(FPRSaveSize); } if (!MemOps.empty()) { Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } } /// LowerCallResult - Lower the result 
values of a call into the /// appropriate copies out of appropriate physical registers. SDValue AArch64TargetLowering::LowerCallResult( SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; // Assign locations to each value returned by this call. SmallVector RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign VA = RVLocs[i]; // Pass 'this' value directly from the argument to return value, to avoid // reg unit interference if (i == 0 && isThisReturn) { assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && "unexpected return calling convention register assignment"); InVals.push_back(ThisVal); continue; } SDValue Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); Chain = Val.getValue(1); InFlag = Val.getValue(2); switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); break; } InVals.push_back(Val); } return Chain; } bool AArch64TargetLowering::isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const { // For CallingConv::C this function knows whether the ABI needs // changing. That's not true for other conventions so they will have to opt in // manually. if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) return false; const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; // Byval parameters hand the function a pointer directly into the stack area // we want to reuse during a tail call. Working around this *is* possible (see // X86) but less efficient and uglier in LowerCall. for (Function::const_arg_iterator i = CallerF->arg_begin(), e = CallerF->arg_end(); i != e; ++i) if (i->hasByValAttr()) return false; if (getTargetMachine().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; } // Externally-defined functions with weak linkage should not be // tail-called on AArch64 when the OS does not support dynamic // pre-emption of symbols, as the AAELF spec requires normal calls // to undefined weak functions to be replaced with a NOP or jump to the // next instruction. The behaviour of branch instructions in this // situation (as used for tail calls) is implementation-defined, so we // cannot rely on the linker replacing the tail call with a return. if (GlobalAddressSDNode *G = dyn_cast(Callee)) { const GlobalValue *GV = G->getGlobal(); const Triple TT(getTargetMachine().getTargetTriple()); if (GV->hasExternalWeakLinkage() && (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) return false; } // Now we search for cases where we can use a tail call without changing the // ABI. Sibcall is used in some places (particularly gcc) to refer to this // concept. 
// I want anyone implementing a new calling convention to think long and hard // about this assert. assert((!isVarArg || CalleeCC == CallingConv::C) && "Unexpected variadic calling convention"); if (isVarArg && !Outs.empty()) { // At least two cases here: if caller is fastcc then we can't have any // memory arguments (we'd be expected to clean up the stack afterwards). If // caller is C then we could potentially use its argument area. // FIXME: for now we take the most conservative of these in both cases: // disallow all variadic memory operands. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; } // If the calling conventions do not match, then we'd better make sure the // results are returned in the same way as what the caller expects. if (!CCMatch) { SmallVector<CCValAssign, 16> RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); SmallVector<CCValAssign, 16> RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); if (RVLocs1.size() != RVLocs2.size()) return false; for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) return false; if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) return false; if (RVLocs1[i].isRegLoc()) { if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) return false; } else { if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) return false; } } } // Nothing more to check if the callee is taking no arguments if (Outs.empty()) return true; SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); // If the stack arguments for this call would fit into our own save area then // the call can be made tail. return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); } SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo *MFI, int ClobberedFI) const { SmallVector<SDValue, 8> ArgChains; int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; // Include the original chain at the beginning of the list. When this is // used by target LowerCall hooks, this helps legalize find the // CALLSEQ_BEGIN node. ArgChains.push_back(Chain); // Add a chain value for each incoming stack-argument load that overlaps the // clobbered slot. for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U) if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) if (FI->getIndex() < 0) { int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); int64_t InLastByte = InFirstByte; InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || (FirstByte <= InFirstByte && InFirstByte <= LastByte)) ArgChains.push_back(SDValue(L, 1)); } // Build a tokenfactor for all the chains.
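// For example, if the slot being clobbered covers bytes [16, 23] and an
// incoming argument load covers [20, 27], the interval test above saw
// 16 <= 20 <= 23 and recorded that load's chain, so the TokenFactor orders
// the store after the load instead of letting it clobber the argument.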
return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); } bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const { return CallCC == CallingConv::Fast && TailCallOpt; } bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { return CallCC == CallingConv::Fast; } /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, /// and add input and output parameter nodes. SDValue AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl &InVals) const { SelectionDAG &DAG = CLI.DAG; SDLoc &DL = CLI.DL; SmallVector &Outs = CLI.Outs; SmallVector &OutVals = CLI.OutVals; SmallVector &Ins = CLI.Ins; SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; MachineFunction &MF = DAG.getMachineFunction(); bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); bool IsThisReturn = false; AArch64FunctionInfo *FuncInfo = MF.getInfo(); bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; bool IsSibCall = false; if (IsTailCall) { // Check if it's really possible to do a tail call. IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); // A sibling call is one where we're under the usual C ABI and not planning // to change that but can still do a tail call: if (!TailCallOpt && IsTailCall) IsSibCall = true; if (IsTailCall) ++NumTailCalls; } // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, *DAG.getContext()); if (IsVarArg) { // Handle fixed and variable vector arguments differently. // Variable vector arguments always go into memory. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/ !Outs[i].IsFixed); bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } } else { // At this point, Outs[].VT may already be promoted to i32. To correctly // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here // we use a special version of AnalyzeCallOperands to pass in ValVT and // LocVT. unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ValVT = Outs[i].VT; // Get type of the original argument. EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, /*AllowUnknown*/ true); MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. 
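// For example, an i8 argument reaches this point with Outs[i].VT already
// promoted to i32; re-running the assignment with ValVT == i8 lets the
// truncated byte be stored in its own slot (as on Darwin's packed stack
// layout) rather than as a full 32-bit word.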
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) ValVT = MVT::i8; else if (ActualMVT == MVT::i16) ValVT = MVT::i16; CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); assert(!Res && "Call operand has unhandled type"); (void)Res; } } // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); if (IsSibCall) { // Since we're not changing the ABI to make this a tail call, the memory // operands are already available in the caller's incoming argument space. NumBytes = 0; } // FPDiff is the byte offset of the call's argument area from the callee's. // Stores to callee stack arguments will be placed in FixedStackSlots offset // by this amount for a tail call. In a sibling call it must be 0 because the // caller will deallocate the entire stack and the callee still expects its // arguments to begin at SP+0. Completely unused for non-tail calls. int FPDiff = 0; if (IsTailCall && !IsSibCall) { unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); // Since callee will pop argument stack as a tail call, we must keep the // popped size 16-byte aligned. NumBytes = RoundUpToAlignment(NumBytes, 16); // FPDiff will be negative if this tail call requires more space than we // would automatically have in our incoming argument space. Positive if we // can actually shrink the stack. FPDiff = NumReusableBytes - NumBytes; // The stack pointer must be 16-byte aligned at all times it's used for a // memory operation, which in practice means at *all* times and in // particular across call boundaries. Therefore our own arguments started at // a 16-byte aligned SP and the delta applied for the tail call should // satisfy the same constraint. assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); } // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy()); SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; SmallVector<SDValue, 8> MemOpChains; // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; ++i, ++realArgIdx) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[realArgIdx]; ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; // Promote the value if needed. switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: break; case CCValAssign::SExt: Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::ZExt: Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::AExt: if (Outs[realArgIdx].ArgVT == MVT::i1) { // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
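// For example, passing "true" emits (anyext (zext i8 (trunc i1 %x))) below,
// so the callee can rely on the argument's low byte being exactly 0 or 1.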
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); } Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; case CCValAssign::FPExt: Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); break; } if (VA.isRegLoc()) { if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { assert(VA.getLocVT() == MVT::i64 && "unexpected calling convention register assignment"); assert(!Ins.empty() && Ins[0].VT == MVT::i64 && "unexpected use of 'returned'"); IsThisReturn = true; } RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); } else { assert(VA.isMemLoc()); SDValue DstAddr; MachinePointerInfo DstInfo; // FIXME: This works on big-endian for composite byvals, which are the // common case. It should also work for fundamental types too. uint32_t BEAlign = 0; unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 : VA.getValVT().getSizeInBits(); OpSize = (OpSize + 7) / 8; if (!Subtarget->isLittleEndian() && !Flags.isByVal() && !Flags.isInConsecutiveRegs()) { if (OpSize < 8) BEAlign = 8 - OpSize; } unsigned LocMemOffset = VA.getLocMemOffset(); int32_t Offset = LocMemOffset + BEAlign; SDValue PtrOff = DAG.getIntPtrConstant(Offset); PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); if (IsTailCall) { Offset = Offset + FPDiff; int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); DstAddr = DAG.getFrameIndex(FI, getPointerTy()); DstInfo = MachinePointerInfo::getFixedStack(FI); // Make sure any stack arguments overlapping with where we're storing // are loaded before this eventual operation. Otherwise they'll be // clobbered. Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); } else { SDValue PtrOff = DAG.getIntPtrConstant(Offset); DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); DstInfo = MachinePointerInfo::getStack(LocMemOffset); } if (Outs[i].Flags.isByVal()) { SDValue SizeNode = DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64); SDValue Cpy = DAG.getMemcpy( Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), /*isVol = */ false, /*AlwaysInline = */ false, DstInfo, MachinePointerInfo()); MemOpChains.push_back(Cpy); } else { // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already // promoted to a legal register type i32, we should truncate Arg back to // i1/i8/i16. if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || VA.getValVT() == MVT::i16) Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); MemOpChains.push_back(Store); } } } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InFlag; for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, RegsToPass[i].second, InFlag); InFlag = Chain.getValue(1); } // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. 
if (getTargetMachine().getCodeModel() == CodeModel::Large && Subtarget->isTargetMachO()) { if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); bool InternalLinkage = GV->hasInternalLinkage(); if (InternalLinkage) Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); else { Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); } } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { const GlobalValue *GV = G->getGlobal(); Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { const char *Sym = S->getSymbol(); Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0); } // We don't usually want to end the call-sequence here because we would tidy // the frame up *after* the call, however in the ABI-changing tail-call case // we've carefully laid out the parameters so that when sp is reset they'll be // in the correct location. if (IsTailCall && !IsSibCall) { Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), DAG.getIntPtrConstant(0, true), InFlag, DL); InFlag = Chain.getValue(1); } std::vector<SDValue> Ops; Ops.push_back(Chain); Ops.push_back(Callee); if (IsTailCall) { // Each tail call may have to adjust the stack by a different amount, so // this information must travel along with the operation for eventual // consumption by emitEpilogue. Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); } // Add argument registers to the end of the list so that they are known live // into the call. for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) Ops.push_back(DAG.getRegister(RegsToPass[i].first, RegsToPass[i].second.getValueType())); // Add a register mask operand representing the call-preserved registers. const uint32_t *Mask; const TargetRegisterInfo *TRI = getTargetMachine().getSubtargetImpl()->getRegisterInfo(); const AArch64RegisterInfo *ARI = static_cast<const AArch64RegisterInfo *>(TRI); if (IsThisReturn) { // For 'this' returns, use the X0-preserving mask if applicable Mask = ARI->getThisReturnPreservedMask(CallConv); if (!Mask) { IsThisReturn = false; Mask = ARI->getCallPreservedMask(CallConv); } } else Mask = ARI->getCallPreservedMask(CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); if (InFlag.getNode()) Ops.push_back(InFlag); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); // If we're doing a tail call, use a TC_RETURN here rather than an // actual call instruction. if (IsTailCall) return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); InFlag = Chain.getValue(1); uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? RoundUpToAlignment(NumBytes, 16) : 0; Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), DAG.getIntPtrConstant(CalleePopBytes, true), InFlag, DL); if (!Ins.empty()) InFlag = Chain.getValue(1); // Handle result values, copying them out of physregs into vregs that we // return. return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, InVals, IsThisReturn, IsThisReturn ?
OutVals[0] : SDValue()); } bool AArch64TargetLowering::CanLowerReturn( CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC); } SDValue AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, SDLoc DL, SelectionDAG &DAG) const { CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS : RetCC_AArch64_AAPCS; SmallVector<CCValAssign, 16> RVLocs; CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC); // Copy the result values into the output registers. SDValue Flag; SmallVector<SDValue, 4> RetOps(1, Chain); for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); ++i, ++realRVLocIdx) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue Arg = OutVals[realRVLocIdx]; switch (VA.getLocInfo()) { default: llvm_unreachable("Unknown loc info!"); case CCValAssign::Full: if (Outs[i].ArgVT == MVT::i1) { // AAPCS requires i1 to be zero-extended to i8 by the producer of the // value. This is strictly redundant on Darwin (which uses "zeroext // i1"), but will be optimised out before ISel. Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); } break; case CCValAssign::BCvt: Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); break; } Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); } //===----------------------------------------------------------------------===// // Other Lowering Code //===----------------------------------------------------------------------===// SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); SDLoc DL(Op); const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GN->getGlobal(); unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && "unexpected offset in global node"); // This also catches the large code model case for Darwin. if ((OpFlags & AArch64II::MO_GOT) != 0) { SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); // FIXME: Once remat is capable of dealing with instructions with register // operands, expand this into two nodes instead of using a wrapper node.
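// The LOADgot node expands to an ADRP + LDR pair against the GOT entry,
// e.g. (ELF syntax): adrp x8, :got:var / ldr x8, [x8, :got_lo12:var].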
return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); } if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) { assert(getTargetMachine().getCodeModel() == CodeModel::Small && "use of MO_CONSTPOOL only supported on small model"); SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr, MachinePointerInfo::getConstantPool(), /*isVolatile=*/ false, /*isNonTemporal=*/ true, /*isInvariant=*/ true, 8); if (GN->getOffset() != 0) return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, DAG.getConstant(GN->getOffset(), PtrVT)); return GlobalAddr; } if (getTargetMachine().getCodeModel() == CodeModel::Large) { const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( AArch64ISD::WrapperLarge, DL, PtrVT, DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); } else { // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and // the only correct model on Darwin. SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags | AArch64II::MO_PAGE); unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); } } /// \brief Convert a TLS address reference into the correct sequence of loads /// and calls to compute the variable's address (for Darwin, currently) and /// return an SDValue containing the final node. /// Darwin only has one TLS scheme which must be capable of dealing with the /// fully general situation, in the worst case. This means: /// + "extern __thread" declaration. /// + Defined in a possibly unknown dynamic library. /// /// The general system is that each __thread variable has a [3 x i64] descriptor /// which contains information used by the runtime to calculate the address. The /// only part of this the compiler needs to know about is the first xword, which /// contains a function pointer that must be called with the address of the /// entire descriptor in "x0". /// /// Since this descriptor may be in a different unit, in general even the /// descriptor must be accessed via an indirect load. The "ideal" code sequence /// is: /// adrp x0, _var@TLVPPAGE /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, /// ; the function pointer /// blr x1 ; Uses descriptor address in x0 /// ; Address of _var is now in x0. /// /// If the address of _var's descriptor *is* known to the linker, then it can /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for /// a slight efficiency gain. 
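/// For example, given "extern __thread int x;", a read of x emits the
/// sequence above followed by "ldr w0, [x0]".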
SDValue AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); SDLoc DL(Op); MVT PtrVT = getPointerTy(); const GlobalValue *GV = cast(Op)->getGlobal(); SDValue TLVPAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); // The first entry in the descriptor is a function pointer that we must call // to obtain the address of the variable. SDValue Chain = DAG.getEntryNode(); SDValue FuncTLVGet = DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), false, true, true, 8); Chain = FuncTLVGet.getValue(1); MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setAdjustsStack(true); // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). const TargetRegisterInfo *TRI = getTargetMachine().getSubtargetImpl()->getRegisterInfo(); const AArch64RegisterInfo *ARI = static_cast(TRI); const uint32_t *Mask = ARI->getTLSCallPreservedMask(); // Finally, we can make the call. This is just a degenerate version of a // normal AArch64 call node: x0 takes the address of the descriptor, and // returns the address of the variable in this thread. Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); Chain = DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), DAG.getRegisterMask(Mask), Chain.getValue(1)); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); } /// When accessing thread-local variables under either the general-dynamic or /// local-dynamic system, we make a "TLS-descriptor" call. The variable will /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry -/// is a function pointer to carry out the resolution. This function takes the -/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All -/// other registers (except LR, NZCV) are preserved. +/// is a function pointer to carry out the resolution. /// -/// Thus, the ideal call sequence on AArch64 is: +/// The sequence is: +/// adrp x0, :tlsdesc:var +/// ldr x1, [x0, #:tlsdesc_lo12:var] +/// add x0, x0, #:tlsdesc_lo12:var +/// .tlsdesccall var +/// blr x1 +/// (TPIDR_EL0 offset now in x0) /// -/// adrp x0, :tlsdesc:thread_var -/// ldr x8, [x0, :tlsdesc_lo12:thread_var] -/// add x0, x0, :tlsdesc_lo12:thread_var -/// .tlsdesccall thread_var -/// blr x8 -/// (TPIDR_EL0 offset now in x0). -/// -/// The ".tlsdesccall" directive instructs the assembler to insert a particular -/// relocation to help the linker relax this sequence if it turns out to be too -/// conservative. -/// -/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this -/// is harmless. -SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, - SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const { +/// The above sequence must be produced unscheduled, to enable the linker to +/// optimize/relax this sequence. +/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the +/// above sequence, and expanded really late in the compilation flow, to ensure +/// the sequence is produced as per above. 
+SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, + SelectionDAG &DAG) const { EVT PtrVT = getPointerTy(); - // The function we need to call is simply the first entry in the GOT for this - // descriptor, load it in preparation. - SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr); + SDValue Chain = DAG.getEntryNode(); + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - // TLS calls preserve all registers except those that absolutely must be - // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be - // silly). - const TargetRegisterInfo *TRI = - getTargetMachine().getSubtargetImpl()->getRegisterInfo(); - const AArch64RegisterInfo *ARI = - static_cast(TRI); - const uint32_t *Mask = ARI->getTLSCallPreservedMask(); - - // The function takes only one argument: the address of the descriptor itself - // in X0. - SDValue Glue, Chain; - Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); - Glue = Chain.getValue(1); - - // We're now ready to populate the argument list, as with a normal call: - SmallVector Ops; + SmallVector Ops; Ops.push_back(Chain); - Ops.push_back(Func); Ops.push_back(SymAddr); - Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT)); - Ops.push_back(DAG.getRegisterMask(Mask)); - Ops.push_back(Glue); - SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); - Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops); - Glue = Chain.getValue(1); + Chain = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, Ops); + SDValue Glue = Chain.getValue(1); return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); } SDValue AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "This function expects an ELF target"); assert(getTargetMachine().getCodeModel() == CodeModel::Small && "ELF TLS only supported in small memory model"); + // Different choices can be made for the maximum size of the TLS area for a + // module. For the small address model, the default TLS size is 16MiB and the + // maximum TLS size is 4GiB. + // FIXME: add -mtls-size command line option and make it control the 16MiB + // vs. 4GiB code sequence generation. 
const GlobalAddressSDNode *GA = cast(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); + if (!EnableAArch64ELFLocalDynamicTLSGeneration) { + if (Model == TLSModel::LocalDynamic) + Model = TLSModel::GeneralDynamic; + } SDValue TPOff; EVT PtrVT = getPointerTy(); SDLoc DL(Op); const GlobalValue *GV = GA->getGlobal(); SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); if (Model == TLSModel::LocalExec) { SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, PtrVT, 0, - AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); + SDValue TPWithOff_lo = + SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, + HiVar, DAG.getTargetConstant(0, MVT::i32)), + 0); + SDValue TPWithOff = + SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, + LoVar, DAG.getTargetConstant(0, MVT::i32)), + 0); + return TPWithOff; } else if (Model == TLSModel::InitialExec) { TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); } else if (Model == TLSModel::LocalDynamic) { // Local-dynamic accesses proceed in two phases. A general-dynamic TLS // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate // the beginning of the module's TLS region, followed by a DTPREL offset // calculation. // These accesses will need deduplicating if there's more than one. AArch64FunctionInfo *MFI = DAG.getMachineFunction().getInfo(); MFI->incNumLocalDynamicTLSAccesses(); - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetExternalSymbol( - "_TLS_MODULE_BASE_", PtrVT, - AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS); // Now we can calculate the offset from TPIDR_EL0 to this module's // thread-local area. - TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); + TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); // Now use :dtprel_whatever: operations to calculate this variable's offset // in its thread-storage area. 
SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); SDValue LoVar = DAG.getTargetGlobalAddress( GV, DL, MVT::i64, 0, - AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); - - SDValue DTPOff = - SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, - DAG.getTargetConstant(16, MVT::i32)), - 0); - DTPOff = - SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar, - DAG.getTargetConstant(0, MVT::i32)), - 0); - - TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff); - } else if (Model == TLSModel::GeneralDynamic) { - // Accesses used in this sequence go via the TLS descriptor which lives in - // the GOT. Prepare an address we can use to handle this. - SDValue HiDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE); - SDValue LoDesc = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - // First argument to the descriptor call is the address of the descriptor - // itself. - SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); - DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); - + TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar, + DAG.getTargetConstant(0, MVT::i32)), + 0); + } else if (Model == TLSModel::GeneralDynamic) { // The call needs a relocation too for linker relaxation. It doesn't make // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of // the address. SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); // Finally we can make a call to calculate the offset from tpidr_el0. - TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); + TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG); } else llvm_unreachable("Unsupported ELF TLS access model"); return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); } SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->isTargetDarwin()) return LowerDarwinGlobalTLSAddress(Op, DAG); else if (Subtarget->isTargetELF()) return LowerELFGlobalTLSAddress(Op, DAG); llvm_unreachable("Unexpected platform trying to use TLS"); } SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { SDValue Chain = Op.getOperand(0); ISD::CondCode CC = cast(Op.getOperand(1))->get(); SDValue LHS = Op.getOperand(2); SDValue RHS = Op.getOperand(3); SDValue Dest = Op.getOperand(4); SDLoc dl(Op); // Handle f128 first, since lowering it will result in comparing the return // value of a libcall against zero, which is just what the rest of LowerBR_CC // is expecting to deal with. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { RHS = DAG.getConstant(0, LHS.getValueType()); CC = ISD::SETNE; } } // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch // instruction. 
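// For example, a branch on the overflow bit of llvm.sadd.with.overflow
// becomes an ADDS followed by B.VS (or B.VC for the inverted test), using
// the flags directly rather than materializing the i1 result.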
unsigned Opc = LHS.getOpcode(); if (LHS.getResNo() == 1 && isa(RHS) && cast(RHS)->isOne() && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unexpected condition code."); // Only lower legal XALUO ops. if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) return SDValue(); // The actual operation with overflow check. AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); if (CC == ISD::SETNE) OFCC = getInvertedCondCode(OFCC); SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, CCVal, Overflow); } if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); // If the RHS of the comparison is zero, we can potentially fold this // to a specialized branch. const ConstantSDNode *RHSC = dyn_cast(RHS); if (RHSC && RHSC->getZExtValue() == 0) { if (CC == ISD::SETEQ) { // See if we can use a TBZ to fold in an AND as well. // TBZ has a smaller branch displacement than CBZ. If the offset is // out of bounds, a late MI-layer pass rewrites branches. // 403.gcc is an example that hits this case. if (LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); } return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); } else if (CC == ISD::SETNE) { // See if we can use a TBZ to fold in an AND as well. // TBZ has a smaller branch displacement than CBZ. If the offset is // out of bounds, a late MI-layer pass rewrites branches. // 403.gcc is an example that hits this case. if (LHS.getOpcode() == ISD::AND && isa(LHS.getOperand(1)) && isPowerOf2_64(LHS.getConstantOperandVal(1))) { SDValue Test = LHS.getOperand(0); uint64_t Mask = LHS.getConstantOperandVal(1); return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); } return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(Mask, MVT::i64), Dest); } } if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && LHS.getOpcode() != ISD::AND) { // Don't combine AND since emitComparison converts the AND to an ANDS // (a.k.a. TST) and the test in the test bit and branch instruction // becomes redundant. This would also increase register pressure. 
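// For example, (br (setgt x, -1), dest) on i64 becomes "tbz x, #63, dest":
// x > -1 exactly when the sign bit is clear.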
uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, DAG.getConstant(Mask, MVT::i64), Dest); } SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); } assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); SDValue BR1 = DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); if (CC2 != AArch64CC::AL) { SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, Cmp); } return BR1; } SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue In1 = Op.getOperand(0); SDValue In2 = Op.getOperand(1); EVT SrcVT = In2.getValueType(); if (SrcVT != VT) { if (SrcVT == MVT::f32 && VT == MVT::f64) In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); else if (SrcVT == MVT::f64 && VT == MVT::f32) In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0)); else // FIXME: Src type is different, bail out for now. Can VT really be a // vector type? return SDValue(); } EVT VecVT; EVT EltVT; SDValue EltMask, VecVal1, VecVal2; if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { EltVT = MVT::i32; VecVT = MVT::v4i32; EltMask = DAG.getConstant(0x80000000ULL, EltVT); if (!VT.isVector()) { VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, DAG.getUNDEF(VecVT), In1); VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); } } else if (VT == MVT::f64 || VT == MVT::v2f64) { EltVT = MVT::i64; VecVT = MVT::v2i64; // We want to materialize a mask with the high bit set, but the AdvSIMD // immediate moves cannot materialize that in a single instruction for // 64-bit elements. Instead, materialize zero and then negate it. EltMask = DAG.getConstant(0, EltVT); if (!VT.isVector()) { VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, DAG.getUNDEF(VecVT), In1); VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, DAG.getUNDEF(VecVT), In2); } else { VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); } } else { llvm_unreachable("Invalid type for copysign!"); } std::vector<SDValue> BuildVectorOps; for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) BuildVectorOps.push_back(EltMask); SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps); // If we couldn't materialize the mask above, then the mask vector will be // the zero vector, and we need to negate it here.
if (VT == MVT::f64 || VT == MVT::v2f64) { BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); } SDValue Sel = DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); if (VT == MVT::f32) return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); else if (VT == MVT::f64) return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); else return DAG.getNode(ISD::BITCAST, DL, VT, Sel); } SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) return SDValue(); if (!Subtarget->hasNEON()) return SDValue(); // While there is no integer popcount instruction, it can // be more efficiently lowered to the following sequence that uses // AdvSIMD registers/instructions as long as the copies to/from // the AdvSIMD registers are cheap. // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd // CNT V0.8B, V0.8B // 8xbyte pop-counts // ADDV B0, V0.8B // sum 8xbyte pop-counts // UMOV X0, V0.B[0] // copy byte result back to integer reg SDValue Val = Op.getOperand(0); SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); SDValue VecVal; if (VT == MVT::i32) { VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec, VecVal); } else { VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); } SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); SDValue UaddLV = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop); if (VT == MVT::i64) UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); return UaddLV; } SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDLoc dl(Op); // We chose ZeroOrOneBooleanContents, so use zero and one. EVT VT = Op.getValueType(); SDValue TVal = DAG.getConstant(1, VT); SDValue FVal = DAG.getConstant(0, VT); // Handle f128 first, since one possible outcome is a normal integer // comparison which gets picked up by the next if statement. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, use it. if (!RHS.getNode()) { assert(LHS.getValueType() == Op.getValueType() && "Unexpected setcc expansion!"); return LHS; } } if (LHS.getValueType().isInteger()) { SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); } // Now we know we're dealing with FP values. assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead // and do the comparison. 
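// For example, SETUEQ maps to (EQ, VS): true if the operands are equal or
// unordered, so the two conditions are combined with a second CSEL below.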
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); if (CC2 == AArch64CC::AL) { changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. This will allow the setcc to be // matched to a single CSINC instruction. return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); } else { // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't // totally clean. Some of them require two CSELs to implement. As is in // this case, we emit the first CSEL and then emit a second using the output // of the first as the RHS. We're effectively OR'ing the two CC's together. // FIXME: It would be nice if we could match the two CSELs to two CSINCs. SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } } /// A SELECT_CC operation is really some kind of max or min if both values being /// compared are, in some sense, equal to the results in either case. However, /// it is permissible to compare f32 values and produce directly extended f64 /// values. /// /// Extending the comparison operands would also be allowed, but is less likely /// to happen in practice since their use is right here. Note that truncate /// operations would *not* be semantically equivalent. static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { if (Cmp == Result) return true; ConstantFPSDNode *CCmp = dyn_cast(Cmp); ConstantFPSDNode *CResult = dyn_cast(Result); if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && Result.getValueType() == MVT::f64) { bool Lossy; APFloat CmpVal = CCmp->getValueAPF(); CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); return CResult->getValueAPF().bitwiseIsEqual(CmpVal); } return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; } SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { SDValue CC = Op->getOperand(0); SDValue TVal = Op->getOperand(1); SDValue FVal = Op->getOperand(2); SDLoc DL(Op); unsigned Opc = CC.getOpcode(); // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select // instruction. if (CC.getResNo() == 1 && (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { // Only lower legal XALUO ops. 
if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0))) return SDValue(); AArch64CC::CondCode OFCC; SDValue Value, Overflow; std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG); SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal, Overflow); } if (CC.getOpcode() == ISD::SETCC) return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal, cast<CondCodeSDNode>(CC.getOperand(2))->get()); else return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal, FVal, ISD::SETNE); } SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); SDValue TVal = Op.getOperand(2); SDValue FVal = Op.getOperand(3); SDLoc dl(Op); // Handle f128 first, because it will result in a comparison of some RTLIB // call result against zero. if (LHS.getValueType() == MVT::f128) { softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); // If softenSetCCOperands returned a scalar, we need to compare the result // against zero to select between true and false values. if (!RHS.getNode()) { RHS = DAG.getConstant(0, LHS.getValueType()); CC = ISD::SETNE; } } // Handle integers first. if (LHS.getValueType().isInteger()) { assert((LHS.getValueType() == RHS.getValueType()) && (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); unsigned Opcode = AArch64ISD::CSEL; // If both the TVal and the FVal are constants, see if we can swap them in // order to form a CSINV or CSINC out of them. ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } else if (TVal.getOpcode() == ISD::XOR) { // If TVal is a NOT we want to swap TVal and FVal so that we can match // with a CSINV rather than a CSEL. ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1)); if (CVal && CVal->isAllOnesValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } } else if (TVal.getOpcode() == ISD::SUB) { // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so // that we can match with a CSNEG rather than a CSEL. ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0)); if (CVal && CVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } } else if (CTVal && CFVal) { const int64_t TrueVal = CTVal->getSExtValue(); const int64_t FalseVal = CFVal->getSExtValue(); bool Swap = false; // If both TVal and FVal are constants, see if FVal is the // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC // instead of a CSEL in that case. if (TrueVal == ~FalseVal) { Opcode = AArch64ISD::CSINV; } else if (TrueVal == -FalseVal) { Opcode = AArch64ISD::CSNEG; } else if (TVal.getValueType() == MVT::i32) { // If our operands are only 32-bit wide, make sure we use 32-bit // arithmetic for the check whether we can use CSINC. This ensures that // the addition in the check will wrap around properly in case there is // an overflow (which would not be the case if we do the check with // 64-bit arithmetic).
const uint32_t TrueVal32 = CTVal->getZExtValue(); const uint32_t FalseVal32 = CFVal->getZExtValue(); if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { Opcode = AArch64ISD::CSINC; if (TrueVal32 > FalseVal32) { Swap = true; } } // 64-bit check whether we can use CSINC. } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { Opcode = AArch64ISD::CSINC; if (TrueVal > FalseVal) { Swap = true; } } // Swap TVal and FVal if necessary. if (Swap) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); CC = ISD::getSetCCInverse(CC, true); } if (Opcode != AArch64ISD::CSEL) { // Drop FVal since we can get its value by simply inverting/negating // TVal. FVal = TVal; } } SDValue CCVal; SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); EVT VT = Op.getValueType(); return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); } // Now we know we're dealing with FP values. assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); assert(LHS.getValueType() == RHS.getValueType()); EVT VT = Op.getValueType(); // Try to match this select into a max/min operation, which have dedicated // opcode in the instruction set. // FIXME: This is not correct in the presence of NaNs, so we only enable this // in no-NaNs mode. if (getTargetMachine().Options.NoNaNsFPMath) { SDValue MinMaxLHS = TVal, MinMaxRHS = FVal; if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) && selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) { CC = ISD::getSetCCSwappedOperands(CC); std::swap(MinMaxLHS, MinMaxRHS); } if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) && selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) { switch (CC) { default: break; case ISD::SETGT: case ISD::SETGE: case ISD::SETUGT: case ISD::SETUGE: case ISD::SETOGT: case ISD::SETOGE: return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS); break; case ISD::SETLT: case ISD::SETLE: case ISD::SETULT: case ISD::SETULE: case ISD::SETOLT: case ISD::SETOLE: return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS); break; } } } // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead // and do the comparison. SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two CSELs to implement. AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); // If we need a second CSEL, emit it, using the output of the first as the // RHS. We're effectively OR'ing the two CC's together. if (CC2 != AArch64CC::AL) { SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); } // Otherwise, return the output of the first CSEL. return CS1; } SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { // Jump table entries as PC relative offsets. No additional tweaking // is necessary here. Just get the address of the jump table. 
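// For the small code model this yields, e.g.:
//   adrp x8, .LJTI0_0
//   add  x8, x8, :lo12:.LJTI0_0
// (.LJTI0_0 being the jump table's local label).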
JumpTableSDNode *JT = cast(Op); EVT PtrVT = getPointerTy(); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( AArch64ISD::WrapperLarge, DL, PtrVT, DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G0 | MO_NC)); } SDValue Hi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); } SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { ConstantPoolSDNode *CP = cast(Op); EVT PtrVT = getPointerTy(); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large) { // Use the GOT for the large code model on iOS. if (Subtarget->isTargetMachO()) { SDValue GotAddr = DAG.getTargetConstantPool( CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_GOT); return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); } const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( AArch64ISD::WrapperLarge, DL, PtrVT, DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_G3), DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_G2 | MO_NC), DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_G1 | MO_NC), DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_G0 | MO_NC)); } else { // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on // ELF, the only valid one on Darwin. 
SDValue Hi = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_PAGE); SDValue Lo = DAG.getTargetConstantPool( CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); } } SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const { const BlockAddress *BA = cast(Op)->getBlockAddress(); EVT PtrVT = getPointerTy(); SDLoc DL(Op); if (getTargetMachine().getCodeModel() == CodeModel::Large && !Subtarget->isTargetMachO()) { const unsigned char MO_NC = AArch64II::MO_NC; return DAG.getNode( AArch64ISD::WrapperLarge, DL, PtrVT, DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); } else { SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC); SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); } } SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const { AArch64FunctionInfo *FuncInfo = DAG.getMachineFunction().getInfo(); SDLoc DL(Op); SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); const Value *SV = cast(Op.getOperand(2))->getValue(); return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), MachinePointerInfo(SV), false, false, 0); } SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call // Standard, section B.3. 
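// In C terms, with the offsets used below:
//   struct va_list {
//     void *__stack;   // offset 0: next stacked argument
//     void *__gr_top;  // offset 8: one past the saved GP argument registers
//     void *__vr_top;  // offset 16: one past the saved FP/SIMD argument regs
//     int   __gr_offs; // offset 24: negative offset from __gr_top
//     int   __vr_offs; // offset 28: negative offset from __vr_top
//   };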
MachineFunction &MF = DAG.getMachineFunction(); AArch64FunctionInfo *FuncInfo = MF.getInfo(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue VAList = Op.getOperand(1); const Value *SV = cast(Op.getOperand(2))->getValue(); SmallVector MemOps; // void *__stack at offset 0 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, MachinePointerInfo(SV), false, false, 8)); // void *__gr_top at offset 8 int GPRSize = FuncInfo->getVarArgsGPRSize(); if (GPRSize > 0) { SDValue GRTop, GRTopAddr; GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(8, getPointerTy())); GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, DAG.getConstant(GPRSize, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, MachinePointerInfo(SV, 8), false, false, 8)); } // void *__vr_top at offset 16 int FPRSize = FuncInfo->getVarArgsFPRSize(); if (FPRSize > 0) { SDValue VRTop, VRTopAddr; VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(16, getPointerTy())); VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, DAG.getConstant(FPRSize, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, MachinePointerInfo(SV, 16), false, false, 8)); } // int __gr_offs at offset 24 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(24, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), GROffsAddr, MachinePointerInfo(SV, 24), false, false, 4)); // int __vr_offs at offset 28 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(28, getPointerTy())); MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), VROffsAddr, MachinePointerInfo(SV, 28), false, false, 4)); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); } SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) : LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const { // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; const Value *DestSV = cast(Op.getOperand(3))->getValue(); const Value *SrcSV = cast(Op.getOperand(4))->getValue(); return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), 8, false, false, MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV)); } SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetDarwin() && "automatic va_arg instruction only works on Darwin"); const Value *V = cast(Op.getOperand(2))->getValue(); EVT VT = Op.getValueType(); SDLoc DL(Op); SDValue Chain = Op.getOperand(0); SDValue Addr = Op.getOperand(1); unsigned Align = Op.getConstantOperandVal(3); SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, MachinePointerInfo(V), false, false, false, 0); Chain = VAList.getValue(1); if (Align > 8) { assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(Align - 1, getPointerTy())); VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, DAG.getConstant(-(int64_t)Align, getPointerTy())); } Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); // Scalar integer and FP values smaller than 64 bits are implicitly extended // up to 64 bits. At the very least, we have to increase the striding of the // vaargs list to match this, and for FP values we need to introduce // FP_ROUND nodes as well. if (VT.isInteger() && !VT.isVector()) ArgSize = 8; bool NeedFPTrunc = false; if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { ArgSize = 8; NeedFPTrunc = true; } // Increment the pointer, VAList, to the next vaarg SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, DAG.getConstant(ArgSize, getPointerTy())); // Store the incremented VAList to the legalized pointer SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), false, false, 0); // Load the actual argument out of the pointer VAList if (NeedFPTrunc) { // Load the value as an f64. SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo(), false, false, false, 0); // Round the value down to an f32. SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), DAG.getIntPtrConstant(1)); SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; // Merge the rounded value with the chain output of the load. return DAG.getMergeValues(Ops, DL); } return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, false, false, 0); } SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); MFI->setFrameAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast(Op.getOperand(0))->getZExtValue(); SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, MachinePointerInfo(), false, false, false, 0); return FrameAddr; } // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. 
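// This hook backs the llvm.read_register / llvm.write_register intrinsics,
// which name the register via metadata; only "sp" is recognized here.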
unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT) const { unsigned Reg = StringSwitch<unsigned>(RegName) .Case("sp", AArch64::SP) .Default(0); if (Reg) return Reg; report_fatal_error("Invalid register name global variable"); } SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); MFI->setReturnAddressIsTaken(true); EVT VT = Op.getValueType(); SDLoc DL(Op); unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); if (Depth) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = DAG.getConstant(8, getPointerTy()); return DAG.getLoad(VT, DL, DAG.getEntryNode(), DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Return LR, which contains the return address. Mark it an implicit live-in. unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } /// LowerShiftRightParts - Lower SRA_PARTS, which returns two /// i64 values and takes a 2 x i64 value to shift plus a shift amount. SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); SDValue ARMcc; unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, MVT::i64), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, MVT::i64)); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), ISD::SETGE, dl, DAG); SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); // AArch64 shifts larger than the register width are wrapped rather than // clamped, so we can't just emit "hi >> x". SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); SDValue TrueValHi = Opc == ISD::SRA ? DAG.getNode(Opc, dl, VT, ShOpHi, DAG.getConstant(VTBits - 1, MVT::i64)) : DAG.getConstant(0, VT); SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two /// i64 values and takes a 2 x i64 value to shift plus a shift amount.
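/// For example, these nodes arise when type legalization expands an i128
/// shift into operations on two i64 halves.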
SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 3 && "Not a double-shift!"); EVT VT = Op.getValueType(); unsigned VTBits = VT.getSizeInBits(); SDLoc dl(Op); SDValue ShOpLo = Op.getOperand(0); SDValue ShOpHi = Op.getOperand(1); SDValue ShAmt = Op.getOperand(2); SDValue ARMcc; assert(Op.getOpcode() == ISD::SHL_PARTS); SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, DAG.getConstant(VTBits, MVT::i64), ShAmt); SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, DAG.getConstant(VTBits, MVT::i64)); SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), ISD::SETGE, dl, DAG); SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); // AArch64 shifts of larger than register sizes are wrapped rather than // clamped, so we can't just emit "lo << a" if a is too big. SDValue TrueValLo = DAG.getConstant(0, VT); SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); SDValue Ops[2] = { Lo, Hi }; return DAG.getMergeValues(Ops, dl); } bool AArch64TargetLowering::isOffsetFoldingLegal( const GlobalAddressSDNode *GA) const { // The AArch64 target doesn't support folding offsets into global addresses. return false; } bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. // FIXME: We should be able to handle f128 as well with a clever lowering. if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) return true; if (VT == MVT::f64) return AArch64_AM::getFP64Imm(Imm) != -1; else if (VT == MVT::f32) return AArch64_AM::getFP32Imm(Imm) != -1; return false; } //===----------------------------------------------------------------------===// // AArch64 Optimization Hooks //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // AArch64 Inline Assembly Support //===----------------------------------------------------------------------===// // Table of Constraints // TODO: This is the current set of constraints supported by ARM for the // compiler, not all of them may make sense, e.g. S may be difficult to support. // // r - A general register // w - An FP/SIMD register of some size in the range v0-v31 // x - An FP/SIMD register of some size in the range v0-v15 // I - Constant that can be used with an ADD instruction // J - Constant that can be used with a SUB instruction // K - Constant that can be used with a 32-bit logical instruction // L - Constant that can be used with a 64-bit logical instruction // M - Constant that can be used as a 32-bit MOV immediate // N - Constant that can be used as a 64-bit MOV immediate // Q - A memory reference with base register and no offset // S - A symbolic address // Y - Floating point constant zero // Z - Integer constant zero // // Note that general register operands will be output using their 64-bit x // register name, whatever the size of the variable, unless the asm operand // is prefixed by the %w modifier. 
Floating-point and SIMD register operands // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or // %q modifier. /// getConstraintType - Given a constraint letter, return the type of /// constraint it is for this target. AArch64TargetLowering::ConstraintType AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; case 'z': return C_Other; case 'x': case 'w': return C_RegisterClass; // An address with a single base register. Due to the way we // currently handle addresses it is the same as 'r'. case 'Q': return C_Memory; } } return TargetLowering::getConstraintType(Constraint); } /// Examine constraint type and operand type and determine a weight value. /// This object must already have been set up with the operand type /// and the current alternative constraint selected. TargetLowering::ConstraintWeight AArch64TargetLowering::getSingleConstraintMatchWeight( AsmOperandInfo &info, const char *constraint) const { ConstraintWeight weight = CW_Invalid; Value *CallOperandVal = info.CallOperandVal; // If we don't have a value, we can't do a match, // but allow it at the lowest weight. if (!CallOperandVal) return CW_Default; Type *type = CallOperandVal->getType(); // Look at the constraint type. switch (*constraint) { default: weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); break; case 'x': case 'w': if (type->isFloatingPointTy() || type->isVectorTy()) weight = CW_Register; break; case 'z': weight = CW_Constant; break; } return weight; } std::pair AArch64TargetLowering::getRegForInlineAsmConstraint( const std::string &Constraint, MVT VT) const { if (Constraint.size() == 1) { switch (Constraint[0]) { case 'r': if (VT.getSizeInBits() == 64) return std::make_pair(0U, &AArch64::GPR64commonRegClass); return std::make_pair(0U, &AArch64::GPR32commonRegClass); case 'w': if (VT == MVT::f32) return std::make_pair(0U, &AArch64::FPR32RegClass); if (VT.getSizeInBits() == 64) return std::make_pair(0U, &AArch64::FPR64RegClass); if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128RegClass); break; // The instructions that this constraint is designed for can // only take 128-bit registers so just use that regclass. case 'x': if (VT.getSizeInBits() == 128) return std::make_pair(0U, &AArch64::FPR128_loRegClass); break; } } if (StringRef("{cc}").equals_lower(Constraint)) return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); // Use the default implementation in TargetLowering to convert the register // constraint into a member of a register class. std::pair Res; Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); // Not found as a standard register? if (!Res.second) { unsigned Size = Constraint.size(); if ((Size == 4 || Size == 5) && Constraint[0] == '{' && tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { const std::string Reg = std::string(&Constraint[2], &Constraint[Size - 1]); int RegNo = atoi(Reg.c_str()); if (RegNo >= 0 && RegNo <= 31) { // v0 - v31 are aliases of q0 - q31. // By default we'll emit v0-v31 for this unless there's a modifier where // we'll emit the correct register as well. Res.first = AArch64::FPR128RegClass.getRegister(RegNo); Res.second = &AArch64::FPR128RegClass; } } } return Res; } /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops /// vector. If it is invalid, don't add anything to Ops. 
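//
// Illustrative note (editorial addition, not part of the upstream source):
// the explicit "{vN}" handling in getRegForInlineAsmConstraint above only
// fires when the default lookup found no register, e.g.
//   "{v7}"  -> Size == 4, digits "7",  RegNo 7  -> FPR128, register v7
//   "{v31}" -> Size == 5, digits "31", RegNo 31 -> FPR128, register v31
// while names outside v0-v31 (for example "{v32}") leave Res unchanged, so
// no register class is assigned and the constraint is rejected.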
void AArch64TargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const { SDValue Result; // Currently only support length 1 constraints. if (Constraint.length() != 1) return; char ConstraintLetter = Constraint[0]; switch (ConstraintLetter) { default: break; // This set of constraints deal with valid constants for various instructions. // Validate and return a target constant for them if we can. case 'z': { // 'z' maps to xzr or wzr so it needs an input of 0. ConstantSDNode *C = dyn_cast(Op); if (!C || C->getZExtValue() != 0) return; if (Op.getValueType() == MVT::i64) Result = DAG.getRegister(AArch64::XZR, MVT::i64); else Result = DAG.getRegister(AArch64::WZR, MVT::i32); break; } case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': ConstantSDNode *C = dyn_cast(Op); if (!C) return; // Grab the value and do some validation. uint64_t CVal = C->getZExtValue(); switch (ConstraintLetter) { // The I constraint applies only to simple ADD or SUB immediate operands: // i.e. 0 to 4095 with optional shift by 12 // The J constraint applies only to ADD or SUB immediates that would be // valid when negated, i.e. if [an add pattern] were to be output as a SUB // instruction [or vice versa], in other words -1 to -4095 with optional // left shift by 12. case 'I': if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) break; return; case 'J': { uint64_t NVal = -C->getSExtValue(); if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { CVal = C->getSExtValue(); break; } return; } // The K and L constraints apply *only* to logical immediates, including // what used to be the MOVI alias for ORR (though the MOVI alias has now // been removed and MOV should be used). So these constraints have to // distinguish between bit patterns that are valid 32-bit or 64-bit // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice // versa. case 'K': if (AArch64_AM::isLogicalImmediate(CVal, 32)) break; return; case 'L': if (AArch64_AM::isLogicalImmediate(CVal, 64)) break; return; // The M and N constraints are a superset of K and L respectively, for use // with the MOV (immediate) alias. As well as the logical immediates they // also match 32 or 64-bit immediates that can be loaded either using a // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca // (M) or 64-bit 0x1234000000000000 (N) etc. // As a note some of this code is liberally stolen from the asm parser. case 'M': { if (!isUInt<32>(CVal)) return; if (AArch64_AM::isLogicalImmediate(CVal, 32)) break; if ((CVal & 0xFFFF) == CVal) break; if ((CVal & 0xFFFF0000ULL) == CVal) break; uint64_t NCVal = ~(uint32_t)CVal; if ((NCVal & 0xFFFFULL) == NCVal) break; if ((NCVal & 0xFFFF0000ULL) == NCVal) break; return; } case 'N': { if (AArch64_AM::isLogicalImmediate(CVal, 64)) break; if ((CVal & 0xFFFFULL) == CVal) break; if ((CVal & 0xFFFF0000ULL) == CVal) break; if ((CVal & 0xFFFF00000000ULL) == CVal) break; if ((CVal & 0xFFFF000000000000ULL) == CVal) break; uint64_t NCVal = ~CVal; if ((NCVal & 0xFFFFULL) == NCVal) break; if ((NCVal & 0xFFFF0000ULL) == NCVal) break; if ((NCVal & 0xFFFF00000000ULL) == NCVal) break; if ((NCVal & 0xFFFF000000000000ULL) == NCVal) break; return; } default: return; } // All assembler immediates are 64-bit integers. 
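  // Illustrative examples (editorial addition, not part of the upstream
  // source) for the immediate constraints validated above:
  //   'I': 4095 and 0xFFF000 (4095 << 12) are accepted; 4097 is not, since
  //        it needs bits both below and above the 12-bit boundary.
  //   'J': -4095 is accepted because its negation is a valid ADD immediate.
  //   'M': 0x12340000 is accepted (a single MOVZ with shift 16), while
  //        0x12345678 is rejected (it would need MOVZ + MOVK).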
Result = DAG.getTargetConstant(CVal, MVT::i64); break; } if (Result.getNode()) { Ops.push_back(Result); return; } return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); } //===----------------------------------------------------------------------===// // AArch64 Advanced SIMD Support //===----------------------------------------------------------------------===// /// WidenVector - Given a value in the V64 register class, produce the /// equivalent value in the V128 register class. static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { EVT VT = V64Reg.getValueType(); unsigned NarrowSize = VT.getVectorNumElements(); MVT EltTy = VT.getVectorElementType().getSimpleVT(); MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); SDLoc DL(V64Reg); return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), V64Reg, DAG.getConstant(0, MVT::i32)); } /// getExtFactor - Determine the adjustment factor for the position when /// generating an "extract from vector registers" instruction. static unsigned getExtFactor(SDValue &V) { EVT EltType = V.getValueType().getVectorElementType(); return EltType.getSizeInBits() / 8; } /// NarrowVector - Given a value in the V128 register class, produce the /// equivalent value in the V64 register class. static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { EVT VT = V128Reg.getValueType(); unsigned WideSize = VT.getVectorNumElements(); MVT EltTy = VT.getVectorElementType().getSimpleVT(); MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); SDLoc DL(V128Reg); return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); } // Gather data to see if the operation can be modelled as a // shuffle in combination with VEXTs. SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); unsigned NumElts = VT.getVectorNumElements(); struct ShuffleSourceInfo { SDValue Vec; unsigned MinElt; unsigned MaxElt; // We may insert some combination of BITCASTs and VEXT nodes to force Vec to // be compatible with the shuffle we intend to construct. As a result // ShuffleVec will be some sliding window into the original Vec. SDValue ShuffleVec; // Code should guarantee that element i in Vec starts at element "WindowBase // + i * WindowScale in ShuffleVec". int WindowBase; int WindowScale; bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } ShuffleSourceInfo(SDValue Vec) : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), WindowScale(1) {} }; // First gather all vectors used as an immediate source for this BUILD_VECTOR // node. SmallVector Sources; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { // A shuffle can only come from building a vector from various // elements of other vectors. return SDValue(); } // Add this element source to the list if it's not already there. SDValue SourceVec = V.getOperand(0); auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); if (Source == Sources.end()) Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); // Update the minimum and maximum lane number seen. 
unsigned EltNo = cast(V.getOperand(1))->getZExtValue(); Source->MinElt = std::min(Source->MinElt, EltNo); Source->MaxElt = std::max(Source->MaxElt, EltNo); } // Currently only do something sane when at most two source vectors // are involved. if (Sources.size() > 2) return SDValue(); // Find out the smallest element size among result and two sources, and use // it as element size to build the shuffle_vector. EVT SmallestEltTy = VT.getVectorElementType(); for (auto &Source : Sources) { EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); if (SrcEltTy.bitsLT(SmallestEltTy)) { SmallestEltTy = SrcEltTy; } } unsigned ResMultiplier = VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); // If the source vector is too wide or too narrow, we may nevertheless be able // to construct a compatible shuffle either by concatenating it with UNDEF or // extracting a suitable range of elements. for (auto &Src : Sources) { EVT SrcVT = Src.ShuffleVec.getValueType(); if (SrcVT.getSizeInBits() == VT.getSizeInBits()) continue; // This stage of the search produces a source with the same element type as // the original, but with a total width matching the BUILD_VECTOR output. EVT EltVT = SrcVT.getVectorElementType(); unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); // We can pad out the smaller vector for free, so if it's part of a // shuffle... Src.ShuffleVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, DAG.getUNDEF(Src.ShuffleVec.getValueType())); continue; } assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); if (Src.MaxElt - Src.MinElt >= NumSrcElts) { // Span too large for a VEXT to cope return SDValue(); } if (Src.MinElt >= NumSrcElts) { // The extraction can just take the second half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, MVT::i64)); Src.WindowBase = -NumSrcElts; } else if (Src.MaxElt < NumSrcElts) { // The extraction can just take the first half Src.ShuffleVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, MVT::i64)); } else { // An actual VEXT is needed SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(0, MVT::i64)); SDValue VEXTSrc2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, DAG.getConstant(NumSrcElts, MVT::i64)); unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, VEXTSrc2, DAG.getConstant(Imm, MVT::i32)); Src.WindowBase = -Src.MinElt; } } // Another possible incompatibility occurs from the vector element types. We // can fix this by bitcasting the source vectors to the same type we intend // for the shuffle. for (auto &Src : Sources) { EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); if (SrcEltTy == SmallestEltTy) continue; assert(ShuffleVT.getVectorElementType() == SmallestEltTy); Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); Src.WindowBase *= Src.WindowScale; } // Final sanity check before we try to actually produce a shuffle. 
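  // Illustrative example (editorial addition, not part of the upstream
  // source) of the Window bookkeeping above: element i of a source ends up
  // at lane WindowBase + i * WindowScale of its ShuffleVec.  A v2i64 source
  // feeding a v8i16-typed shuffle gets WindowScale = 64/16 = 4, and a source
  // whose second half was taken via EXTRACT_SUBVECTOR carries a negative
  // WindowBase so that its original element numbers still map correctly.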
DEBUG( for (auto Src : Sources) assert(Src.ShuffleVec.getValueType() == ShuffleVT); ); // The stars all align, our next step is to produce the mask for the shuffle. SmallVector Mask(ShuffleVT.getVectorNumElements(), -1); int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { SDValue Entry = Op.getOperand(i); if (Entry.getOpcode() == ISD::UNDEF) continue; auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); int EltNo = cast(Entry.getOperand(1))->getSExtValue(); // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit // trunc. So only std::min(SrcBits, DestBits) actually get defined in this // segment. EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); int BitsDefined = std::min(OrigEltTy.getSizeInBits(), VT.getVectorElementType().getSizeInBits()); int LanesDefined = BitsDefined / BitsPerShuffleLane; // This source is expected to fill ResMultiplier lanes of the final shuffle, // starting at the appropriate offset. int *LaneMask = &Mask[i * ResMultiplier]; int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; ExtractBase += NumElts * (Src - Sources.begin()); for (int j = 0; j < LanesDefined; ++j) LaneMask[j] = ExtractBase + j; } // Final check before we try to produce nonsense... if (!isShuffleMaskLegal(Mask, ShuffleVT)) return SDValue(); SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; for (unsigned i = 0; i < Sources.size(); ++i) ShuffleOps[i] = Sources[i].ShuffleVec; SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], ShuffleOps[1], &Mask[0]); return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); } // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are the same. static bool isSingletonEXTMask(ArrayRef M, EVT VT, unsigned &Imm) { unsigned NumElts = VT.getVectorNumElements(); // Assume that the first shuffle index is not UNDEF. Fail if it is. if (M[0] < 0) return false; Imm = M[0]; // If this is a VEXT shuffle, the immediate value is the index of the first // element. The other shuffle indices must be the successive elements after // the first one. unsigned ExpectedElt = Imm; for (unsigned i = 1; i < NumElts; ++i) { // Increment the expected index. If it wraps around, just follow it // back to index zero and keep going. ++ExpectedElt; if (ExpectedElt == NumElts) ExpectedElt = 0; if (M[i] < 0) continue; // ignore UNDEF indices if (ExpectedElt != static_cast(M[i])) return false; } return true; } // check if an EXT instruction can handle the shuffle mask when the // vector sources of the shuffle are different. static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, unsigned &Imm) { // Look for the first non-undef element. const int *FirstRealElt = std::find_if(M.begin(), M.end(), [](int Elt) {return Elt >= 0;}); // Benefit form APInt to handle overflow when calculating expected element. unsigned NumElts = VT.getVectorNumElements(); unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); // The following shuffle indices must be the successive elements after the // first real element. const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); if (FirstWrongElt != M.end()) return false; // The index of an EXT is the first element if it is not UNDEF. // Watch out for the beginning UNDEFs. 
The EXT index should be the expected // value of the first element. E.g. // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. // ExpectedElt is the last mask index plus 1. Imm = ExpectedElt.getZExtValue(); // There are two difference cases requiring to reverse input vectors. // For example, for vector <4 x i32> we have the following cases, // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) // For both cases, we finally use mask <5, 6, 7, 0>, which requires // to reverse two input vectors. if (Imm < NumElts) ReverseEXT = true; else Imm -= NumElts; return true; } /// isREVMask - Check if a vector shuffle corresponds to a REV /// instruction with the specified blocksize. (The order of the elements /// within each block of the vector is reversed.) static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && "Only possible block sizes for REV are: 16, 32, 64"); unsigned EltSz = VT.getVectorElementType().getSizeInBits(); if (EltSz == 64) return false; unsigned NumElts = VT.getVectorNumElements(); unsigned BlockElts = M[0] + 1; // If the first shuffle index is UNDEF, be optimistic. if (M[0] < 0) BlockElts = BlockSize / EltSz; if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) return false; for (unsigned i = 0; i < NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) return false; } return true; } static bool isZIPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != Idx) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) return false; Idx += 1; } return true; } static bool isUZPMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i != NumElts; ++i) { if (M[i] < 0) continue; // ignore UNDEF indices if ((unsigned)M[i] != 2 * i + WhichResult) return false; } return true; } static bool isTRNMask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) return false; } return true; } /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. static bool isZIP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); unsigned Idx = WhichResult * NumElts / 2; for (unsigned i = 0; i != NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != Idx) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) return false; Idx += 1; } return true; } /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". 
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>, static bool isUZP_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned Half = VT.getVectorNumElements() / 2; WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned j = 0; j != 2; ++j) { unsigned Idx = WhichResult; for (unsigned i = 0; i != Half; ++i) { int MIdx = M[i + j * Half]; if (MIdx >= 0 && (unsigned)MIdx != Idx) return false; Idx += 2; } } return true; } /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. static bool isTRN_v_undef_Mask(ArrayRef M, EVT VT, unsigned &WhichResult) { unsigned NumElts = VT.getVectorNumElements(); WhichResult = (M[0] == 0 ? 0 : 1); for (unsigned i = 0; i < NumElts; i += 2) { if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) return false; } return true; } static bool isINSMask(ArrayRef M, int NumInputElements, bool &DstIsLeft, int &Anomaly) { if (M.size() != static_cast(NumInputElements)) return false; int NumLHSMatch = 0, NumRHSMatch = 0; int LastLHSMismatch = -1, LastRHSMismatch = -1; for (int i = 0; i < NumInputElements; ++i) { if (M[i] == -1) { ++NumLHSMatch; ++NumRHSMatch; continue; } if (M[i] == i) ++NumLHSMatch; else LastLHSMismatch = i; if (M[i] == i + NumInputElements) ++NumRHSMatch; else LastRHSMismatch = i; } if (NumLHSMatch == NumInputElements - 1) { DstIsLeft = true; Anomaly = LastLHSMismatch; return true; } else if (NumRHSMatch == NumInputElements - 1) { DstIsLeft = false; Anomaly = LastRHSMismatch; return true; } return false; } static bool isConcatMask(ArrayRef Mask, EVT VT, bool SplitLHS) { if (VT.getSizeInBits() != 128) return false; unsigned NumElts = VT.getVectorNumElements(); for (int I = 0, E = NumElts / 2; I != E; I++) { if (Mask[I] != I) return false; } int Offset = NumElts / 2; for (int I = NumElts / 2, E = NumElts; I != E; I++) { if (Mask[I] != I + SplitLHS * Offset) return false; } return true; } static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { SDLoc DL(Op); EVT VT = Op.getValueType(); SDValue V0 = Op.getOperand(0); SDValue V1 = Op.getOperand(1); ArrayRef Mask = cast(Op)->getMask(); if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || VT.getVectorElementType() != V1.getValueType().getVectorElementType()) return SDValue(); bool SplitV0 = V0.getValueType().getSizeInBits() == 128; if (!isConcatMask(Mask, VT, SplitV0)) return SDValue(); EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), VT.getVectorNumElements() / 2); if (SplitV0) { V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, DAG.getConstant(0, MVT::i64)); } if (V1.getValueType().getSizeInBits() == 128) { V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, DAG.getConstant(0, MVT::i64)); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit /// the specified operations to build the shuffle. 
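//
// Illustrative note (editorial addition, not part of the upstream source):
// a PerfectShuffleTable entry packs the cost in bits [31:30], the opcode in
// bits [29:26], and two 13-bit operand IDs.  An ID encodes a 4-lane mask in
// base 9 (8 meaning undef); the identity mask <0,1,2,3> is
// ((0*9 + 1)*9 + 2)*9 + 3 = 102, which is exactly the LHSID the OP_COPY
// case below compares against, and <4,5,6,7> encodes to 3382 for the RHS
// copy.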
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, SDLoc dl) { unsigned OpNum = (PFEntry >> 26) & 0x0F; unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); enum { OP_COPY = 0, // Copy, used for things like to say it is <0,1,2,3> OP_VREV, OP_VDUP0, OP_VDUP1, OP_VDUP2, OP_VDUP3, OP_VEXT1, OP_VEXT2, OP_VEXT3, OP_VUZPL, // VUZP, left result OP_VUZPR, // VUZP, right result OP_VZIPL, // VZIP, left result OP_VZIPR, // VZIP, right result OP_VTRNL, // VTRN, left result OP_VTRNR // VTRN, right result }; if (OpNum == OP_COPY) { if (LHSID == (1 * 9 + 2) * 9 + 3) return LHS; assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); return RHS; } SDValue OpLHS, OpRHS; OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); EVT VT = OpLHS.getValueType(); switch (OpNum) { default: llvm_unreachable("Unknown shuffle opcode!"); case OP_VREV: // VREV divides the vector in half and swaps within the half. if (VT.getVectorElementType() == MVT::i32 || VT.getVectorElementType() == MVT::f32) return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); // vrev <4 x i16> -> REV32 if (VT.getVectorElementType() == MVT::i16 || VT.getVectorElementType() == MVT::f16) return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); // vrev <4 x i8> -> REV16 assert(VT.getVectorElementType() == MVT::i8); return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); case OP_VDUP0: case OP_VDUP1: case OP_VDUP2: case OP_VDUP3: { EVT EltTy = VT.getVectorElementType(); unsigned Opcode; if (EltTy == MVT::i8) Opcode = AArch64ISD::DUPLANE8; else if (EltTy == MVT::i16) Opcode = AArch64ISD::DUPLANE16; else if (EltTy == MVT::i32 || EltTy == MVT::f32) Opcode = AArch64ISD::DUPLANE32; else if (EltTy == MVT::i64 || EltTy == MVT::f64) Opcode = AArch64ISD::DUPLANE64; else llvm_unreachable("Invalid vector element type?"); if (VT.getSizeInBits() == 64) OpLHS = WidenVector(OpLHS, DAG); SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); } case OP_VEXT1: case OP_VEXT2: case OP_VEXT3: { unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, DAG.getConstant(Imm, MVT::i32)); } case OP_VUZPL: return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VUZPR: return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VZIPL: return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VZIPR: return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VTRNL: return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); case OP_VTRNR: return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS); } } static SDValue GenerateTBL(SDValue Op, ArrayRef ShuffleMask, SelectionDAG &DAG) { // Check to see if we can use the TBL instruction. 
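  // Illustrative example (editorial addition, not part of the upstream
  // source): for a v4i16 shuffle with mask <0, 2, 5, 7>, BytesPerElt is 2
  // and the TBL byte indices become {0,1, 4,5, 10,11, 14,15}; bytes 8-15
  // select from the second source once the two D registers are concatenated
  // into a single 128-bit table.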
SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); SDLoc DL(Op); EVT EltVT = Op.getValueType().getVectorElementType(); unsigned BytesPerElt = EltVT.getSizeInBits() / 8; SmallVector TBLMask; for (int Val : ShuffleMask) { for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { unsigned Offset = Byte + Val * BytesPerElt; TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); } } MVT IndexVT = MVT::v8i8; unsigned IndexLen = 8; if (Op.getValueType().getSizeInBits() == 128) { IndexVT = MVT::v16i8; IndexLen = 16; } SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); SDValue Shuffle; if (V2.getNode()->getOpcode() == ISD::UNDEF) { if (IndexLen == 8) V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, makeArrayRef(TBLMask.data(), IndexLen))); } else { if (IndexLen == 8) { V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, makeArrayRef(TBLMask.data(), IndexLen))); } else { // FIXME: We cannot, for the moment, emit a TBL2 instruction because we // cannot currently represent the register constraints on the input // table registers. // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, // &TBLMask[0], IndexLen)); Shuffle = DAG.getNode( ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst, DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, makeArrayRef(TBLMask.data(), IndexLen))); } } return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); } static unsigned getDUPLANEOp(EVT EltType) { if (EltType == MVT::i8) return AArch64ISD::DUPLANE8; if (EltType == MVT::i16 || EltType == MVT::f16) return AArch64ISD::DUPLANE16; if (EltType == MVT::i32 || EltType == MVT::f32) return AArch64ISD::DUPLANE32; if (EltType == MVT::i64 || EltType == MVT::f64) return AArch64ISD::DUPLANE64; llvm_unreachable("Invalid vector element type?"); } SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); ShuffleVectorSDNode *SVN = cast(Op.getNode()); // Convert shuffles that are directly supported on NEON to target-specific // DAG nodes, instead of keeping them as shuffles and matching them again // during code selection. This is more efficient and avoids the possibility // of inconsistencies between legalization and selection. ArrayRef ShuffleMask = SVN->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], V1.getValueType().getSimpleVT())) { int Lane = SVN->getSplatIndex(); // If this is undef splat, generate it via "just" vdup, if possible. if (Lane == -1) Lane = 0; if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), V1.getOperand(0)); // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- // constant. If so, we can just reference the lane's definition directly. 
if (V1.getOpcode() == ISD::BUILD_VECTOR && !isa(V1.getOperand(Lane))) return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); // Otherwise, duplicate from the lane of the input vector. unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); // SelectionDAGBuilder may have "helpfully" already extracted or conatenated // to make a vector of the same size as this SHUFFLE. We can ignore the // extract entirely, and canonicalise the concat using WidenVector. if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { Lane += cast(V1.getOperand(1))->getZExtValue(); V1 = V1.getOperand(0); } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; Lane -= Idx * VT.getVectorNumElements() / 2; V1 = WidenVector(V1.getOperand(Idx), DAG); } else if (VT.getSizeInBits() == 64) V1 = WidenVector(V1, DAG); return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64)); } if (isREVMask(ShuffleMask, VT, 64)) return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); if (isREVMask(ShuffleMask, VT, 32)) return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); if (isREVMask(ShuffleMask, VT, 16)) return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); bool ReverseEXT = false; unsigned Imm; if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { if (ReverseEXT) std::swap(V1, V2); Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, DAG.getConstant(Imm, MVT::i32)); } else if (V2->getOpcode() == ISD::UNDEF && isSingletonEXTMask(ShuffleMask, VT, Imm)) { Imm *= getExtFactor(V1); return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, DAG.getConstant(Imm, MVT::i32)); } unsigned WhichResult; if (isZIPMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isUZPMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isTRNMask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); } if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); } SDValue Concat = tryFormConcatFromShuffle(Op, DAG); if (Concat.getNode()) return Concat; bool DstIsLeft; int Anomaly; int NumInputElements = V1.getValueType().getVectorNumElements(); if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { SDValue DstVec = DstIsLeft ? 
V1 : V2; SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64); SDValue SrcVec = V1; int SrcLane = ShuffleMask[Anomaly]; if (SrcLane >= NumInputElements) { SrcVec = V2; SrcLane -= VT.getVectorNumElements(); } SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); EVT ScalarVT = VT.getVectorElementType(); if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) ScalarVT = MVT::i32; return DAG.getNode( ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), DstLaneV); } // If the shuffle is not directly supported and it has 4 elements, use // the PerfectShuffle-generated table to synthesize it from other shuffles. unsigned NumElts = VT.getVectorNumElements(); if (NumElts == 4) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (ShuffleMask[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = ShuffleMask[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); } return GenerateTBL(Op, ShuffleMask, DAG); } static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, APInt &UndefBits) { EVT VT = BVN->getValueType(0); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; for (unsigned i = 0; i < NumSplats; ++i) { CnstBits <<= SplatBitSize; UndefBits <<= SplatBitSize; CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); } return true; } return false; } SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, SelectionDAG &DAG) const { BuildVectorSDNode *BVN = dyn_cast(Op.getOperand(1).getNode()); SDValue LHS = Op.getOperand(0); SDLoc dl(Op); EVT VT = Op.getValueType(); if (!BVN) return Op; APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, CnstBits, UndefBits)) { // We only have BIC vector immediate instruction, which is and-not. CnstBits = ~CnstBits; // We make use of a little bit of goto ickiness in order to avoid having to // duplicate the immediate matching logic for the undef toggled case. bool SecondTry = false; AttemptModImm: if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { CnstBits = CnstBits.zextOrTrunc(64); uint64_t CnstVal = CnstBits.getZExtValue(); if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(16, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(24, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } if (SecondTry) goto FailedModImm; SecondTry = true; CnstBits = ~UndefBits; goto AttemptModImm; } // We can always fall back to a non-immediate AND. FailedModImm: return Op; } // Specialized code to quickly find if PotentialBVec is a BuildVector that // consists of only the same constant int value, returned in reference arg // ConstVal static bool isAllConstantBuildVector(const SDValue &PotentialBVec, uint64_t &ConstVal) { BuildVectorSDNode *Bvec = dyn_cast(PotentialBVec); if (!Bvec) return false; ConstantSDNode *FirstElt = dyn_cast(Bvec->getOperand(0)); if (!FirstElt) return false; EVT VT = Bvec->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); for (unsigned i = 1; i < NumElts; ++i) if (dyn_cast(Bvec->getOperand(i)) != FirstElt) return false; ConstVal = FirstElt->getZExtValue(); return true; } static unsigned getIntrinsicID(const SDNode *N) { unsigned Opcode = N->getOpcode(); switch (Opcode) { default: return Intrinsic::not_intrinsic; case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast(N->getOperand(0))->getZExtValue(); if (IID < Intrinsic::num_intrinsics) return IID; return Intrinsic::not_intrinsic; } } } // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)), // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a // BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2. // Also, logical shift right -> sri, with the same structure. static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); SDLoc DL(N); // Is the first op an AND? const SDValue And = N->getOperand(0); if (And.getOpcode() != ISD::AND) return SDValue(); // Is the second op an shl or lshr? SDValue Shift = N->getOperand(1); // This will have been turned into: AArch64ISD::VSHL vector, #shift // or AArch64ISD::VLSHR vector, #shift unsigned ShiftOpc = Shift.getOpcode(); if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) return SDValue(); bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; // Is the shift amount constant? ConstantSDNode *C2node = dyn_cast(Shift.getOperand(1)); if (!C2node) return SDValue(); // Is the and mask vector all constant? 
uint64_t C1; if (!isAllConstantBuildVector(And.getOperand(1), C1)) return SDValue(); // Is C1 == ~C2, taking into account how much one can shift elements of a // particular size? uint64_t C2 = C2node->getZExtValue(); unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits(); if (C2 > ElemSizeInBits) return SDValue(); unsigned ElemMask = (1 << ElemSizeInBits) - 1; if ((C1 & ElemMask) != (~C2 & ElemMask)) return SDValue(); SDValue X = And.getOperand(0); SDValue Y = Shift.getOperand(0); unsigned Intrin = IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; SDValue ResultSLI = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1)); DEBUG(dbgs() << "aarch64-lower: transformed: \n"); DEBUG(N->dump(&DAG)); DEBUG(dbgs() << "into: \n"); DEBUG(ResultSLI->dump(&DAG)); ++NumShiftInserts; return ResultSLI; } SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, SelectionDAG &DAG) const { // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) if (EnableAArch64SlrGeneration) { SDValue Res = tryLowerToSLI(Op.getNode(), DAG); if (Res.getNode()) return Res; } BuildVectorSDNode *BVN = dyn_cast(Op.getOperand(0).getNode()); SDValue LHS = Op.getOperand(1); SDLoc dl(Op); EVT VT = Op.getValueType(); // OR commutes, so try swapping the operands. if (!BVN) { LHS = Op.getOperand(0); BVN = dyn_cast(Op.getOperand(1).getNode()); } if (!BVN) return Op; APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, CnstBits, UndefBits)) { // We make use of a little bit of goto ickiness in order to avoid having to // duplicate the immediate matching logic for the undef toggled case. bool SecondTry = false; AttemptModImm: if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { CnstBits = CnstBits.zextOrTrunc(64); uint64_t CnstVal = CnstBits.getZExtValue(); if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(16, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(24, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } if (SecondTry) goto FailedModImm; SecondTry = true; CnstBits = UndefBits; goto AttemptModImm; } // We can always fall back to a non-immediate OR. FailedModImm: return Op; } // Normalize the operands of BUILD_VECTOR. The value of constant operands will // be truncated to fit element width. static SDValue NormalizeBuildVector(SDValue Op, SelectionDAG &DAG) { assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); SDLoc dl(Op); EVT VT = Op.getValueType(); EVT EltTy= VT.getVectorElementType(); if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) return Op; SmallVector Ops; for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) { SDValue Lane = Op.getOperand(I); if (Lane.getOpcode() == ISD::Constant) { APInt LowBits(EltTy.getSizeInBits(), cast(Lane)->getZExtValue()); Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32); } Ops.push_back(Lane); } return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); EVT VT = Op.getValueType(); Op = NormalizeBuildVector(Op, DAG); BuildVectorSDNode *BVN = cast(Op.getNode()); APInt CnstBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); if (resolveBuildVector(BVN, CnstBits, UndefBits)) { // We make use of a little bit of goto ickiness in order to avoid having to // duplicate the immediate matching logic for the undef toggled case. bool SecondTry = false; AttemptModImm: if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { CnstBits = CnstBits.zextOrTrunc(64); uint64_t CnstVal = CnstBits.getZExtValue(); // Certain magic vector constants (used to express things like NOT // and NEG) are passed through unmodified. This allows codegen patterns // for these operations to match. Special-purpose patterns will lower // these immediates to MOVIs if it proves necessary. if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) return Op; // The many faces of MOVI... if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); if (VT.getSizeInBits() == 128) { SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, DAG.getConstant(CnstVal, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } // Support the V64 version via subregister insertion. SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, DAG.getConstant(CnstVal, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(16, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(24, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(264, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(272, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } // The few faces of FMOV... if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && VT.getSizeInBits() == 128) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, DAG.getConstant(CnstVal, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } // The many faces of MVNI... CnstVal = ~CnstVal; if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? 
MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(16, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(24, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(0, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(8, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(264, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, DAG.getConstant(CnstVal, MVT::i32), DAG.getConstant(272, MVT::i32)); return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); } } if (SecondTry) goto FailedModImm; SecondTry = true; CnstBits = UndefBits; goto AttemptModImm; } FailedModImm: // Scan through the operands to find some interesting properties we can // exploit: // 1) If only one value is used, we can use a DUP, or // 2) if only the low element is not undef, we can just insert that, or // 3) if only one constant value is used (w/ some non-constant lanes), // we can splat the constant value into the whole vector then fill // in the non-constant lanes. // 4) FIXME: If different constant values are used, but we can intelligently // select the values we'll be overwriting for the non-constant // lanes such that we can directly materialize the vector // some other way (MOVI, e.g.), we can be sneaky. 
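  // Illustrative example (editorial addition, not part of the upstream
  // source): BUILD_VECTOR <x, x, x, x> with a non-constant x becomes a
  // single DUP (case 1), while <1.0f, 1.0f, 1.0f, y> splats 1.0f with DUP
  // and then overwrites lane 3 with INSERT_VECTOR_ELT (case 3), instead of
  // initializing all four lanes separately.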
unsigned NumElts = VT.getVectorNumElements(); bool isOnlyLowElement = true; bool usesOnlyOneValue = true; bool usesOnlyOneConstantValue = true; bool isConstant = true; unsigned NumConstantLanes = 0; SDValue Value; SDValue ConstantValue; for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; if (i > 0) isOnlyLowElement = false; if (!isa(V) && !isa(V)) isConstant = false; if (isa(V) || isa(V)) { ++NumConstantLanes; if (!ConstantValue.getNode()) ConstantValue = V; else if (ConstantValue != V) usesOnlyOneConstantValue = false; } if (!Value.getNode()) Value = V; else if (V != Value) usesOnlyOneValue = false; } if (!Value.getNode()) return DAG.getUNDEF(VT); if (isOnlyLowElement) return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); // Use DUP for non-constant splats. For f32 constant splats, reduce to // i32 and try again. if (usesOnlyOneValue) { if (!isConstant) { if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || Value.getValueType() != VT) return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); // This is actually a DUPLANExx operation, which keeps everything vectory. // DUPLANE works on 128-bit vectors, widen it if necessary. SDValue Lane = Value.getOperand(1); Value = Value.getOperand(0); if (Value.getValueType().getSizeInBits() == 64) Value = WidenVector(Value, DAG); unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); return DAG.getNode(Opcode, dl, VT, Value, Lane); } if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; MVT NewType = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; for (unsigned i = 0; i < NumElts; ++i) Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); Val = LowerBUILD_VECTOR(Val, DAG); if (Val.getNode()) return DAG.getNode(ISD::BITCAST, dl, VT, Val); } } // If there was only one constant value used and for more than one lane, // start by splatting that value, then replace the non-constant lanes. This // is better than the default, which will perform a separate initialization // for each lane. if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); // Now insert the non-constant lanes. for (unsigned i = 0; i < NumElts; ++i) { SDValue V = Op.getOperand(i); SDValue LaneIdx = DAG.getConstant(i, MVT::i64); if (!isa(V) && !isa(V)) { // Note that type legalization likely mucked about with the VT of the // source operand, so we may have to convert it here before inserting. Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); } } return Val; } // If all elements are constants and the case above didn't get hit, fall back // to the default expansion, which will generate a load from the constant // pool. if (isConstant) return SDValue(); // Empirical tests suggest this is rarely worth it for vectors of length <= 2. if (NumElts >= 4) { SDValue shuffle = ReconstructShuffle(Op, DAG); if (shuffle != SDValue()) return shuffle; } // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. For a vector with one or two non-undef values, that's // scalar_to_vector for the elements followed by a shuffle (provided the // shuffle is valid for the target) and materialization element by element // on the stack followed by a load for everything else. 
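  // Illustrative example (editorial addition, not part of the upstream
  // source): a v2f64 BUILD_VECTOR <a, b> of two distinct non-constant
  // scalars takes the path below: lane 0 is placed with an INSERT_SUBREG
  // into dsub, avoiding a read-modify-write of the whole Q register, and
  // lane 1 with a single INSERT_VECTOR_ELT.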
if (!isConstant && !usesOnlyOneValue) { SDValue Vec = DAG.getUNDEF(VT); SDValue Op0 = Op.getOperand(0); unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); unsigned i = 0; // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to // a) Avoid a RMW dependency on the full vector register, and // b) Allow the register coalescer to fold away the copy if the // value is already in an S or D register. if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; MachineSDNode *N = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, DAG.getTargetConstant(SubIdx, MVT::i32)); Vec = SDValue(N, 0); ++i; } for (; i < NumElts; ++i) { SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; SDValue LaneIdx = DAG.getConstant(i, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); } return Vec; } // Just use the default expansion. We failed to find a better alternative. return SDValue(); } SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); // Check for non-constant or out of range lane. EVT VT = Op.getOperand(0).getValueType(); ConstantSDNode *CI = dyn_cast(Op.getOperand(2)); if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) return SDValue(); // For V64 types, we perform insertion by expanding the value // to a V128 type and perform the insertion on that. SDLoc DL(Op); SDValue WideVec = WidenVector(Op.getOperand(0), DAG); EVT WideTy = WideVec.getValueType(); SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, Op.getOperand(1), Op.getOperand(2)); // Re-narrow the resultant vector. return NarrowVector(Node, DAG); } SDValue AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); // Check for non-constant or out of range lane. EVT VT = Op.getOperand(0).getValueType(); ConstantSDNode *CI = dyn_cast(Op.getOperand(1)); if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) return SDValue(); // Insertion/extraction are legal for V128 types. if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v8f16) return Op; if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) return SDValue(); // For V64 types, we perform extraction by expanding the value // to a V128 type and perform the extraction on that. SDLoc DL(Op); SDValue WideVec = WidenVector(Op.getOperand(0), DAG); EVT WideTy = WideVec.getValueType(); EVT ExtrTy = WideTy.getVectorElementType(); if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) ExtrTy = MVT::i32; // For extractions, we just return the result directly. return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, Op.getOperand(1)); } SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getOperand(0).getValueType(); SDLoc dl(Op); // Just in case... 
if (!VT.isVector()) return SDValue(); ConstantSDNode *Cst = dyn_cast(Op.getOperand(1)); if (!Cst) return SDValue(); unsigned Val = Cst->getZExtValue(); unsigned Size = Op.getValueType().getSizeInBits(); if (Val == 0) { switch (Size) { case 8: return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(), Op.getOperand(0)); case 16: return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(), Op.getOperand(0)); case 32: return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(), Op.getOperand(0)); case 64: return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(), Op.getOperand(0)); default: llvm_unreachable("Unexpected vector type in extract_subvector!"); } } // If this is extracting the upper 64-bits of a 128-bit vector, we match // that directly. if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) return Op; return SDValue(); } bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const { if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned PFIndexes[4]; for (unsigned i = 0; i != 4; ++i) { if (M[i] < 0) PFIndexes[i] = 8; else PFIndexes[i] = M[i]; } // Compute the index in the perfect shuffle table. unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 + PFIndexes[3]; unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; unsigned Cost = (PFEntry >> 30); if (Cost <= 4) return true; } bool DummyBool; int DummyInt; unsigned DummyUnsigned; return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || isEXTMask(M, VT, DummyBool, DummyUnsigned) || // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || isZIPMask(M, VT, DummyUnsigned) || isTRN_v_undef_Mask(M, VT, DummyUnsigned) || isUZP_v_undef_Mask(M, VT, DummyUnsigned) || isZIP_v_undef_Mask(M, VT, DummyUnsigned) || isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || isConcatMask(M, VT, VT.getSizeInBits() == 128)); } /// getVShiftImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift operation, where all the elements of the /// build_vector must have the same constant integer value. static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { // Ignore bit_converts. while (Op.getOpcode() == ISD::BITCAST) Op = Op.getOperand(0); BuildVectorSDNode *BVN = dyn_cast(Op.getNode()); APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs, ElementBits) || SplatBitSize > ElementBits) return false; Cnt = SplatBits.getSExtValue(); return true; } /// isVShiftLImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift left operation. That value must be in the range: /// 0 <= Value < ElementBits for a left shift; or /// 0 <= Value <= ElementBits for a long left shift. static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { assert(VT.isVector() && "vector shift count is not a vector type"); unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); if (!getVShiftImm(Op, ElementBits, Cnt)) return false; return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); } /// isVShiftRImm - Check if this is a valid build_vector for the immediate /// operand of a vector shift right operation. 
/// For a shift opcode, the value is positive, but for an intrinsic the value
/// is negative. The absolute value must be in the range:
///   1 <= |Value| <= ElementBits for a right shift; or
///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
                         int64_t &Cnt) {
  assert(VT.isVector() && "vector shift count is not a vector type");
  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
  if (!getVShiftImm(Op, ElementBits, Cnt))
    return false;
  if (isIntrinsic)
    Cnt = -Cnt;
  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}

SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
                                                      SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  SDLoc DL(Op);
  int64_t Cnt;

  if (!Op.getOperand(1).getValueType().isVector())
    return Op;
  unsigned EltSize = VT.getVectorElementType().getSizeInBits();

  switch (Op.getOpcode()) {
  default:
    llvm_unreachable("unexpected shift opcode");

  case ISD::SHL:
    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
      return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, MVT::i32));
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32),
                       Op.getOperand(0), Op.getOperand(1));
  case ISD::SRA:
  case ISD::SRL:
    // Right shift immediate
    if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
        Cnt < EltSize) {
      unsigned Opc =
          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
      return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
                         DAG.getConstant(Cnt, MVT::i32));
    }

    // Right shift register.  Note, there is not a shift right register
    // instruction, but the shift left register instruction takes a signed
    // value, where negative numbers specify a right shift.
    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
                                                : Intrinsic::aarch64_neon_ushl;
    // negate the shift amount
    SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
    SDValue NegShiftLeft =
        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                    DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
    return NegShiftLeft;
  }

  return SDValue();
}

static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
                                    SDLoc dl, SelectionDAG &DAG) {
  EVT SrcVT = LHS.getValueType();
  assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
         "function only supposed to emit natural comparisons");

  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
  APInt CnstBits(VT.getSizeInBits(), 0);
  APInt UndefBits(VT.getSizeInBits(), 0);
  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
  bool IsZero = IsCnst && (CnstBits == 0);

  if (SrcVT.getVectorElementType().isFloatingPoint()) {
    switch (CC) {
    default:
      return SDValue();
    case AArch64CC::NE: {
      SDValue Fcmeq;
      if (IsZero)
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      else
        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
      return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
    }
    case AArch64CC::EQ:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
    case AArch64CC::GE:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
    case AArch64CC::GT:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
    case AArch64CC::LS:
      if (IsZero)
        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
    case AArch64CC::LT:
      if (!NoNans)
        return SDValue();
      // If we ignore NaNs then we can use the MI implementation.
      // Fallthrough.
case AArch64CC::MI: if (IsZero) return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); } } switch (CC) { default: return SDValue(); case AArch64CC::NE: { SDValue Cmeq; if (IsZero) Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); else Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); } case AArch64CC::EQ: if (IsZero) return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); case AArch64CC::GE: if (IsZero) return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); case AArch64CC::GT: if (IsZero) return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); case AArch64CC::LE: if (IsZero) return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); case AArch64CC::LS: return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); case AArch64CC::LO: return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); case AArch64CC::LT: if (IsZero) return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); case AArch64CC::HI: return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); case AArch64CC::HS: return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); } } SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { ISD::CondCode CC = cast(Op.getOperand(2))->get(); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); SDLoc dl(Op); if (LHS.getValueType().getVectorElementType().isInteger()) { assert(LHS.getValueType() == RHS.getValueType()); AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); SDValue Cmp = EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); } assert(LHS.getValueType().getVectorElementType() == MVT::f32 || LHS.getValueType().getVectorElementType() == MVT::f64); // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally // clean. Some of them require two branches to implement. AArch64CC::CondCode CC1, CC2; bool ShouldInvert; changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; SDValue Cmp = EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); if (!Cmp.getNode()) return SDValue(); if (CC2 != AArch64CC::AL) { SDValue Cmp2 = EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); if (!Cmp2.getNode()) return SDValue(); Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); } Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); if (ShouldInvert) return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); return Cmp; } /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. 
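/// For example (illustrative, not part of the original comment): an
/// aarch64_neon_ld2 returning {<4 x i32>, <4 x i32>} reads 32 bytes, so
/// memVT below is conservatively set to v4i64 (32 bytes / 8 == 4 i64 lanes).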
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const { switch (Intrinsic) { case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: case Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: case Intrinsic::aarch64_neon_ld1x4: case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: case Intrinsic::aarch64_neon_ld2r: case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: { Info.opc = ISD::INTRINSIC_W_CHAIN; // Conservatively set memVT to the entire set of vectors loaded. uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align = 0; Info.vol = false; // volatile loads with NEON intrinsics not supported Info.readMem = true; Info.writeMem = false; return true; } case Intrinsic::aarch64_neon_st2: case Intrinsic::aarch64_neon_st3: case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: case Intrinsic::aarch64_neon_st1x4: case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: { Info.opc = ISD::INTRINSIC_VOID; // Conservatively set memVT to the entire set of vectors stored. unsigned NumElts = 0; for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { Type *ArgTy = I.getArgOperand(ArgI)->getType(); if (!ArgTy->isVectorTy()) break; NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; } Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); Info.offset = 0; Info.align = 0; Info.vol = false; // volatile stores with NEON intrinsics not supported Info.readMem = false; Info.writeMem = true; return true; } case Intrinsic::aarch64_ldaxr: case Intrinsic::aarch64_ldxr: { PointerType *PtrTy = cast(I.getArgOperand(0)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = true; Info.writeMem = false; return true; } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(PtrTy->getElementType()); Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); Info.vol = true; Info.readMem = false; Info.writeMem = true; return true; } case Intrinsic::aarch64_ldaxp: case Intrinsic::aarch64_ldxp: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(0); Info.offset = 0; Info.align = 16; Info.vol = true; Info.readMem = true; Info.writeMem = false; return true; } case Intrinsic::aarch64_stlxp: case Intrinsic::aarch64_stxp: { Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::i128; Info.ptrVal = I.getArgOperand(2); Info.offset = 0; Info.align = 16; Info.vol = true; Info.readMem = false; Info.writeMem = true; return true; } default: break; } return false; } // Truncations from 64-bit GPR to 32-bit GPR is free. 
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 > NumBits2; } bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 > NumBits2; } // All 32-bit GPR operations implicitly zero the high-half of the corresponding // 64-bit GPR. bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) return false; unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) return false; unsigned NumBits1 = VT1.getSizeInBits(); unsigned NumBits2 = VT2.getSizeInBits(); return NumBits1 == 32 && NumBits2 == 64; } bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { EVT VT1 = Val.getValueType(); if (isZExtFree(VT1, VT2)) { return true; } if (Val.getOpcode() != ISD::LOAD) return false; // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && VT1.getSizeInBits() <= 32); } bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, unsigned &RequiredAligment) const { if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) return false; // Cyclone supports unaligned accesses. RequiredAligment = 0; unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); return NumBits == 32 || NumBits == 64; } bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const { if (!LoadedType.isSimple() || (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) return false; // Cyclone supports unaligned accesses. RequiredAligment = 0; unsigned NumBits = LoadedType.getSizeInBits(); return NumBits == 32 || NumBits == 64; } static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, unsigned AlignCheck) { return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && (DstAlign == 0 || DstAlign % AlignCheck == 0)); } EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { // Don't use AdvSIMD to implement 16-byte memset. It would have taken one // instruction to materialize the v2i64 zero and one store (with restrictive // addressing mode). Just do two i64 store of zero-registers. bool Fast; const Function *F = MF.getFunction(); if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat) && (memOpAlign(SrcAlign, DstAlign, 16) || (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) return MVT::f128; return Size >= 8 ? MVT::i64 : MVT::i32; } // 12-bit optionally shifted immediates are legal for adds. 
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
  if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
    return true;
  return false;
}

// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
  if (Immed < 0)
    Immed *= -1;
  return isLegalAddImmediate(Immed);
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                                  Type *Ty) const {
  // AArch64 has five basic addressing modes:
  //  reg
  //  reg + 9-bit signed offset
  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
  //  reg1 + reg2
  //  reg + SIZE_IN_BYTES * reg

  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // No reg+reg+imm addressing.
  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
    return false;

  // check reg + imm case:
  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
  uint64_t NumBytes = 0;
  if (Ty->isSized()) {
    uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
    NumBytes = NumBits / 8;
    if (!isPowerOf2_64(NumBits))
      NumBytes = 0;
  }

  if (!AM.Scale) {
    int64_t Offset = AM.BaseOffs;

    // 9-bit signed offset
    if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
      return true;

    // 12-bit unsigned offset
    unsigned shift = Log2_64(NumBytes);
    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
        // Must be a multiple of NumBytes (NumBytes is a power of 2)
        (Offset >> shift) << shift == Offset)
      return true;
    return false;
  }

  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
  if (!AM.Scale || AM.Scale == 1 ||
      (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
    return true;
  return false;
}

int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
                                                Type *Ty) const {
  // Scaling factors are not free at all.
  // Operands                     | Rt Latency
  // -------------------------------------------
  // Rt, [Xn, Xm]                 | 4
  // -------------------------------------------
  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
  // Rt, [Xn, Wm, <extend> #imm]  |
  if (isLegalAddressingMode(AM, Ty))
    // Scale represents reg2 * scale, thus account for 1 if
    // it is not equal to 0 or 1.
    return AM.Scale != 0 && AM.Scale != 1;
  return -1;
}

bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}

const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
  // LR is a callee-save register, but we must treat it as clobbered by any call
  // site. Hence we include LR in the scratch registers, which are in turn added
  // as implicit-defs for stackmaps and patchpoints.
  static const MCPhysReg ScratchRegs[] = {
    AArch64::X16, AArch64::X17, AArch64::LR, 0
  };
  return ScratchRegs;
}

bool AArch64TargetLowering::isDesirableToCommuteWithShift(
    const SDNode *N) const {
  EVT VT = N->getValueType(0);

  // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
  // it with shift to let it be lowered to UBFX.
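  // e.g. (illustrative) for ((x >> 7) & 0xff), keeping the srl in place lets
  // isel select "ubfx x0, x1, #7, #8" rather than a shift plus an and.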
if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && isa(N->getOperand(1))) { uint64_t TruncMask = N->getConstantOperandVal(1); if (isMask_64(TruncMask) && N->getOperand(0).getOpcode() == ISD::SRL && isa(N->getOperand(0)->getOperand(1))) return false; } return true; } bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0) return false; int64_t Val = Imm.getSExtValue(); if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) return true; if ((int64_t)Val < 0) Val = ~Val; if (BitSize == 32) Val &= (1LL << 32) - 1; unsigned LZ = countLeadingZeros((uint64_t)Val); unsigned Shift = (63 - LZ) / 16; // MOVZ is free so return true for one or fewer MOVK. return (Shift < 3) ? true : false; } // Generate SUBS and CSEL for integer abs. static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDLoc DL(N); // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) // and change it to SUB and CSEL. if (VT.isInteger() && N->getOpcode() == ISD::XOR && N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) if (ConstantSDNode *Y1C = dyn_cast(N1.getOperand(1))) if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), N0.getOperand(0)); // Generate SUBS & CSEL. SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), N0.getOperand(0), DAG.getConstant(0, VT)); return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, DAG.getConstant(AArch64CC::PL, MVT::i32), SDValue(Cmp.getNode(), 1)); } return SDValue(); } // performXorCombine - Attempts to handle integer ABS. static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { if (DCI.isBeforeLegalizeOps()) return SDValue(); return performIntegerAbsCombine(N, DAG); } SDValue AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector *Created) const { // fold (sdiv X, pow2) EVT VT = N->getValueType(0); if ((VT != MVT::i32 && VT != MVT::i64) || !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) return SDValue(); SDLoc DL(N); SDValue N0 = N->getOperand(0); unsigned Lg2 = Divisor.countTrailingZeros(); SDValue Zero = DAG.getConstant(0, VT); SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, VT); // Add (N0 < 0) ? Pow2 - 1 : 0; SDValue CCVal; SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); if (Created) { Created->push_back(Cmp.getNode()); Created->push_back(Add.getNode()); Created->push_back(CSel.getNode()); } // Divide by pow2. SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, MVT::i64)); // If we're dividing by a positive value, we're done. Otherwise, we must // negate the result. 
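  // A worked example of the sequence built above (illustrative): for an i32
  // "sdiv %x, 4", Lg2 == 2 and the emitted code is
  //   cmp  w0, #0           ; compare N0 against zero
  //   add  w8, w0, #3       ; N0 + (Pow2 - 1)
  //   csel w8, w8, w0, lt   ; use the biased value only when N0 is negative
  //   asr  w0, w8, #2       ; shift right by Lg2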
  if (Divisor.isNonNegative())
    return SRA;

  if (Created)
    Created->push_back(SRA.getNode());
  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), SRA);
}

static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const AArch64Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  // Multiplication of a power of two plus/minus one can be done more
  // cheaply as a shift+add/sub. For now, this is true unilaterally. If
  // future CPUs have a cheaper MADD instruction, this may need to be
  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
  // 64-bit is 5 cycles, so this is always a win.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    APInt Value = C->getAPIntValue();
    EVT VT = N->getValueType(0);
    if (Value.isNonNegative()) {
      // (mul x, 2^N + 1) => (add (shl x, N), x)
      APInt VM1 = Value - 1;
      if (VM1.isPowerOf2()) {
        SDValue ShiftedVal =
            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
                        DAG.getConstant(VM1.logBase2(), MVT::i64));
        return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal,
                           N->getOperand(0));
      }
      // (mul x, 2^N - 1) => (sub (shl x, N), x)
      APInt VP1 = Value + 1;
      if (VP1.isPowerOf2()) {
        SDValue ShiftedVal =
            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
                        DAG.getConstant(VP1.logBase2(), MVT::i64));
        return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal,
                           N->getOperand(0));
      }
    } else {
      // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
      APInt VNM1 = -Value - 1;
      if (VNM1.isPowerOf2()) {
        SDValue ShiftedVal =
            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
                        DAG.getConstant(VNM1.logBase2(), MVT::i64));
        SDValue Add =
            DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
        return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add);
      }
      // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
      APInt VNP1 = -Value + 1;
      if (VNP1.isPowerOf2()) {
        SDValue ShiftedVal =
            DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
                        DAG.getConstant(VNP1.logBase2(), MVT::i64));
        return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0),
                           ShiftedVal);
      }
    }
  }
  return SDValue();
}

static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
                                                         SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //       AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)

  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
    return Res;
  }
  return SDValue();
}

static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
                                     const AArch64Subtarget *Subtarget) {
  // First try to optimize away the conversion when it's conditionally from
  // a constant. Vectors only.
  SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG);
  if (Res != SDValue())
    return Res;

  EVT VT = N->getValueType(0);
  if (VT != MVT::f32 && VT != MVT::f64)
    return SDValue();

  // Only optimize when the source and destination types have the same width.
  if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
    return SDValue();

  // If the result of an integer load is only used by an integer-to-float
  // conversion, use a fp load and an AdvSIMD scalar {S|U}CVTF instead.
  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
  SDValue N0 = N->getOperand(0);
  if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) &&
      N0.hasOneUse() &&
      // Do not change the width of a volatile load.
      !cast<LoadSDNode>(N0)->isVolatile()) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                               LN0->getPointerInfo(), LN0->isVolatile(),
                               LN0->isNonTemporal(), LN0->isInvariant(),
                               LN0->getAlignment());

    // Make sure successors of the original load stay after it by updating them
    // to use the new Chain.
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));

    unsigned Opcode =
        (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF
                                            : AArch64ISD::UITOF;
    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
  }

  return SDValue();
}

/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
                         bool &FromHi) {
  if (N.getOpcode() == ISD::SHL)
    FromHi = false;
  else if (N.getOpcode() == ISD::SRL)
    FromHi = true;
  else
    return false;

  if (!isa<ConstantSDNode>(N.getOperand(1)))
    return false;

  ShiftAmount = N->getConstantOperandVal(1);
  Src = N->getOperand(0);
  return true;
}

/// The EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
/// EXTR. Can't quite be done in TableGen because the two immediates aren't
/// independent.
static SDValue tryCombineToEXTR(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  assert(N->getOpcode() == ISD::OR && "Unexpected root");

  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDValue LHS;
  uint32_t ShiftLHS = 0;
  bool LHSFromHi = 0;
  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
    return SDValue();

  SDValue RHS;
  uint32_t ShiftRHS = 0;
  bool RHSFromHi = 0;
  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
    return SDValue();

  // If they're both trying to come from the high part of the register, they're
  // not really an EXTR.
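  // By contrast, a well-formed i32 example (illustrative, not from the
  // original source): (or (shl x, #24), (srl y, #8)) has
  // ShiftLHS + ShiftRHS == 32 and lowers to "extr w0, w_x, w_y, #8".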
if (LHSFromHi == RHSFromHi) return SDValue(); if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) return SDValue(); if (LHSFromHi) { std::swap(LHS, RHS); std::swap(ShiftLHS, ShiftRHS); } return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, DAG.getConstant(ShiftRHS, MVT::i64)); } static SDValue tryCombineToBSL(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { EVT VT = N->getValueType(0); SelectionDAG &DAG = DCI.DAG; SDLoc DL(N); if (!VT.isVector()) return SDValue(); SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::AND) return SDValue(); SDValue N1 = N->getOperand(1); if (N1.getOpcode() != ISD::AND) return SDValue(); // We only have to look for constant vectors here since the general, variable // case can be handled in TableGen. unsigned Bits = VT.getVectorElementType().getSizeInBits(); uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); for (int i = 1; i >= 0; --i) for (int j = 1; j >= 0; --j) { BuildVectorSDNode *BVN0 = dyn_cast(N0->getOperand(i)); BuildVectorSDNode *BVN1 = dyn_cast(N1->getOperand(j)); if (!BVN0 || !BVN1) continue; bool FoundMatch = true; for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { ConstantSDNode *CN0 = dyn_cast(BVN0->getOperand(k)); ConstantSDNode *CN1 = dyn_cast(BVN1->getOperand(k)); if (!CN0 || !CN1 || CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { FoundMatch = false; break; } } if (FoundMatch) return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), N0->getOperand(1 - i), N1->getOperand(1 - j)); } return SDValue(); } static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) if (!EnableAArch64ExtrGeneration) return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); SDValue Res = tryCombineToEXTR(N, DCI); if (Res.getNode()) return Res; Res = tryCombineToBSL(N, DCI); if (Res.getNode()) return Res; return SDValue(); } static SDValue performBitcastCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // Remove extraneous bitcasts around an extract_subvector. // For example, // (v4i16 (bitconvert // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) // becomes // (extract_subvector ((v8i16 ...), (i64 4))) // Only interested in 64-bit vectors as the ultimate result. EVT VT = N->getValueType(0); if (!VT.isVector()) return SDValue(); if (VT.getSimpleVT().getSizeInBits() != 64) return SDValue(); // Is the operand an extract_subvector starting at the beginning or halfway // point of the vector? A low half may also come through as an // EXTRACT_SUBREG, so look for that, too. SDValue Op0 = N->getOperand(0); if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && !(Op0->isMachineOpcode() && Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) return SDValue(); uint64_t idx = cast(Op0->getOperand(1))->getZExtValue(); if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) return SDValue(); } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { if (idx != AArch64::dsub) return SDValue(); // The dsub reference is equivalent to a lane zero subvector reference. idx = 0; } // Look through the bitcast of the input to the extract. 
if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) return SDValue(); SDValue Source = Op0->getOperand(0)->getOperand(0); // If the source type has twice the number of elements as our destination // type, we know this is an extract of the high or low half of the vector. EVT SVT = Source->getValueType(0); if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) return SDValue(); DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); // Create the simplified form to just extract the low or high half of the // vector directly rather than bothering with the bitcasts. SDLoc dl(N); unsigned NumElements = VT.getVectorNumElements(); if (idx) { SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); } else { SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32); return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, Source, SubReg), 0); } } static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); SDLoc dl(N); EVT VT = N->getValueType(0); // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector // splat. The indexed instructions are going to be expecting a DUPLANE64, so // canonicalise to that. if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { assert(VT.getVectorElementType().getSizeInBits() == 64); return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N->getOperand(0), DAG), DAG.getConstant(0, MVT::i64)); } // Canonicalise concat_vectors so that the right-hand vector has as few // bit-casts as possible before its real operation. The primary matching // destination for these operations will be the narrowing "2" instructions, // which depend on the operation being performed on this right-hand vector. // For example, // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) // becomes // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) SDValue Op1 = N->getOperand(1); if (Op1->getOpcode() != ISD::BITCAST) return SDValue(); SDValue RHS = Op1->getOperand(0); MVT RHSTy = RHS.getValueType().getSimpleVT(); // If the RHS is not a vector, this is not the pattern we're looking for. if (!RHSTy.isVector()) return SDValue(); DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), RHSTy.getVectorNumElements() * 2); return DAG.getNode( ISD::BITCAST, dl, VT, DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); } static SDValue tryCombineFixedPointConvert(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // Wait 'til after everything is legalized to try this. That way we have // legal vector types and such. if (DCI.isBeforeLegalizeOps()) return SDValue(); // Transform a scalar conversion of a value from a lane extract into a // lane extract of a vector conversion. E.g., from foo1 to foo2: // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } // // The second form interacts better with instruction selection and the // register allocator to avoid cross-class register copies that aren't // coalescable due to a lane reference. // Check the operand and see if it originates from a lane extract. 
SDValue Op1 = N->getOperand(1); if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { // Yep, no additional predication needed. Perform the transform. SDValue IID = N->getOperand(0); SDValue Shift = N->getOperand(2); SDValue Vec = Op1.getOperand(0); SDValue Lane = Op1.getOperand(1); EVT ResTy = N->getValueType(0); EVT VecResTy; SDLoc DL(N); // The vector width should be 128 bits by the time we get here, even // if it started as 64 bits (the extract_vector handling will have // done so). assert(Vec.getValueType().getSizeInBits() == 128 && "unexpected vector size on extract_vector_elt!"); if (Vec.getValueType() == MVT::v4i32) VecResTy = MVT::v4f32; else if (Vec.getValueType() == MVT::v2i64) VecResTy = MVT::v2f64; else llvm_unreachable("unexpected vector type!"); SDValue Convert = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); } return SDValue(); } // AArch64 high-vector "long" operations are formed by performing the non-high // version on an extract_subvector of each operand which gets the high half: // // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) // // However, there are cases which don't have an extract_high explicitly, but // have another operation that can be made compatible with one for free. For // example: // // (dupv64 scalar) --> (extract_high (dup128 scalar)) // // This routine does the actual conversion of such DUPs, once outer routines // have determined that everything else is in order. static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { // We can handle most types of duplicate, but the lane ones have an extra // operand saying *which* lane, so we need to know. bool IsDUPLANE; switch (N.getOpcode()) { case AArch64ISD::DUP: IsDUPLANE = false; break; case AArch64ISD::DUPLANE8: case AArch64ISD::DUPLANE16: case AArch64ISD::DUPLANE32: case AArch64ISD::DUPLANE64: IsDUPLANE = true; break; default: return SDValue(); } MVT NarrowTy = N.getSimpleValueType(); if (!NarrowTy.is64BitVector()) return SDValue(); MVT ElementTy = NarrowTy.getVectorElementType(); unsigned NumElems = NarrowTy.getVectorNumElements(); MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); SDValue NewDUP; if (IsDUPLANE) NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0), N.getOperand(1)); else NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy, NewDUP, DAG.getConstant(NumElems, MVT::i64)); } static bool isEssentiallyExtractSubvector(SDValue N) { if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) return true; return N.getOpcode() == ISD::BITCAST && N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; } /// \brief Helper structure to keep track of ISD::SET_CC operands. struct GenericSetCCInfo { const SDValue *Opnd0; const SDValue *Opnd1; ISD::CondCode CC; }; /// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code. struct AArch64SetCCInfo { const SDValue *Cmp; AArch64CC::CondCode CC; }; /// \brief Helper structure to keep track of SetCC information. union SetCCInfo { GenericSetCCInfo Generic; AArch64SetCCInfo AArch64; }; /// \brief Helper structure to be able to read SetCC information. If set to /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a /// GenericSetCCInfo. 
struct SetCCInfoAndKind {
  SetCCInfo Info;
  bool IsAArch64;
};

/// \brief Check whether or not \p Op is a SET_CC operation, either a generic
/// or an AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meaningful only when this function returns true.
/// \return True when Op is a kind of SET_CC operation.
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
  // If this is a setcc, this is straightforward.
  if (Op.getOpcode() == ISD::SETCC) {
    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
    SetCCInfo.IsAArch64 = false;
    return true;
  }
  // Otherwise, check if this is a matching csel instruction.
  // In other words:
  // - csel 1, 0, cc
  // - csel 0, 1, !cc
  if (Op.getOpcode() != AArch64ISD::CSEL)
    return false;
  // Set the information about the operands.
  // TODO: we want the operands of the Cmp not the csel
  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
  SetCCInfo.IsAArch64 = true;
  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // Check that the operands match the constraints:
  // (1) Both operands must be constants.
  // (2) One must be 1 and the other must be 0.
  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));

  // Check (1).
  if (!TValue || !FValue)
    return false;

  // Check (2).
  if (!TValue->isOne()) {
    // Update the comparison when we are interested in !cc.
    std::swap(TValue, FValue);
    SetCCInfo.Info.AArch64.CC =
        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
  }
  return TValue->isOne() && FValue->isNullValue();
}

// Returns true if Op is setcc or zext of setcc.
static bool isSetCCOrZExtSetCC(const SDValue &Op, SetCCInfoAndKind &Info) {
  if (isSetCC(Op, Info))
    return true;
  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
          isSetCC(Op->getOperand(0), Info));
}

// The folding we want to perform is:
// (add x, [zext] (setcc cc ...) )
//   -->
// (csel x, (add x, 1), !cc ...)
//
// The latter will get matched to a CSINC instruction.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
  SDValue LHS = Op->getOperand(0);
  SDValue RHS = Op->getOperand(1);
  SetCCInfoAndKind InfoAndKind;

  // If neither operand is a SET_CC, give up.
  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
    std::swap(LHS, RHS);
    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
      return SDValue();
  }

  // FIXME: This could be generalized to work for FP comparisons.
  EVT CmpVT = InfoAndKind.IsAArch64
                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
    return SDValue();

  SDValue CCVal;
  SDValue Cmp;
  SDLoc dl(Op);
  if (InfoAndKind.IsAArch64) {
    CCVal = DAG.getConstant(
        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32);
    Cmp = *InfoAndKind.Info.AArch64.Cmp;
  } else
    Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
                        *InfoAndKind.Info.Generic.Opnd1,
                        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
                        CCVal, DAG, dl);

  EVT VT = Op->getValueType(0);
  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
}

// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs.
They are normally matched by // patterns like: // // (add (zeroext (extract_high LHS)), // (zeroext (extract_high RHS))) // -> uaddl2 vD, vN, vM // // However, if one of the extracts is something like a duplicate, this // instruction can still be used profitably. This function puts the DAG into a // more appropriate form for those patterns to trigger. static SDValue performAddSubLongCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); MVT VT = N->getSimpleValueType(0); if (!VT.is128BitVector()) { if (N->getOpcode() == ISD::ADD) return performSetccAddFolding(N, DAG); return SDValue(); } // Make sure both branches are extended in the same way. SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); if ((LHS.getOpcode() != ISD::ZERO_EXTEND && LHS.getOpcode() != ISD::SIGN_EXTEND) || LHS.getOpcode() != RHS.getOpcode()) return SDValue(); unsigned ExtType = LHS.getOpcode(); // It's not worth doing if at least one of the inputs isn't already an // extract, but we don't know which it'll be so we have to try both. if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); if (!RHS.getNode()) return SDValue(); RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); if (!LHS.getNode()) return SDValue(); LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); } return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); } // Massage DAGs which we can use the high-half "long" operations on into // something isel will recognize better. E.g. // // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> // (aarch64_neon_umull (extract_high (v2i64 vec))) // (extract_high (v2i64 (dup128 scalar))))) // static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SDValue LHS = N->getOperand(1); SDValue RHS = N->getOperand(2); assert(LHS.getValueType().is64BitVector() && RHS.getValueType().is64BitVector() && "unexpected shape for long operation"); // Either node could be a DUP, but it's not worth doing both of them (you'd // just as well use the non-high version) so look for a corresponding extract // operation on the other "wing". 
if (isEssentiallyExtractSubvector(LHS)) { RHS = tryExtendDUPToExtractHigh(RHS, DAG); if (!RHS.getNode()) return SDValue(); } else if (isEssentiallyExtractSubvector(RHS)) { LHS = tryExtendDUPToExtractHigh(LHS, DAG); if (!LHS.getNode()) return SDValue(); } return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), N->getOperand(0), LHS, RHS); } static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { MVT ElemTy = N->getSimpleValueType(0).getScalarType(); unsigned ElemBits = ElemTy.getSizeInBits(); int64_t ShiftAmount; if (BuildVectorSDNode *BVN = dyn_cast(N->getOperand(2))) { APInt SplatValue, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs, ElemBits) || SplatBitSize != ElemBits) return SDValue(); ShiftAmount = SplatValue.getSExtValue(); } else if (ConstantSDNode *CVN = dyn_cast(N->getOperand(2))) { ShiftAmount = CVN->getSExtValue(); } else return SDValue(); unsigned Opcode; bool IsRightShift; switch (IID) { default: llvm_unreachable("Unknown shift intrinsic"); case Intrinsic::aarch64_neon_sqshl: Opcode = AArch64ISD::SQSHL_I; IsRightShift = false; break; case Intrinsic::aarch64_neon_uqshl: Opcode = AArch64ISD::UQSHL_I; IsRightShift = false; break; case Intrinsic::aarch64_neon_srshl: Opcode = AArch64ISD::SRSHR_I; IsRightShift = true; break; case Intrinsic::aarch64_neon_urshl: Opcode = AArch64ISD::URSHR_I; IsRightShift = true; break; case Intrinsic::aarch64_neon_sqshlu: Opcode = AArch64ISD::SQSHLU_I; IsRightShift = false; break; } if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), DAG.getConstant(-ShiftAmount, MVT::i32)); else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), DAG.getConstant(ShiftAmount, MVT::i32)); return SDValue(); } // The CRC32[BH] instructions ignore the high bits of their data operand. Since // the intrinsics must be legal and take an i32, this means there's almost // certainly going to be a zext in the DAG which we can eliminate. 
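// e.g. (illustrative) crc32b reads only the low 8 bits of its data operand,
// so (crc32b acc, (and x, 0xff)) can be simplified to (crc32b acc, x).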
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { SDValue AndN = N->getOperand(2); if (AndN.getOpcode() != ISD::AND) return SDValue(); ConstantSDNode *CMask = dyn_cast(AndN.getOperand(1)); if (!CMask || CMask->getZExtValue() != Mask) return SDValue(); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); } static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; unsigned IID = getIntrinsicID(N); switch (IID) { default: break; case Intrinsic::aarch64_neon_vcvtfxs2fp: case Intrinsic::aarch64_neon_vcvtfxu2fp: return tryCombineFixedPointConvert(N, DCI, DAG); break; case Intrinsic::aarch64_neon_fmax: return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_fmin: return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), N->getOperand(1), N->getOperand(2)); case Intrinsic::aarch64_neon_smull: case Intrinsic::aarch64_neon_umull: case Intrinsic::aarch64_neon_pmull: case Intrinsic::aarch64_neon_sqdmull: return tryCombineLongOpWithDup(IID, N, DCI, DAG); case Intrinsic::aarch64_neon_sqshl: case Intrinsic::aarch64_neon_uqshl: case Intrinsic::aarch64_neon_sqshlu: case Intrinsic::aarch64_neon_srshl: case Intrinsic::aarch64_neon_urshl: return tryCombineShiftImm(IID, N, DAG); case Intrinsic::aarch64_crc32b: case Intrinsic::aarch64_crc32cb: return tryCombineCRC32(0xff, N, DAG); case Intrinsic::aarch64_crc32h: case Intrinsic::aarch64_crc32ch: return tryCombineCRC32(0xffff, N, DAG); } return SDValue(); } static SDValue performExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then // we can convert that DUP into another extract_high (of a bigger DUP), which // helps the backend to decide that an sabdl2 would be useful, saving a real // extract_high operation. if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { SDNode *ABDNode = N->getOperand(0).getNode(); unsigned IID = getIntrinsicID(ABDNode); if (IID == Intrinsic::aarch64_neon_sabd || IID == Intrinsic::aarch64_neon_uabd) { SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); if (!NewABD.getNode()) return SDValue(); return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD); } } // This is effectively a custom type legalization for AArch64. // // Type legalization will split an extend of a small, legal, type to a larger // illegal type by first splitting the destination type, often creating // illegal source types, which then get legalized in isel-confusing ways, // leading to really terrible codegen. E.g., // %result = v8i32 sext v8i8 %value // becomes // %losrc = extract_subreg %value, ... // %hisrc = extract_subreg %value, ... // %lo = v4i32 sext v4i8 %losrc // %hi = v4i32 sext v4i8 %hisrc // Things go rapidly downhill from there. // // For AArch64, the [sz]ext vector instructions can only go up one element // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 // take two instructions. // // This implies that the most efficient way to do the extend from v8i8 // to two v4i32 values is to first extend the v8i8 to v8i16, then do // the normal splitting to happen for the v8i16->v8i32. 
// This is pre-legalization to catch some cases where the default // type legalization will create ill-tempered code. if (!DCI.isBeforeLegalizeOps()) return SDValue(); // We're only interested in cleaning things up for non-legal vector types // here. If both the source and destination are legal, things will just // work naturally without any fiddling. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); EVT ResVT = N->getValueType(0); if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) return SDValue(); // If the vector type isn't a simple VT, it's beyond the scope of what // we're worried about here. Let legalization do its thing and hope for // the best. SDValue Src = N->getOperand(0); EVT SrcVT = Src->getValueType(0); if (!ResVT.isSimple() || !SrcVT.isSimple()) return SDValue(); // If the source VT is a 64-bit vector, we can play games and get the // better results we want. if (SrcVT.getSizeInBits() != 64) return SDValue(); unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); unsigned ElementCount = SrcVT.getVectorNumElements(); SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); SDLoc DL(N); Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); // Now split the rest of the operation into two halves, each with a 64 // bit source. EVT LoVT, HiVT; SDValue Lo, Hi; unsigned NumElements = ResVT.getVectorNumElements(); assert(!(NumElements & 1) && "Splitting vector, but not in half!"); LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), NumElements / 2); EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), LoVT.getVectorNumElements()); Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, DAG.getConstant(0, MVT::i64)); Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64)); Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); // Now combine the parts back together so we still have a single result // like the combiner expects. return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); } /// Replace a splat of a scalar to a vector store by scalar stores of the scalar /// value. The load store optimizer pass will merge them to store pair stores. /// This has better performance than a splat of the scalar followed by a split /// vector store. Even if the stores are not merged it is four stores vs a dup, /// followed by an ext.b and two stores. static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { SDValue StVal = St->getValue(); EVT VT = StVal.getValueType(); // Don't replace floating point stores, they possibly won't be transformed to // stp because of the store pair suppress pass. if (VT.isFloatingPoint()) return SDValue(); // Check for insert vector elements. if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) return SDValue(); // We can express a splat as store pair(s) for 2 or 4 elements. unsigned NumVecElts = VT.getVectorNumElements(); if (NumVecElts != 4 && NumVecElts != 2) return SDValue(); SDValue SplatVal = StVal.getOperand(1); unsigned RemainInsertElts = NumVecElts - 1; // Check that this is a splat. while (--RemainInsertElts) { SDValue NextInsertElt = StVal.getOperand(0); if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) return SDValue(); if (NextInsertElt.getOperand(1) != SplatVal) return SDValue(); StVal = NextInsertElt; } unsigned OrigAlignment = St->getAlignment(); unsigned EltOffset = NumVecElts == 4 ? 
4 : 8; unsigned Alignment = std::min(OrigAlignment, EltOffset); // Create scalar stores. This is at least as good as the code sequence for a // split unaligned store which is a dup.s, ext.b, and two stores. // Most of the time the three stores should be replaced by store pair // instructions (stp). SDLoc DL(St); SDValue BasePtr = St->getBasePtr(); SDValue NewST1 = DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), St->getAlignment()); unsigned Offset = EltOffset; while (--NumVecElts) { SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(Offset, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), Alignment); Offset += EltOffset; } return NewST1; } static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, const AArch64Subtarget *Subtarget) { if (!DCI.isBeforeLegalize()) return SDValue(); StoreSDNode *S = cast<StoreSDNode>(N); if (S->isVolatile()) return SDValue(); // Cyclone has bad performance on unaligned 16B stores when crossing line and // page boundaries. We want to split such stores. if (!Subtarget->isCyclone()) return SDValue(); // Don't split at Oz. MachineFunction &MF = DAG.getMachineFunction(); bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( AttributeSet::FunctionIndex, Attribute::MinSize); if (IsMinSize) return SDValue(); SDValue StVal = S->getValue(); EVT VT = StVal.getValueType(); // Don't split v2i64 vectors. Memcpy lowering produces those and splitting // them up regresses performance on micro-benchmarks and olden/bh. if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) return SDValue(); // Split unaligned 16B stores. They are terrible for performance. // Don't split stores with alignment of 1 or 2. Code that uses clang vector // extensions can use this to mark that it does not want splitting to happen // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of // eliminating alignment hazards is only 1 in 8 for alignment of 2. if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || S->getAlignment() <= 2) return SDValue(); // If we get a splat of a scalar, convert this vector store to a store of // scalars. They will be merged into store pairs, thereby removing two // instructions. SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); if (ReplacedSplat != SDValue()) return ReplacedSplat; SDLoc DL(S); unsigned NumElts = VT.getVectorNumElements() / 2; // Split VT into two. EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(0, MVT::i64)); SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, DAG.getConstant(NumElts, MVT::i64)); SDValue BasePtr = S->getBasePtr(); SDValue NewST1 = DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), S->getAlignment()); SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, DAG.getConstant(8, MVT::i64)); return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), S->getAlignment()); } /// Target-specific DAG combine function for post-increment LD1 (lane) and /// post-increment LD1R.
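As a sanity check on the replaceSplatVectorStore rewrite above, this standalone sketch (plain C++, illustrative names) shows that storing the scalar once per element at EltOffset strides produces exactly the memory image of the original vector splat store; the consecutive scalar stores are what the load/store optimizer can later fuse into stp pairs.

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  const uint32_t Splat = 0xdeadbeefu;

  // Vector form: one 16-byte store of a v4i32 splat.
  uint32_t Vec[4] = {Splat, Splat, Splat, Splat};
  uint8_t A[16], B[16];
  std::memcpy(A, Vec, sizeof(Vec));

  // Scalar form produced by the rewrite: base, base+4, base+8, base+12.
  for (int I = 0; I < 4; ++I)
    std::memcpy(B + 4 * I, &Splat, sizeof(Splat));

  assert(std::memcmp(A, B, sizeof(A)) == 0); // identical memory image
  return 0;
}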
static SDValue performPostLD1Combine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, bool IsLaneOp) { if (DCI.isBeforeLegalizeOps()) return SDValue(); SelectionDAG &DAG = DCI.DAG; EVT VT = N->getValueType(0); unsigned LoadIdx = IsLaneOp ? 1 : 0; SDNode *LD = N->getOperand(LoadIdx).getNode(); // If it is not a LOAD, we cannot do this combine. if (LD->getOpcode() != ISD::LOAD) return SDValue(); LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); EVT MemVT = LoadSDN->getMemoryVT(); // Check if the memory operand is the same type as the vector element. if (MemVT != VT.getVectorElementType()) return SDValue(); // Check if there are other uses. If so, do not combine as it will introduce // an extra load. for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; ++UI) { if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. continue; if (*UI != N) return SDValue(); } SDValue Addr = LD->getOperand(1); SDValue Vector = N->getOperand(0); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // Check that the add is independent of the load. Otherwise, folding it // would create a cycle. if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) continue; // Also check that the add is not used in the vector operand. This would // also create a cycle. if (User->isPredecessorOf(Vector.getNode())) continue; // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { uint32_t IncVal = CInc->getZExtValue(); unsigned NumBytes = VT.getScalarSizeInBits() / 8; if (IncVal != NumBytes) continue; Inc = DAG.getRegister(AArch64::XZR, MVT::i64); } SmallVector<SDValue, 8> Ops; Ops.push_back(LD->getOperand(0)); // Chain if (IsLaneOp) { Ops.push_back(Vector); // The vector to be inserted Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector } Ops.push_back(Addr); Ops.push_back(Inc); EVT Tys[3] = { VT, MVT::i64, MVT::Other }; SDVTList SDTys = DAG.getVTList(Tys); unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, MemVT, LoadSDN->getMemOperand()); // Update the uses. std::vector<SDValue> NewResults; NewResults.push_back(SDValue(LD, 0)); // The result of the load NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain DCI.CombineTo(LD, NewResults); DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted result DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write-back register break; } return SDValue(); } /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. static SDValue performNEONPostLDSTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) return SDValue(); unsigned AddrOpIdx = N->getNumOperands() - 1; SDValue Addr = N->getOperand(AddrOpIdx); // Search for a use of the address operand that is an increment. for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = Addr.getNode()->use_end(); UI != UE; ++UI) { SDNode *User = *UI; if (User->getOpcode() != ISD::ADD || UI.getUse().getResNo() != Addr.getResNo()) continue; // Check that the add is independent of the load/store. Otherwise, folding // it would create a cycle.
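Both performPostLD1Combine above and the NEON variant being defined here accept a constant increment only when it equals the number of bytes the instruction actually transfers, because that is the only immediate the post-indexed encodings provide. A standalone sketch of that rule follows (illustrative helper, not LLVM code; the mnemonics in the comments are hand-written examples):

#include <cassert>

// Bytes moved by one post-indexed NEON access: NumVecs whole registers for
// the ld2/st4-style forms, reduced to one element per register for the
// lane/dup variants.
static unsigned foldableIncrement(unsigned NumVecs, unsigned VecBits,
                                  unsigned NumElts, bool LaneOrDup) {
  unsigned NumBytes = NumVecs * VecBits / 8;
  if (LaneOrDup)
    NumBytes /= NumElts;
  return NumBytes;
}

int main() {
  assert(foldableIncrement(1, 128, 4, true) == 4);   // ld1 {v0.s}[1], [x0], #4
  assert(foldableIncrement(2, 128, 4, false) == 32); // ld2 {v0.4s,v1.4s}, [x0], #32
  assert(foldableIncrement(2, 128, 4, true) == 8);   // ld2 {v0.s,v1.s}[1], [x0], #8
  return 0;
}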
if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) continue; // Find the new opcode for the updating load/store. bool IsStore = false; bool IsLaneOp = false; bool IsDupOp = false; unsigned NewOpc = 0; unsigned NumVecs = 0; unsigned IntNo = cast(N->getOperand(1))->getZExtValue(); switch (IntNo) { default: llvm_unreachable("unexpected intrinsic for Neon base update"); case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; NumVecs = 2; break; case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; NumVecs = 3; break; case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; NumVecs = 4; break; case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; NumVecs = 2; IsStore = true; break; case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; NumVecs = 3; IsStore = true; break; case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; NumVecs = 4; IsStore = true; break; case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; NumVecs = 2; break; case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; NumVecs = 3; break; case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; NumVecs = 4; break; case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; NumVecs = 2; IsStore = true; break; case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; NumVecs = 3; IsStore = true; break; case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; NumVecs = 4; IsStore = true; break; case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; NumVecs = 2; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; NumVecs = 3; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; NumVecs = 4; IsDupOp = true; break; case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; NumVecs = 2; IsLaneOp = true; break; case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; NumVecs = 3; IsLaneOp = true; break; case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; NumVecs = 4; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; NumVecs = 2; IsStore = true; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; NumVecs = 3; IsStore = true; IsLaneOp = true; break; case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; NumVecs = 4; IsStore = true; IsLaneOp = true; break; } EVT VecTy; if (IsStore) VecTy = N->getOperand(2).getValueType(); else VecTy = N->getValueType(0); // If the increment is a constant, it must match the memory ref size. SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); if (ConstantSDNode *CInc = dyn_cast(Inc.getNode())) { uint32_t IncVal = CInc->getZExtValue(); unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; if (IsLaneOp || IsDupOp) NumBytes /= VecTy.getVectorNumElements(); if (IncVal != NumBytes) continue; Inc = DAG.getRegister(AArch64::XZR, MVT::i64); } SmallVector Ops; Ops.push_back(N->getOperand(0)); // Incoming chain // Load lane and store have vector list as input. if (IsLaneOp || IsStore) for (unsigned i = 2; i < AddrOpIdx; ++i) Ops.push_back(N->getOperand(i)); Ops.push_back(Addr); // Base register Ops.push_back(Inc); // Return Types. EVT Tys[6]; unsigned NumResultVecs = (IsStore ? 
0 : NumVecs); unsigned n; for (n = 0; n < NumResultVecs; ++n) Tys[n] = VecTy; Tys[n++] = MVT::i64; // Type of write back register Tys[n] = MVT::Other; // Type of the chain SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); MemIntrinsicSDNode *MemInt = cast(N); SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, MemInt->getMemoryVT(), MemInt->getMemOperand()); // Update the uses. std::vector NewResults; for (unsigned i = 0; i < NumResultVecs; ++i) { NewResults.push_back(SDValue(UpdN.getNode(), i)); } NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); DCI.CombineTo(N, NewResults); DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); break; } return SDValue(); } // Checks to see if the value is the prescribed width and returns information // about its extension mode. static bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { ExtType = ISD::NON_EXTLOAD; switch(V.getNode()->getOpcode()) { default: return false; case ISD::LOAD: { LoadSDNode *LoadNode = cast(V.getNode()); if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { ExtType = LoadNode->getExtensionType(); return true; } return false; } case ISD::AssertSext: { VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); if ((TypeNode->getVT() == MVT::i8 && width == 8) || (TypeNode->getVT() == MVT::i16 && width == 16)) { ExtType = ISD::SEXTLOAD; return true; } return false; } case ISD::AssertZext: { VTSDNode *TypeNode = cast(V.getNode()->getOperand(1)); if ((TypeNode->getVT() == MVT::i8 && width == 8) || (TypeNode->getVT() == MVT::i16 && width == 16)) { ExtType = ISD::ZEXTLOAD; return true; } return false; } case ISD::Constant: case ISD::TargetConstant: { if (std::abs(cast(V.getNode())->getSExtValue()) < 1LL << (width - 1)) return true; return false; } } return true; } // This function does a whole lot of voodoo to determine if the tests are // equivalent without and with a mask. Essentially what happens is that given a // DAG resembling: // // +-------------+ +-------------+ +-------------+ +-------------+ // | Input | | AddConstant | | CompConstant| | CC | // +-------------+ +-------------+ +-------------+ +-------------+ // | | | | // V V | +----------+ // +-------------+ +----+ | | // | ADD | |0xff| | | // +-------------+ +----+ | | // | | | | // V V | | // +-------------+ | | // | AND | | | // +-------------+ | | // | | | // +-----+ | | // | | | // V V V // +-------------+ // | CMP | // +-------------+ // // The AND node may be safely removed for some combinations of inputs. In // particular we need to take into account the extension type of the Input, // the exact values of AddConstant, CompConstant, and CC, along with the nominal // width of the input (this can work for any width inputs, the above graph is // specific to 8 bits. // // The specific equations were worked out by generating output tables for each // AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The // problem was simplified by working with 4 bit inputs, which means we only // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8 // patterns present in both extensions (0,7). 
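The table-generation idea this comment describes can be reproduced with a small standalone harness; here is a minimal reconstruction (not the original harness) covering just the zero-extended EQ row: for AddConstant == 0 the masked and unmasked comparisons agree on every 4-bit input, which is exactly when the EQ/NE case of isEquivalentMaskless below reports the AND as removable.

#include <cassert>

int main() {
  const int Width = 4;
  const int MaxUInt = 1 << Width;
  for (int Add = -MaxUInt / 2; Add < MaxUInt / 2; ++Add)
    for (int Comp = -MaxUInt / 2; Comp < MaxUInt / 2; ++Comp) {
      bool Equivalent = true;
      for (int X = 0; X < MaxUInt; ++X) { // zero-extended inputs: 0..15
        bool Masked = ((X + Add) & (MaxUInt - 1)) == Comp;
        bool Unmasked = (X + Add) == Comp;
        if (Masked != Unmasked)
          Equivalent = false;
      }
      // One row of the table: with AddConstant == 0 the mask never changes
      // the EQ result, matching the EQ/NE clause of isEquivalentMaskless.
      if (Add == 0)
        assert(Equivalent);
    }
  return 0;
}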
For every distinct set of // AddConstant and CompConstant bit patterns we can consider the masked and // unmasked versions to be equivalent if the result of this function is true for // all 16 distinct bit patterns for the current extension type of Input (w0). // // sub w8, w0, w1 // and w10, w8, #0x0f // cmp w8, w2 // cset w9, AArch64CC // cmp w10, w2 // cset w11, AArch64CC // cmp w9, w11 // cset w0, eq // ret // // Since the above function shows when the outputs are equivalent it defines // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and // would be expensive to run during compiles. The equations below were written // in a test harness that confirmed they gave equivalent outputs to the above // function for all inputs, so they can be used to determine if the removal is // legal instead. // // isEquivalentMaskless() is the code for testing if the AND can be removed, // factored out of the DAG recognition as the DAG can take several forms. static bool isEquivalentMaskless(unsigned CC, unsigned width, ISD::LoadExtType ExtType, signed AddConstant, signed CompConstant) { // By being careful about our equations and only writing them in terms of // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can // make them generally applicable to all bit widths. signed MaxUInt = (1 << width); // For the purposes of these comparisons sign extending the type is // equivalent to zero extending the add and displacing it by half the integer // width. Provided we are careful and make sure our equations are valid over // the whole range we can just adjust the input and avoid writing equations // for sign extended inputs. if (ExtType == ISD::SEXTLOAD) AddConstant -= (1 << (width-1)); switch (CC) { case AArch64CC::LE: case AArch64CC::GT: { if ((AddConstant == 0) || (CompConstant == MaxUInt - 1 && AddConstant < 0) || (AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) return true; } break; case AArch64CC::LT: case AArch64CC::GE: { if ((AddConstant == 0) || (AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) return true; } break; case AArch64CC::HI: case AArch64CC::LS: { if ((AddConstant >= 0 && CompConstant < 0) || (AddConstant <= 0 && CompConstant >= -1 && CompConstant < AddConstant + MaxUInt)) return true; } break; case AArch64CC::PL: case AArch64CC::MI: { if ((AddConstant == 0) || (AddConstant > 0 && CompConstant <= 0) || (AddConstant < 0 && CompConstant <= AddConstant)) return true; } break; case AArch64CC::LO: case AArch64CC::HS: { if ((AddConstant >= 0 && CompConstant <= 0) || (AddConstant <= 0 && CompConstant >= 0 && CompConstant <= AddConstant + MaxUInt)) return true; } break; case AArch64CC::EQ: case AArch64CC::NE: { if ((AddConstant > 0 && CompConstant < 0) || (AddConstant < 0 && CompConstant >= 0 && CompConstant < AddConstant + MaxUInt) || (AddConstant >= 0 && CompConstant >= 0 && CompConstant >= AddConstant) || (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) return true; } break; case AArch64CC::VS: case AArch64CC::VC: case AArch64CC::AL: case AArch64CC::NV: return true; case AArch64CC::Invalid: break; } return false; } static SDValue performCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, unsigned CCIndex, unsigned CmpIndex) { unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); unsigned CondOpcode = SubsNode->getOpcode(); if
(CondOpcode != AArch64ISD::SUBS) return SDValue(); // There is a SUBS feeding this condition. Is it fed by a mask we can // use? SDNode *AndNode = SubsNode->getOperand(0).getNode(); unsigned MaskBits = 0; if (AndNode->getOpcode() != ISD::AND) return SDValue(); if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { uint32_t CNV = CN->getZExtValue(); if (CNV == 255) MaskBits = 8; else if (CNV == 65535) MaskBits = 16; } if (!MaskBits) return SDValue(); SDValue AddValue = AndNode->getOperand(0); if (AddValue.getOpcode() != ISD::ADD) return SDValue(); // The basic DAG structure is correct; grab the inputs and validate them. SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); SDValue SubsInputValue = SubsNode->getOperand(1); // The mask is present and the provenance of all the values is a smaller type; // let's see if the mask is superfluous. if (!isa<ConstantSDNode>(AddInputValue2.getNode()) || !isa<ConstantSDNode>(SubsInputValue.getNode())) return SDValue(); ISD::LoadExtType ExtType; if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || !checkValueWidth(AddInputValue2, MaskBits, ExtType) || !checkValueWidth(AddInputValue1, MaskBits, ExtType)) return SDValue(); if (!isEquivalentMaskless(CC, MaskBits, ExtType, cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(), cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue())) return SDValue(); // The AND is not necessary, remove it. SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), SubsNode->getValueType(1)); SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); return SDValue(N, 0); } // Optimize compare with zero and branch. static SDValue performBRCONDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3); if (NV.getNode()) N = NV.getNode(); SDValue Chain = N->getOperand(0); SDValue Dest = N->getOperand(1); SDValue CCVal = N->getOperand(2); SDValue Cmp = N->getOperand(3); assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); if (CC != AArch64CC::EQ && CC != AArch64CC::NE) return SDValue(); unsigned CmpOpc = Cmp.getOpcode(); if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) return SDValue(); // Only attempt folding if there is only one use of the flag and no use of the // value. if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) return SDValue(); SDValue LHS = Cmp.getOperand(0); SDValue RHS = Cmp.getOperand(1); assert(LHS.getValueType() == RHS.getValueType() && "Expected the value type to be the same for both operands!"); if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) return SDValue(); if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue()) std::swap(LHS, RHS); if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue()) return SDValue(); if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || LHS.getOpcode() == ISD::SRL) return SDValue(); // Fold the compare into the branch instruction. SDValue BR; if (CC == AArch64CC::EQ) BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); else BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); // Do not add new nodes to DAG combiner worklist.
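Stated as plain semantics, the fold above rewrites "compare LHS against zero, then branch on EQ/NE" into a single cbz/cbnz, which tests exactly the same predicate without consuming NZCV. A trivial standalone sketch (illustrative names; the mnemonics in the comments are hand-written):

#include <cassert>
#include <cstdint>

static bool viaFlags(uint32_t X) { // subs wzr, w0, #0; b.eq L
  uint32_t Res = X - 0;            // the only flag consumed is Z
  return Res == 0;
}
static bool viaCbz(uint32_t X) { return X == 0; } // cbz w0, L

int main() {
  for (uint32_t X : {0u, 1u, 0x80000000u, ~0u})
    assert(viaFlags(X) == viaCbz(X));
  return 0;
}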
DCI.CombineTo(N, BR, false); return SDValue(); } // vselect (v1i1 setcc) -> // vselect (v1iXX setcc) (XX is the size of the compared operand type) // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine // such VSELECT. static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); EVT CCVT = N0.getValueType(); if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || CCVT.getVectorElementType() != MVT::i1) return SDValue(); EVT ResVT = N->getValueType(0); EVT CmpVT = N0.getOperand(0).getValueType(); // Only combine when the result type is of the same size as the compared // operands. if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) return SDValue(); SDValue IfTrue = N->getOperand(1); SDValue IfFalse = N->getOperand(2); SDValue SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), N0.getOperand(0), N0.getOperand(1), cast(N0.getOperand(2))->get()); return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, IfTrue, IfFalse); } /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with /// the compare-mask instructions rather than going via NZCV, even if LHS and /// RHS are really scalar. This replaces any scalar setcc in the above pattern /// with a vector one followed by a DUP shuffle on the result. static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); EVT ResVT = N->getValueType(0); if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) return SDValue(); // If NumMaskElts == 0, the comparison is larger than select result. The // largest real NEON comparison is 64-bits per lane, which means the result is // at most 32-bits and an illegal vector. Just bail out for now. EVT SrcVT = N0.getOperand(0).getValueType(); // Don't try to do this optimization when the setcc itself has i1 operands. // There are no legal vectors of i1, so this would be pointless. if (SrcVT == MVT::i1) return SDValue(); int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); if (!ResVT.isVector() || NumMaskElts == 0) return SDValue(); SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); // First perform a vector comparison, where lane 0 is the one we're interested // in. SDLoc DL(N0); SDValue LHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); SDValue RHS = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); // Now duplicate the comparison mask we want across all other lanes. 
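An all-zeros shuffle mask is how that duplication is expressed: every output lane selects input lane 0, so the scalar comparison result ends up replicated across the whole vector. A standalone sketch of the same idea (plain C++, illustrative names):

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  // Lane 0 holds the scalar setcc result (an all-ones mask); the other
  // lanes are whatever the vector compare produced for the padding lanes.
  std::array<uint32_t, 4> SetCC = {0xffffffffu, 0, 0, 0};
  std::array<int, 4> DUPMask = {0, 0, 0, 0}; // every lane reads input lane 0

  std::array<uint32_t, 4> Mask;
  for (int I = 0; I < 4; ++I)
    Mask[I] = SetCC[DUPMask[I]];

  for (uint32_t Lane : Mask)
    assert(Lane == SetCC[0]); // the compare result is now in every lane
  return 0;
}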
SmallVector DUPMask(CCVT.getVectorNumElements(), 0); SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(), Mask); return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); } SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; switch (N->getOpcode()) { default: break; case ISD::ADD: case ISD::SUB: return performAddSubLongCombine(N, DCI, DAG); case ISD::XOR: return performXorCombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMulCombine(N, DAG, DCI, Subtarget); case ISD::SINT_TO_FP: case ISD::UINT_TO_FP: return performIntToFpCombine(N, DAG, Subtarget); case ISD::OR: return performORCombine(N, DCI, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return performIntrinsicCombine(N, DCI, Subtarget); case ISD::ANY_EXTEND: case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: return performExtendCombine(N, DCI, DAG); case ISD::BITCAST: return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::SELECT: return performSelectCombine(N, DAG); case ISD::VSELECT: return performVSelectCombine(N, DCI.DAG); case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); case AArch64ISD::CSEL: return performCONDCombine(N, DCI, DAG, 2, 3); case AArch64ISD::DUP: return performPostLD1Combine(N, DCI, false); case ISD::INSERT_VECTOR_ELT: return performPostLD1Combine(N, DCI, true); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: case Intrinsic::aarch64_neon_ld4: case Intrinsic::aarch64_neon_ld1x2: case Intrinsic::aarch64_neon_ld1x3: case Intrinsic::aarch64_neon_ld1x4: case Intrinsic::aarch64_neon_ld2lane: case Intrinsic::aarch64_neon_ld3lane: case Intrinsic::aarch64_neon_ld4lane: case Intrinsic::aarch64_neon_ld2r: case Intrinsic::aarch64_neon_ld3r: case Intrinsic::aarch64_neon_ld4r: case Intrinsic::aarch64_neon_st2: case Intrinsic::aarch64_neon_st3: case Intrinsic::aarch64_neon_st4: case Intrinsic::aarch64_neon_st1x2: case Intrinsic::aarch64_neon_st1x3: case Intrinsic::aarch64_neon_st1x4: case Intrinsic::aarch64_neon_st2lane: case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: return performNEONPostLDSTCombine(N, DCI, DAG); default: break; } } return SDValue(); } // Check if the return value is used as only a return value, as otherwise // we can't perform a tail-call. In particular, we need to check for // target ISD nodes that are returns and any other "odd" constructs // that the generic analysis code won't necessarily catch. bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. 
if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode *Node : Copy->uses()) { if (Node->getOpcode() != AArch64ISD::RET_FLAG) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } // Return whether the an instruction can potentially be optimized to a tail // call. This will cause the optimizers to attempt to move, or duplicate, // return instructions to help enable tail call optimizations for this // instruction. bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!CI->isTailCall()) return false; return true; } bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const { if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB) return false; Base = Op->getOperand(0); // All of the indexed addressing mode instructions take a signed // 9 bit immediate offset. if (ConstantSDNode *RHS = dyn_cast(Op->getOperand(1))) { int64_t RHSC = (int64_t)RHS->getZExtValue(); if (RHSC >= 256 || RHSC <= -256) return false; IsInc = (Op->getOpcode() == ISD::ADD); Offset = Op->getOperand(1); return true; } return false; } bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); } else return false; bool IsInc; if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG)) return false; AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC; return true; } bool AArch64TargetLowering::getPostIndexedAddressParts( SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const { EVT VT; SDValue Ptr; if (LoadSDNode *LD = dyn_cast(N)) { VT = LD->getMemoryVT(); Ptr = LD->getBasePtr(); } else if (StoreSDNode *ST = dyn_cast(N)) { VT = ST->getMemoryVT(); Ptr = ST->getBasePtr(); } else return false; bool IsInc; if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG)) return false; // Post-indexing updates the base, so it's not a valid transform // if that's not the same as the load's pointer. if (Ptr != Base) return false; AM = IsInc ? ISD::POST_INC : ISD::POST_DEC; return true; } static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) { SDLoc DL(N); SDValue Op = N->getOperand(0); if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16) return; Op = SDValue( DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32, DAG.getUNDEF(MVT::i32), Op, DAG.getTargetConstant(AArch64::hsub, MVT::i32)), 0); Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op); Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op)); } void AArch64TargetLowering::ReplaceNodeResults( SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { switch (N->getOpcode()) { default: llvm_unreachable("Don't know how to custom expand this"); case ISD::BITCAST: ReplaceBITCASTResults(N, Results, DAG); return; case ISD::FP_TO_UINT: case ISD::FP_TO_SINT: assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion"); // Let normal code take care of it by not adding anything to Results. 
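Stepping back to getIndexedAddressParts above: the offset test mirrors the signed 9-bit immediate carried by AArch64 pre/post-indexed loads and stores. A standalone sketch of the accepted range (illustrative helper; the commented mnemonics are hand-written):

#include <cassert>
#include <cstdint>

// Matches the "RHSC >= 256 || RHSC <= -256 -> reject" test above: only
// offsets strictly inside (-256, 256) become an indexed addressing mode.
static bool fitsIndexedImm(int64_t Off) { return Off > -256 && Off < 256; }

int main() {
  assert(fitsIndexedImm(255));   // ldr x0, [x1], #255 is encodable
  assert(!fitsIndexedImm(256));  // out of range: the add stays separate
  assert(!fitsIndexedImm(-256)); // also rejected, even though a signed
                                 // 9-bit field could encode -256
  return 0;
}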
return; } } bool AArch64TargetLowering::useLoadStackGuardNode() const { return true; } bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const { // Combine multiple FDIVs with the same divisor into multiple FMULs by the // reciprocal if there are three or more FDIVs. return NumUsers > 2; } TargetLoweringBase::LegalizeTypeAction AArch64TargetLowering::getPreferredVectorAction(EVT VT) const { MVT SVT = VT.getSimpleVT(); // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8, // v4i16, v2i32 instead of to promote. if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 || SVT == MVT::v1f32) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits(); return Size == 128; } // Loads and stores less than 128-bits are already atomic; ones above that // are doomed anyway, so defer to the default libcall and blame the OS when // things go wrong. bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { unsigned Size = LI->getType()->getPrimitiveSizeInBits(); return Size == 128; } // For the real atomic operations, we have ldxr/stxr up to 128 bits, bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); return Size <= 128; } bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const { return true; } Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = cast(Addr->getType())->getElementType(); bool IsAcquire = isAtLeastAcquire(Ord); // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd // intrinsic must return {i64, i64} and we have to recombine them into a // single i128 here. if (ValTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); Lo = Builder.CreateZExt(Lo, ValTy, "lo64"); Hi = Builder.CreateZExt(Hi, ValTy, "hi64"); return Builder.CreateOr( Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64"); } Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateTruncOrBitCast( Builder.CreateCall(Ldxr, Addr), cast(Addr->getType())->getElementType()); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); bool IsRelease = isAtLeastRelease(Ord); // Since the intrinsics must have legal type, the i128 intrinsics take two // parameters: "i64, i64". We must marshal Val into the appropriate form // before the call. if (Val->getType()->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsRelease ? 
Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; Function *Stxr = Intrinsic::getDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext())); return Builder.CreateCall3(Stxr, Lo, Hi, Addr); } Intrinsic::ID Int = IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; Type *Tys[] = { Addr->getType() }; Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); return Builder.CreateCall2( Stxr, Builder.CreateZExtOrBitCast( Val, Stxr->getFunctionType()->getParamType(0)), Addr); } bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters( Type *Ty, CallingConv::ID CallConv, bool isVarArg) const { return Ty->isArrayTy(); } Index: head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h (revision 280864) +++ head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.h (revision 280865) @@ -1,489 +1,489 @@ //==-- AArch64ISelLowering.h - AArch64 DAG Lowering Interface ----*- C++ -*-==// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that AArch64 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64ISELLOWERING_H #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/IR/CallingConv.h" #include "llvm/Target/TargetLowering.h" namespace llvm { namespace AArch64ISD { enum { FIRST_NUMBER = ISD::BUILTIN_OP_END, WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses. CALL, // Function call. - // Almost the same as a normal call node, except that a TLSDesc relocation is - // needed so the linker can relax it correctly if possible. - TLSDESC_CALL, + // Produces the full sequence of instructions for getting the thread pointer + // offset of a variable into X0, using the TLSDesc model. + TLSDESC_CALLSEQ, ADRP, // Page address of a TargetGlobalAddress operand. ADDlow, // Add the low 12 bits of a TargetGlobalAddress operand. LOADgot, // Load from automatically generated descriptor (e.g. Global // Offset Table, TLS record). RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand. BRCOND, // Conditional branch instruction; "b.cond". CSEL, FCSEL, // Conditional move instruction. CSINV, // Conditional select invert. CSNEG, // Conditional select negate. CSINC, // Conditional select increment. // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on // ELF. THREAD_POINTER, ADC, SBC, // adc, sbc instructions // Arithmetic instructions which write flags. ADDS, SUBS, ADCS, SBCS, ANDS, // Floating point comparison FCMP, // Floating point max and min instructions. 
FMAX, FMIN, // Scalar extract EXTR, // Scalar-to-vector duplication DUP, DUPLANE8, DUPLANE16, DUPLANE32, DUPLANE64, // Vector immedate moves MOVI, MOVIshift, MOVIedit, MOVImsl, FMOV, MVNIshift, MVNImsl, // Vector immediate ops BICi, ORRi, // Vector bit select: similar to ISD::VSELECT but not all bits within an // element must be identical. BSL, // Vector arithmetic negation NEG, // Vector shuffles ZIP1, ZIP2, UZP1, UZP2, TRN1, TRN2, REV16, REV32, REV64, EXT, // Vector shift by scalar VSHL, VLSHR, VASHR, // Vector shift by scalar (again) SQSHL_I, UQSHL_I, SQSHLU_I, SRSHR_I, URSHR_I, // Vector comparisons CMEQ, CMGE, CMGT, CMHI, CMHS, FCMEQ, FCMGE, FCMGT, // Vector zero comparisons CMEQz, CMGEz, CMGTz, CMLEz, CMLTz, FCMEQz, FCMGEz, FCMGTz, FCMLEz, FCMLTz, // Vector bitwise negation NOT, // Vector bitwise selection BIT, // Compare-and-branch CBZ, CBNZ, TBZ, TBNZ, // Tail calls TC_RETURN, // Custom prefetch handling PREFETCH, // {s|u}int to FP within a FP register. SITOF, UITOF, /// Natural vector cast. ISD::BITCAST is not natural in the big-endian /// world w.r.t vectors; which causes additional REV instructions to be /// generated to compensate for the byte-swapping. But sometimes we do /// need to re-interpret the data in SIMD vector registers in big-endian /// mode without emitting such REV instructions. NVCAST, SMULL, UMULL, // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, LD4post, ST2post, ST3post, ST4post, LD1x2post, LD1x3post, LD1x4post, ST1x2post, ST1x3post, ST1x4post, LD1DUPpost, LD2DUPpost, LD3DUPpost, LD4DUPpost, LD1LANEpost, LD2LANEpost, LD3LANEpost, LD4LANEpost, ST2LANEpost, ST3LANEpost, ST4LANEpost }; } // end namespace AArch64ISD class AArch64Subtarget; class AArch64TargetMachine; class AArch64TargetLowering : public TargetLowering { bool RequireStrictAlign; public: explicit AArch64TargetLowering(const TargetMachine &TM); /// Selects the correct CCAssignFn for a given CallingConvention value. CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const; /// computeKnownBitsForTargetNode - Determine which of the bits specified in /// Mask are known to be either zero or one and return them in the /// KnownZero/KnownOne bitsets. void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero, APInt &KnownOne, const SelectionDAG &DAG, unsigned Depth = 0) const override; MVT getScalarShiftAmountTy(EVT LHSTy) const override; /// allowsMisalignedMemoryAccesses - Returns true if the target allows /// unaligned memory accesses. of the specified type. bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, bool *Fast = nullptr) const override { if (RequireStrictAlign) return false; // FIXME: True for Cyclone, but not necessary others. if (Fast) *Fast = true; return true; } /// LowerOperation - Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; const char *getTargetNodeName(unsigned Opcode) const override; SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; /// getFunctionAlignment - Return the Log2 alignment of this function. unsigned getFunctionAlignment(const Function *F) const; /// getMaximalGlobalOffset - Returns the maximal possible offset which can /// be used for loads / stores from the global. unsigned getMaximalGlobalOffset() const override; /// Returns true if a cast between SrcAS and DestAS is a noop. 
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override { // Addrspacecasts are always noops. return true; } /// createFastISel - This method returns a target specific FastISel object, /// or null if the target does not support "fast" ISel. FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const override; bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; bool isFPImmLegal(const APFloat &Imm, EVT VT) const override; /// isShuffleMaskLegal - Return true if the given shuffle mask can be /// codegen'd directly, or if it should be stack expanded. bool isShuffleMaskLegal(const SmallVectorImpl &M, EVT VT) const override; /// getSetCCResultType - Return the ISD::SETCC ValueType EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override; SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const; MachineBasicBlock *EmitF128CSEL(MachineInstr *MI, MachineBasicBlock *BB) const; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const override; bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const override; bool isTruncateFree(Type *Ty1, Type *Ty2) const override; bool isTruncateFree(EVT VT1, EVT VT2) const override; bool isZExtFree(Type *Ty1, Type *Ty2) const override; bool isZExtFree(EVT VT1, EVT VT2) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; bool hasPairedLoad(Type *LoadedType, unsigned &RequiredAligment) const override; bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalICmpImmediate(int64_t) const override; EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const override; /// isLegalAddressingMode - Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override; /// \brief Return the cost of the scaling factor used in the addressing /// mode represented by AM for this target, for a load/store /// of the specified type. /// If the AM is supported, the return value must be >= 0. /// If the AM is not supported, it returns a negative value. int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override; /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be /// expanded to FMAs when this method returns true, otherwise fmuladd is /// expanded to fmul + fadd. bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask. bool isDesirableToCommuteWithShift(const SDNode *N) const override; /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. 
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; bool hasLoadLinkedStoreConditional() const override; Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord) const override; Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override; bool shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; bool shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; bool useLoadStackGuardNode() const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(EVT VT) const override; private: /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can /// make the right decision when generating code for different targets. const AArch64Subtarget *Subtarget; void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT); void addDRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT); SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, SmallVectorImpl &InVals) const override; SDValue LowerCall(CallLoweringInfo & /*CLI*/, SmallVectorImpl &InVals) const override; SDValue LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, SDLoc DL, SelectionDAG &DAG, SmallVectorImpl &InVals, bool isThisReturn, SDValue ThisVal) const; bool isEligibleForTailCallOptimization( SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, const SmallVectorImpl &Ins, SelectionDAG &DAG) const; /// Finds the incoming stack arguments which overlap the given fixed stack /// object and incorporates their load into the current chain. This prevents /// an upcoming store from clobbering the stack argument before it's used. 
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo *MFI, int ClobberedFI) const; bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const; bool IsTailCallConvention(CallingConv::ID CallCC) const; void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL, SDValue &Chain) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl &Outs, LLVMContext &Context) const override; SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Outs, const SmallVectorImpl &OutVals, SDLoc DL, SelectionDAG &DAG) const override; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL, - SelectionDAG &DAG) const; + SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, SDLoc DL, + SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG, RTLIB::Libcall Call) const; SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG, std::vector *Created) const override; bool combineRepeatedFPDivisors(unsigned NumUsers) const override; ConstraintType 
getConstraintType(const std::string &Constraint) const override; unsigned getRegisterByName(const char* RegName, EVT VT) const override; /// Examine constraint string and operand type and determine a weight value. /// The operand object must already have been set up with the operand type. ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override; std::pair getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const override; void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint, std::vector &Ops, SelectionDAG &DAG) const override; bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(CallInst *CI) const override; bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, bool &IsInc, SelectionDAG &DAG) const; bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override; void ReplaceNodeResults(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const override; bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override; }; namespace AArch64 { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); } // end namespace AArch64 } // end namespace llvm #endif Index: head/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- head/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td (revision 280864) +++ head/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td (revision 280865) @@ -1,5682 +1,5697 @@ //=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // AArch64 Instruction definitions. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // ARM Instruction Predicate Definitions. // def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">, AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">; def HasNEON : Predicate<"Subtarget->hasNEON()">, AssemblerPredicate<"FeatureNEON", "neon">; def HasCrypto : Predicate<"Subtarget->hasCrypto()">, AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsCyclone : Predicate<"Subtarget->isCyclone()">; //===----------------------------------------------------------------------===// // AArch64-specific DAG Nodes. 
// // SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0>, SDTCisVT<1, i32>]>; // SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisVT<3, i32>]>; // SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0>, SDTCisVT<1, i32>, SDTCisVT<4, i32>]>; def SDT_AArch64Brcond : SDTypeProfile<0, 3, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>, SDTCisVT<2, i32>]>; def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>; def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>, SDTCisVT<2, OtherVT>]>; def SDT_AArch64CSel : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<3>, SDTCisVT<4, i32>]>; def SDT_AArch64FCmp : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>; def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>]>; def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>; def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>; def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisInt<3>]>; def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisInt<3>]>; def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>; def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>; def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>; def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>; def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>; def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>; def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>; def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; + +// Generates the general dynamic sequences, i.e. +// adrp x0, :tlsdesc:var +// ldr x1, [x0, #:tlsdesc_lo12:var] +// add x0, x0, #:tlsdesc_lo12:var +// .tlsdesccall var +// blr x1 + +// (the TPIDR_EL0 offset is put directly in X0, hence no "result" here) +// number of operands (the variable) +def SDT_AArch64TLSDescCallSeq : SDTypeProfile<0,1, + [SDTCisPtrTy<0>]>; + def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4, [SDTCisVT<0, i64>, SDTCisVT<1, i32>, SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, SDTCisSameAs<1, 4>]>; // Node definitions. 
def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>; def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>; def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", SDCallSeqStart<[ SDTCisVT<0, i32> ]>, [SDNPHasChain, SDNPOutGlue]>; def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; def AArch64call : SDNode<"AArch64ISD::CALL", SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond, [SDNPHasChain]>; def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz, [SDNPHasChain]>; def AArch64cbnz : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz, [SDNPHasChain]>; def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz, [SDNPHasChain]>; def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz, [SDNPHasChain]>; def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>; def AArch64csinv : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>; def AArch64csneg : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>; def AArch64csinc : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>; def AArch64retflag : SDNode<"AArch64ISD::RET_FLAG", SDTNone, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def AArch64adc : SDNode<"AArch64ISD::ADC", SDTBinaryArithWithFlagsIn >; def AArch64sbc : SDNode<"AArch64ISD::SBC", SDTBinaryArithWithFlagsIn>; def AArch64add_flag : SDNode<"AArch64ISD::ADDS", SDTBinaryArithWithFlagsOut, [SDNPCommutative]>; def AArch64sub_flag : SDNode<"AArch64ISD::SUBS", SDTBinaryArithWithFlagsOut>; def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut, [SDNPCommutative]>; def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>; def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>; def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>; def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>; def AArch64fmax : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>; def AArch64fmin : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>; def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>; def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>; def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>; def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>; def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>; def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>; def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>; def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>; def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>; def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>; def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>; def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>; def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>; def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>; def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>; def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>; def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>; def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>; def AArch64rev32 : 
SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>; def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>; def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>; def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>; def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>; def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>; def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>; def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>; def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>; def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>; def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>; def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>; def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>; def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>; def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>; def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>; def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>; def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>; def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>; def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>; def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>; def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>; def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>; def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>; def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>; def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>; def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>; def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS), (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>; def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>; def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>; def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>; def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>; def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>; def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>; def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>; def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>; def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH, [SDNPHasChain, SDNPSideEffect]>; def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>; def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>; -def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL", - SDT_AArch64TLSDescCall, - [SDNPInGlue, SDNPOutGlue, SDNPHasChain, - SDNPVariadic]>; +def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ", + SDT_AArch64TLSDescCallSeq, + [SDNPInGlue, SDNPOutGlue, SDNPHasChain, + SDNPVariadic]>; + def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge", SDT_AArch64WrapperLarge>; def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>; def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>]>; def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>; def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>; //===----------------------------------------------------------------------===// 
//===----------------------------------------------------------------------===// // AArch64 Instruction Predicate Definitions. // def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; def ForCodeSize : Predicate<"ForCodeSize">; def NotForCodeSize : Predicate<"!ForCodeSize">; include "AArch64InstrFormats.td" //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Miscellaneous instructions. //===----------------------------------------------------------------------===// let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), [(AArch64callseq_start timm:$amt)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(AArch64callseq_end timm:$amt1, timm:$amt2)]>; } // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 let isReMaterializable = 1, isCodeGenOnly = 1 in { // FIXME: The following pseudo instructions are only needed because remat // cannot handle multiple instructions. When that changes, they can be // removed, along with the AArch64Wrapper node. let AddedComplexity = 10 in def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr), [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>, Sched<[WriteLDAdr]>; // The MOVaddr instruction should match only when the add is not folded // into a load or store address. def MOVaddr : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi), tglobaladdr:$low))]>, Sched<[WriteAdrAdr]>; def MOVaddrJT : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi), tjumptable:$low))]>, Sched<[WriteAdrAdr]>; def MOVaddrCP : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi), tconstpool:$low))]>, Sched<[WriteAdrAdr]>; def MOVaddrBA : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi), tblockaddress:$low))]>, Sched<[WriteAdrAdr]>; def MOVaddrTLS : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi), tglobaltlsaddr:$low))]>, Sched<[WriteAdrAdr]>; def MOVaddrEXT : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low), [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi), texternalsym:$low))]>, Sched<[WriteAdrAdr]>; } // isReMaterializable, isCodeGenOnly def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr), (LOADgot tglobaltlsaddr:$addr)>; def : Pat<(AArch64LOADgot texternalsym:$addr), (LOADgot texternalsym:$addr)>; def : Pat<(AArch64LOADgot tconstpool:$addr), (LOADgot tconstpool:$addr)>; //===----------------------------------------------------------------------===// // System instructions. //===----------------------------------------------------------------------===// def HINT : HintI<"hint">; def : InstAlias<"nop", (HINT 0b000)>; def : InstAlias<"yield",(HINT 0b001)>; def : InstAlias<"wfe", (HINT 0b010)>; def : InstAlias<"wfi", (HINT 0b011)>; def : InstAlias<"sev", (HINT 0b100)>; def : InstAlias<"sevl", (HINT 0b101)>; // As far as LLVM is concerned this writes to the system's exclusive monitors. 
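// (CLREX clears the local exclusive monitor; marking it mayLoad/mayStore
// below is a conservative way of keeping it ordered with the LDXR/STXR
// sequences it can interact with.)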
let mayLoad = 1, mayStore = 1 in
def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;

// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
// model patterns with sufficiently fine granularity.
let mayLoad = ?, mayStore = ? in {
def DMB : CRmSystemI<barrier_op, 0b101, "dmb">;
def DSB : CRmSystemI<barrier_op, 0b100, "dsb">;
def ISB : CRmSystemI<barrier_op, 0b110, "isb">;
}

def : InstAlias<"clrex", (CLREX 0xf)>;
def : InstAlias<"isb", (ISB 0xf)>;

def MRS : MRSI;
def MSR : MSRI;
def MSRpstate: MSRpstateI;

// The thread pointer (on Linux, at least, where this has been implemented) is
// TPIDR_EL0.
def : Pat<(AArch64threadpointer), (MRS 0xde82)>;

// Generic system instructions
def SYSxt  : SystemXtI<0, "sys">;
def SYSLxt : SystemLXtI<1, "sysl">;

def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
                (SYSxt imm0_7:$op1, sys_cr_op:$Cn,
                 sys_cr_op:$Cm, imm0_7:$op2, XZR)>;

//===----------------------------------------------------------------------===//
// Move immediate instructions.
//===----------------------------------------------------------------------===//

defm MOVK : InsertImmediate<0b11, "movk">;
defm MOVN : MoveImmediate<0b00, "movn">;

let PostEncoderMethod = "fixMOVZ" in
defm MOVZ : MoveImmediate<0b10, "movz">;

// First group of aliases covers an implicit "lsl #0".
def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>;
def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>;
def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;

// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;

def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;

def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>;
def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>;
def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>;
def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>;

def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;

def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;

def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>;
def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>;

// Final group of aliases covers true "mov $Rd, $imm" cases.
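// For example, "mov x0, #0x10000" is accepted here as "movz x0, #1, lsl #16",
// and "mov w0, #-2" as "movn w0, #1".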
multiclass movw_mov_alias<string basename, Instruction INST, RegisterClass GPR,
                          int width, int shift> {
  def _asmoperand : AsmOperandClass {
    let Name = basename # width # "_lsl" # shift # "MovAlias";
    let PredicateMethod = "is" # basename # "MovAlias<" # width # ", "
                               # shift # ">";
    let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">";
  }

  def _movimm : Operand<i32> {
    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_asmoperand");
  }

  def : InstAlias<"mov $Rd, $imm",
                  (INST GPR:$Rd, !cast<Operand>(NAME # "_movimm"):$imm, shift)>;
}

defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>;
defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>;

defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>;
defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>;
defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>;
defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>;

defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>;
defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>;

defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>;
defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>;
defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>;
defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>;

let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1,
    isAsCheapAsAMove = 1 in {
// FIXME: The following pseudo instructions are only needed because remat
// cannot handle multiple instructions. When that changes, we can select
// directly to the real instructions and get rid of these pseudos.

def MOVi32imm
    : Pseudo<(outs GPR32:$dst), (ins i32imm:$src),
             [(set GPR32:$dst, imm:$src)]>,
      Sched<[WriteImm]>;
def MOVi64imm
    : Pseudo<(outs GPR64:$dst), (ins i64imm:$src),
             [(set GPR64:$dst, imm:$src)]>,
      Sched<[WriteImm]>;
} // isReMaterializable, isCodeGenOnly

// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the
// eventual expansion code fewer bits to worry about getting right. Marshalling
// the types is a little tricky though:
def i64imm_32bit : ImmLeaf<i64, [{
  return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
}]>;

def trunc_imm : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i32);
}]>;

def : Pat<(i64 i64imm_32bit:$src),
          (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;

// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).
def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
return CurDAG->getTargetConstant(
  N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i32);
}]>;

def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
return CurDAG->getTargetConstant(
  N->getValueAPF().bitcastToAPInt().getZExtValue(), MVT::i64);
}]>;

def : Pat<(f32 fpimm:$in),
  (COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>;
def : Pat<(f64 fpimm:$in),
  (COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>;

// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
// sequences.
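// For example, the large-code-model expansion for taking the address of a
// global is roughly:
//   movz x0, #:abs_g3:sym        // bits [63:48]
//   movk x0, #:abs_g2_nc:sym     // bits [47:32]
//   movk x0, #:abs_g1_nc:sym     // bits [31:16]
//   movk x0, #:abs_g0_nc:sym     // bits [15:0]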
def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2, tglobaladdr:$g1, tglobaladdr:$g0), (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48), tglobaladdr:$g2, 32), tglobaladdr:$g1, 16), tglobaladdr:$g0, 0)>; def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2, tblockaddress:$g1, tblockaddress:$g0), (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48), tblockaddress:$g2, 32), tblockaddress:$g1, 16), tblockaddress:$g0, 0)>; def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2, tconstpool:$g1, tconstpool:$g0), (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48), tconstpool:$g2, 32), tconstpool:$g1, 16), tconstpool:$g0, 0)>; def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2, tjumptable:$g1, tjumptable:$g0), (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48), tjumptable:$g2, 32), tjumptable:$g1, 16), tjumptable:$g0, 0)>; //===----------------------------------------------------------------------===// // Arithmetic instructions. //===----------------------------------------------------------------------===// // Add/subtract with carry. defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>; defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>; def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>; def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>; def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>; def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>; // Add/subtract defm ADD : AddSub<0, "add", add>; defm SUB : AddSub<1, "sub">; def : InstAlias<"mov $dst, $src", (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>; def : InstAlias<"mov $dst, $src", (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>; def : InstAlias<"mov $dst, $src", (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>; def : InstAlias<"mov $dst, $src", (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>; defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn">; defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp">; // Use SUBS instead of SUB to enable CSE between SUBS and SUB. def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm), (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>; def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm), (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>; def : Pat<(sub GPR32:$Rn, GPR32:$Rm), (SUBSWrr GPR32:$Rn, GPR32:$Rm)>; def : Pat<(sub GPR64:$Rn, GPR64:$Rm), (SUBSXrr GPR64:$Rn, GPR64:$Rm)>; def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm), (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>; def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm), (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>; def : Pat<(sub GPR32sp:$R2, arith_extended_reg32:$R3), (SUBSWrx GPR32sp:$R2, arith_extended_reg32:$R3)>; def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64:$R3), (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64:$R3)>; // Because of the immediate format for add/sub-imm instructions, the // expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). // These patterns capture that transformation. 
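// For example, (add x, -1) is selected as "sub x, #1": the add/sub immediate
// field is an unsigned 12-bit value (optionally shifted left by 12), so -1 is
// not directly encodable.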
let AddedComplexity = 1 in { def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm), (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm), (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm), (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm), (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; } // Because of the immediate format for add/sub-imm instructions, the // expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1). // These patterns capture that transformation. let AddedComplexity = 1 in { def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm), (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>; def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm), (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>; } def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; def : InstAlias<"neg $dst, $src$shift", (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; def : InstAlias<"neg $dst, $src$shift", (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>; def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>; def : InstAlias<"negs $dst, $src$shift", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>; def : InstAlias<"negs $dst, $src$shift", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>; // Unsigned/Signed divide defm UDIV : Div<0, "udiv", udiv>; defm SDIV : Div<1, "sdiv", sdiv>; let isCodeGenOnly = 1 in { defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>; defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>; } // Variable shift defm ASRV : Shift<0b10, "asr", sra>; defm LSLV : Shift<0b00, "lsl", shl>; defm LSRV : Shift<0b01, "lsr", srl>; defm RORV : Shift<0b11, "ror", rotr>; def : ShiftAlias<"asrv", ASRVWr, GPR32>; def : ShiftAlias<"asrv", ASRVXr, GPR64>; def : ShiftAlias<"lslv", LSLVWr, GPR32>; def : ShiftAlias<"lslv", LSLVXr, GPR64>; def : ShiftAlias<"lsrv", LSRVWr, GPR32>; def : ShiftAlias<"lsrv", LSRVXr, GPR64>; def : ShiftAlias<"rorv", RORVWr, GPR32>; def : ShiftAlias<"rorv", RORVXr, GPR64>; // Multiply-add let AddedComplexity = 7 in { defm MADD : MulAccum<0, "madd", add>; defm MSUB : MulAccum<1, "msub", sub>; def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)), (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)), (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))), (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))), (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)), (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>; def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)), (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>; } // AddedComplexity = 7 let AddedComplexity = 5 in { def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>; def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>; def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>; def UMSUBLrrr : WideMulAccum<1, 
0b101, "umsubl", sub, zext>; def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))), (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))), (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))), (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))), (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>; } // AddedComplexity = 5 def : MulAccumWAlias<"mul", MADDWrrr>; def : MulAccumXAlias<"mul", MADDXrrr>; def : MulAccumWAlias<"mneg", MSUBWrrr>; def : MulAccumXAlias<"mneg", MSUBXrrr>; def : WideMulAccumAlias<"smull", SMADDLrrr>; def : WideMulAccumAlias<"smnegl", SMSUBLrrr>; def : WideMulAccumAlias<"umull", UMADDLrrr>; def : WideMulAccumAlias<"umnegl", UMSUBLrrr>; // Multiply-high def SMULHrr : MulHi<0b010, "smulh", mulhs>; def UMULHrr : MulHi<0b110, "umulh", mulhu>; // CRC32 def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">; def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">; def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">; def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">; def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">; def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">; def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">; def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">; //===----------------------------------------------------------------------===// // Logical instructions. //===----------------------------------------------------------------------===// // (immediate) defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag, "bics">; defm AND : LogicalImm<0b00, "and", and, "bic">; defm EOR : LogicalImm<0b10, "eor", xor, "eon">; defm ORR : LogicalImm<0b01, "orr", or, "orn">; // FIXME: these aliases *are* canonical sometimes (when movz can't be // used). Actually, it seems to be working right now, but putting logical_immXX // here is a bit dodgy on the AsmParser side too. 
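// For example, "mov w0, #0xff00ff00" has no single MOVZ/MOVN encoding, but
// 0xff00ff00 is a valid bitmask immediate, so it can be accepted as
//   orr w0, wzr, #0xff00ff00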
def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR,
                                          logical_imm32:$imm), 0>;
def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR,
                                          logical_imm64:$imm), 0>;

// (register)
defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>;
defm BICS : LogicalRegS<0b11, 1, "bics",
                        BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>;
defm AND  : LogicalReg<0b00, 0, "and", and>;
defm BIC  : LogicalReg<0b00, 1, "bic",
                       BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
defm EON  : LogicalReg<0b10, 1, "eon",
                       BinOpFrag<(xor node:$LHS, (not node:$RHS))>>;
defm EOR  : LogicalReg<0b10, 0, "eor", xor>;
defm ORN  : LogicalReg<0b01, 1, "orn",
                       BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
defm ORR  : LogicalReg<0b01, 0, "orr", or>;

def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>;
def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>;

def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>;
def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>;

def : InstAlias<"mvn $Wd, $Wm$sh",
                (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>;
def : InstAlias<"mvn $Xd, $Xm$sh",
                (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>;

def : InstAlias<"tst $src1, $src2",
                (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>;
def : InstAlias<"tst $src1, $src2",
                (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>;

def : InstAlias<"tst $src1, $src2",
                (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>;
def : InstAlias<"tst $src1, $src2",
                (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>;

def : InstAlias<"tst $src1, $src2$sh",
               (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>;
def : InstAlias<"tst $src1, $src2$sh",
               (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>;

def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;

//===----------------------------------------------------------------------===//
// One operand data processing instructions.
//===----------------------------------------------------------------------===//
defm CLS  : OneOperandData<0b101, "cls">;
defm CLZ  : OneOperandData<0b100, "clz", ctlz>;
defm RBIT : OneOperandData<0b000, "rbit">;

def : Pat<(int_aarch64_rbit GPR32:$Rn), (RBITWr $Rn)>;
def : Pat<(int_aarch64_rbit GPR64:$Rn), (RBITXr $Rn)>;

def  REV16Wr : OneWRegData<0b001, "rev16",
                           UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
def  REV16Xr : OneXRegData<0b001, "rev16", null_frag>;

def : Pat<(cttz GPR32:$Rn),
          (CLZWr (RBITWr GPR32:$Rn))>;
def : Pat<(cttz GPR64:$Rn),
          (CLZXr (RBITXr GPR64:$Rn))>;
def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
                (i32 1))),
          (CLSWr GPR32:$Rn)>;
def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
                (i64 1))),
          (CLSXr GPR64:$Rn)>;

// Unlike the other one operand instructions, the instructions with the "rev"
// mnemonic do *not* just differ in the size bit, but actually use different
// opcode bits for the different sizes.
def REVWr   : OneWRegData<0b010, "rev", bswap>;
def REVXr   : OneXRegData<0b011, "rev", bswap>;
def REV32Xr : OneXRegData<0b010, "rev32",
                          UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;

// The bswap commutes with the rotr so we want a pattern for both possible
// orders.
def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>;
def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;

//===----------------------------------------------------------------------===//
// Bitfield immediate extraction instruction.
//===----------------------------------------------------------------------===//
let hasSideEffects = 0 in
defm EXTR : ExtractImm<"extr">;
def : InstAlias<"ror $dst, $src, $shift",
            (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
def : InstAlias<"ror $dst, $src, $shift",
            (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;

def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
          (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
          (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;

//===----------------------------------------------------------------------===//
// Other bitfield immediate instructions.
//===----------------------------------------------------------------------===//
let hasSideEffects = 0 in {
defm BFM  : BitfieldImmWith2RegArgs<0b01, "bfm">;
defm SBFM : BitfieldImm<0b00, "sbfm">;
defm UBFM : BitfieldImm<0b10, "ubfm">;
}

def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t enc = (32 - N->getZExtValue()) & 0x1f;
  return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;

def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t enc = 31 - N->getZExtValue();
  return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;

// min(7, 31 - shift_amt)
def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t enc = 31 - N->getZExtValue();
  enc = enc > 7 ? 7 : enc;
  return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;

// min(15, 31 - shift_amt)
def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t enc = 31 - N->getZExtValue();
  enc = enc > 15 ? 15 : enc;
  return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;

def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t enc = (64 - N->getZExtValue()) & 0x3f;
  return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;

def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t enc = 63 - N->getZExtValue();
  return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;

// min(7, 63 - shift_amt)
def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t enc = 63 - N->getZExtValue();
  enc = enc > 7 ? 7 : enc;
  return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;

// min(15, 63 - shift_amt)
def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t enc = 63 - N->getZExtValue();
  enc = enc > 15 ? 15 : enc;
  return CurDAG->getTargetConstant(enc, MVT::i64);
}]>;

// min(31, 63 - shift_amt)
def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
  uint64_t enc = 63 - N->getZExtValue();
  enc = enc > 31 ?
31 : enc; return CurDAG->getTargetConstant(enc, MVT::i64); }]>; def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)), (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), (i64 (i32shift_b imm0_31:$imm)))>; def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)), (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), (i64 (i64shift_b imm0_63:$imm)))>; let AddedComplexity = 10 in { def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)), (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)), (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; } def : InstAlias<"asr $dst, $src, $shift", (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; def : InstAlias<"asr $dst, $src, $shift", (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)), (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>; def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)), (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>; def : InstAlias<"lsr $dst, $src, $shift", (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>; def : InstAlias<"lsr $dst, $src, $shift", (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>; def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>; def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>; def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>; def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>; def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>; //===----------------------------------------------------------------------===// // Conditionally set flags instructions. //===----------------------------------------------------------------------===// defm CCMN : CondSetFlagsImm<0, "ccmn">; defm CCMP : CondSetFlagsImm<1, "ccmp">; defm CCMN : CondSetFlagsReg<0, "ccmn">; defm CCMP : CondSetFlagsReg<1, "ccmp">; //===----------------------------------------------------------------------===// // Conditional select instructions. 
//===----------------------------------------------------------------------===//
defm CSEL  : CondSelect<0, 0b00, "csel">;

def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;

def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
          (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
          (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
          (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
          (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
          (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
          (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;

def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV),
          (CSINCWr WZR, WZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV),
          (CSINCXr XZR, XZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
          (CSINVWr WZR, WZR, (i32 imm:$cc))>;
def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
          (CSINVXr XZR, XZR, (i32 imm:$cc))>;

// The inverse of the condition code from the alias instruction is what is used
// in the aliased instruction. The parser already inverts the condition code
// for these aliases.
def : InstAlias<"cset $dst, $cc",
                (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
def : InstAlias<"cset $dst, $cc",
                (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;

def : InstAlias<"csetm $dst, $cc",
                (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
def : InstAlias<"csetm $dst, $cc",
                (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;

def : InstAlias<"cinc $dst, $src, $cc",
                (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
def : InstAlias<"cinc $dst, $src, $cc",
                (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;

def : InstAlias<"cinv $dst, $src, $cc",
                (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
def : InstAlias<"cinv $dst, $src, $cc",
                (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;

def : InstAlias<"cneg $dst, $src, $cc",
                (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
def : InstAlias<"cneg $dst, $src, $cc",
                (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;

//===----------------------------------------------------------------------===//
// PC-relative instructions.
//===----------------------------------------------------------------------===//
let isReMaterializable = 1 in {
let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
def ADR  : ADRI<0, "adr", adrlabel, []>;
} // hasSideEffects = 0

def ADRP : ADRI<1, "adrp", adrplabel,
                [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
} // isReMaterializable = 1

// page address of a constant pool entry, block address
def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;

//===----------------------------------------------------------------------===//
// Unconditional branch (register) instructions.
//===----------------------------------------------------------------------===//
let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
def RET  : BranchReg<0b0010, "ret", []>;
def DRPS : SpecialReturn<0b0101, "drps">;
def ERET : SpecialReturn<0b0100, "eret">;
} // isReturn = 1, isTerminator = 1, isBarrier = 1

// Default to the LR register.
def : InstAlias<"ret", (RET LR)>;

let isCall = 1, Defs = [LR], Uses = [SP] in {
def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
} // isCall

let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
} // isBranch, isTerminator, isBarrier, isIndirectBranch

// Create a separate pseudo-instruction for codegen to use so that we don't
// flag lr as used in every function. It'll be restored before the RET by the
// epilogue if it's legitimately used.
def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> {
  let isTerminator = 1;
  let isBarrier = 1;
  let isReturn = 1;
}

// This is a directive-like pseudo-instruction. The purpose is to insert an
// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
// (which in the usual case is a BLR).
let hasSideEffects = 1 in
def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> {
  let AsmString = ".tlsdesccall $sym";
}

-// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It
-// gets expanded to two MCInsts during lowering.
-let isCall = 1, Defs = [LR] in
-def TLSDESC_BLR
-    : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym),
-             [(AArch64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>;
+// FIXME: maybe the scratch register used shouldn't be fixed to X1?
+// FIXME: can "hasSideEffects" be dropped?
+let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1,
+    isCodeGenOnly = 1 in
+def TLSDESC_CALLSEQ
+    : Pseudo<(outs), (ins i64imm:$sym),
+             [(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>;
+def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
+          (TLSDESC_CALLSEQ texternalsym:$sym)>;

-def : Pat<(AArch64tlsdesc_call GPR64:$dest, texternalsym:$sym),
-          (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>;

//===----------------------------------------------------------------------===//
// Conditional branch (immediate) instruction.
//===----------------------------------------------------------------------===//
def Bcc : BranchCond;

//===----------------------------------------------------------------------===//
// Compare-and-branch instructions.
//===----------------------------------------------------------------------===//
defm CBZ  : CmpBranch<0, "cbz", AArch64cbz>;
defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>;

//===----------------------------------------------------------------------===//
// Test-bit-and-branch instructions.
//===----------------------------------------------------------------------===//
defm TBZ  : TestBranch<0, "tbz", AArch64tbz>;
defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>;

//===----------------------------------------------------------------------===//
// Unconditional branch (immediate) instructions.
//===----------------------------------------------------------------------===// let isBranch = 1, isTerminator = 1, isBarrier = 1 in { def B : BranchImm<0, "b", [(br bb:$addr)]>; } // isBranch, isTerminator, isBarrier let isCall = 1, Defs = [LR], Uses = [SP] in { def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>; } // isCall def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>; //===----------------------------------------------------------------------===// // Exception generation instructions. //===----------------------------------------------------------------------===// def BRK : ExceptionGeneration<0b001, 0b00, "brk">; def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">; def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">; def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">; def HLT : ExceptionGeneration<0b010, 0b00, "hlt">; def HVC : ExceptionGeneration<0b000, 0b10, "hvc">; def SMC : ExceptionGeneration<0b000, 0b11, "smc">; def SVC : ExceptionGeneration<0b000, 0b01, "svc">; // DCPSn defaults to an immediate operand of zero if unspecified. def : InstAlias<"dcps1", (DCPS1 0)>; def : InstAlias<"dcps2", (DCPS2 0)>; def : InstAlias<"dcps3", (DCPS3 0)>; //===----------------------------------------------------------------------===// // Load instructions. //===----------------------------------------------------------------------===// // Pair (indexed, offset) defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">; defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">; defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">; defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">; defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">; defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">; // Pair (pre-indexed) def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">; def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">; def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">; def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">; def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">; def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; // Pair (post-indexed) def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">; def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">; def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">; def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">; def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">; def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">; // Pair (no allocate) defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">; defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">; defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">; defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">; defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">; //--- // (register offset) //--- // Integer defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>; defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>; defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>; defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>; // Floating-point defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>; defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>; defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>; defm LDRD : Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>; defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, 
"ldr", f128, load>; // Load sign-extended half-word defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>; defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>; // Load sign-extended byte defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>; defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>; // Load sign-extended word defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>; // Pre-fetch. defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">; // For regular load, we do not have any alignment requirement. // Thus, it is safe to directly map the vector loads with interesting // addressing modes. // FIXME: We could do the same for bitconvert to floating point vectors. multiclass ScalToVecROLoadPat { def : Pat<(VecTy (scalar_to_vector (ScalTy (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))), (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset), sub)>; def : Pat<(VecTy (scalar_to_vector (ScalTy (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))), (INSERT_SUBREG (VecTy (IMPLICIT_DEF)), (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset), sub)>; } let AddedComplexity = 10 in { defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; defm : ScalToVecROLoadPat; def : Pat <(v1i64 (scalar_to_vector (i64 (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend))))), (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>; def : Pat <(v1i64 (scalar_to_vector (i64 (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend))))), (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>; } // Match all load 64 bits width whose type is compatible with FPR64 multiclass VecROLoadPat { def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; } let AddedComplexity = 10 in { let Predicates = [IsLE] in { // We must do vector loads with LD1 in big-endian. defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; } defm : VecROLoadPat; defm : VecROLoadPat; // Match all load 128 bits width whose type is compatible with FPR128 let Predicates = [IsLE] in { // We must do vector loads with LD1 in big-endian. 
defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; defm : VecROLoadPat; } } // AddedComplexity = 10 // zextload -> i64 multiclass ExtLoadTo64ROPat { def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), (SUBREG_TO_REG (i64 0), (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend), sub_32)>; def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), (SUBREG_TO_REG (i64 0), (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend), sub_32)>; } let AddedComplexity = 10 in { defm : ExtLoadTo64ROPat; defm : ExtLoadTo64ROPat; defm : ExtLoadTo64ROPat; // zextloadi1 -> zextloadi8 defm : ExtLoadTo64ROPat; // extload -> zextload defm : ExtLoadTo64ROPat; defm : ExtLoadTo64ROPat; defm : ExtLoadTo64ROPat; // extloadi1 -> zextloadi8 defm : ExtLoadTo64ROPat; } // zextload -> i64 multiclass ExtLoadTo32ROPat { def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))), (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))), (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; } let AddedComplexity = 10 in { // extload -> zextload defm : ExtLoadTo32ROPat; defm : ExtLoadTo32ROPat; defm : ExtLoadTo32ROPat; // zextloadi1 -> zextloadi8 defm : ExtLoadTo32ROPat; } //--- // (unsigned immediate) //--- defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr", [(set GPR64:$Rt, (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr", [(set GPR32:$Rt, (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr", [(set FPR8:$Rt, (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>; defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr", [(set (f16 FPR16:$Rt), (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>; defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr", [(set (f32 FPR32:$Rt), (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>; defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr", [(set (f64 FPR64:$Rt), (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>; defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr", [(set (f128 FPR128:$Rt), (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>; // For regular load, we do not have any alignment requirement. // Thus, it is safe to directly map the vector loads with interesting // addressing modes. // FIXME: We could do the same for bitconvert to floating point vectors. 
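// For example, a scalar i32 load feeding a scalar_to_vector node can simply
// use the SIMD&FP form of LDR, roughly:
//   ldr s0, [x0, #4]    // fills lane 0; the remaining lanes are undefined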
def : Pat <(v8i8 (scalar_to_vector (i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; def : Pat <(v16i8 (scalar_to_vector (i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))), (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>; def : Pat <(v4i16 (scalar_to_vector (i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; def : Pat <(v8i16 (scalar_to_vector (i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))), (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>; def : Pat <(v2i32 (scalar_to_vector (i32 (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; def : Pat <(v4i32 (scalar_to_vector (i32 (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))), (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>; def : Pat <(v1i64 (scalar_to_vector (i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat <(v2i64 (scalar_to_vector (i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>; // Match all load 64 bits width whose type is compatible with FPR64 let Predicates = [IsLE] in { // We must use LD1 to perform vector loads in big-endian. def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; } def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))), (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>; // Match all load 128 bits width whose type is compatible with FPR128 let Predicates = [IsLE] in { // We must use LD1 to perform vector loads in big-endian. 
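// (A 128-bit LDR performs one little-endian load of the whole quadword,
// whereas LD1 loads element by element; in big-endian mode the two disagree
// about lane numbering, which is why these patterns are little-endian only.)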
def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; } def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))), (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>; defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh", [(set GPR32:$Rt, (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>; defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb", [(set GPR32:$Rt, (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>; // zextload -> i64 def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; // zextloadi1 -> zextloadi8 def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; // extload -> zextload def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>; def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>; def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))), (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>; def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))), (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>; // load sign-extended half-word defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh", [(set GPR32:$Rt, (sextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>; defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh", [(set GPR64:$Rt, (sextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>; // load sign-extended byte defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb", [(set GPR32:$Rt, (sextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>; defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb", [(set GPR64:$Rt, (sextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>; // load sign-extended word defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw", [(set GPR64:$Rt, (sextloadi32 (am_indexed32 GPR64sp:$Rn, 
uimm12s4:$offset)))]>; // load zero-extended word def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))), (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>; // Pre-fetch. def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm", [(AArch64Prefetch imm:$Rt, (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>; //--- // (literal) def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">; def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">; def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">; def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">; def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">; // load sign-extended word def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">; // prefetch def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>; // [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>; //--- // (unscaled immediate) defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur", [(set GPR64:$Rt, (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur", [(set GPR32:$Rt, (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur", [(set FPR8:$Rt, (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur", [(set FPR16:$Rt, (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur", [(set (f32 FPR32:$Rt), (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur", [(set (f64 FPR64:$Rt), (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur", [(set (f128 FPR128:$Rt), (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURHH : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh", [(set GPR32:$Rt, (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURBB : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb", [(set GPR32:$Rt, (zextloadi8 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; // Match all load 64 bits width whose type is compatible with FPR64 let Predicates = [IsLE] in { def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), (LDURDi GPR64sp:$Rn, simm9:$offset)>; def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), (LDURDi GPR64sp:$Rn, simm9:$offset)>; def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), (LDURDi GPR64sp:$Rn, simm9:$offset)>; def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), (LDURDi GPR64sp:$Rn, simm9:$offset)>; def : Pat<(v4f16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), (LDURDi GPR64sp:$Rn, simm9:$offset)>; } def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), (LDURDi GPR64sp:$Rn, simm9:$offset)>; def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))), (LDURDi GPR64sp:$Rn, simm9:$offset)>; // Match all load 128 bits width whose type is compatible with FPR128 let Predicates = [IsLE] in { def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), (LDURQi GPR64sp:$Rn, simm9:$offset)>; def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), (LDURQi GPR64sp:$Rn, simm9:$offset)>; def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), (LDURQi GPR64sp:$Rn, simm9:$offset)>; def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), (LDURQi GPR64sp:$Rn, simm9:$offset)>; def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))), (LDURQi GPR64sp:$Rn, 
simm9:$offset)>;
def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
          (LDURQi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(v8f16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
          (LDURQi GPR64sp:$Rn, simm9:$offset)>;
}

//  anyext -> zext
def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
    (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
    (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
// unscaled zext
def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
    (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
    (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
//---
// LDR mnemonics fall back to LDUR for negative or unaligned offsets.

// Define new assembler match classes as we want to only match these when
// they don't otherwise match the scaled addressing mode for LDR/STR. Don't
// associate a DiagnosticType either, as we want the diagnostic for the
// canonical form (the scaled operand) to take precedence.
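// For example, "ldr x0, [x1, #1]" cannot use the scaled LDR encoding (the
// offset is not a multiple of 8), so it falls back to the aliases below and
// assembles as "ldur x0, [x1, #1]".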
class SImm9OffsetOperand : AsmOperandClass { let Name = "SImm9OffsetFB" # Width; let PredicateMethod = "isSImm9OffsetFB<" # Width # ">"; let RenderMethod = "addImmOperands"; } def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>; def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>; def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>; def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>; def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>; def simm9_offset_fb8 : Operand { let ParserMatchClass = SImm9OffsetFB8Operand; } def simm9_offset_fb16 : Operand { let ParserMatchClass = SImm9OffsetFB16Operand; } def simm9_offset_fb32 : Operand { let ParserMatchClass = SImm9OffsetFB32Operand; } def simm9_offset_fb64 : Operand { let ParserMatchClass = SImm9OffsetFB64Operand; } def simm9_offset_fb128 : Operand { let ParserMatchClass = SImm9OffsetFB128Operand; } def : InstAlias<"ldr $Rt, [$Rn, $offset]", (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; def : InstAlias<"ldr $Rt, [$Rn, $offset]", (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; def : InstAlias<"ldr $Rt, [$Rn, $offset]", (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; def : InstAlias<"ldr $Rt, [$Rn, $offset]", (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; def : InstAlias<"ldr $Rt, [$Rn, $offset]", (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; def : InstAlias<"ldr $Rt, [$Rn, $offset]", (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; def : InstAlias<"ldr $Rt, [$Rn, $offset]", (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; // zextload -> i64 def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))), (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>; def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))), (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>; // load sign-extended half-word defm LDURSHW : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh", [(set GPR32:$Rt, (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURSHX : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh", [(set GPR64:$Rt, (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>; // load sign-extended byte defm LDURSBW : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb", [(set GPR32:$Rt, (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; defm LDURSBX : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb", [(set GPR64:$Rt, (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>; // load sign-extended word defm LDURSW : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw", [(set GPR64:$Rt, (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>; // zero and sign extending aliases from generic LDR* mnemonics to LDUR*. def : InstAlias<"ldrb $Rt, [$Rn, $offset]", (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; def : InstAlias<"ldrh $Rt, [$Rn, $offset]", (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; def : InstAlias<"ldrsb $Rt, [$Rn, $offset]", (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; def : InstAlias<"ldrsh $Rt, [$Rn, $offset]", (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; def : InstAlias<"ldrsw $Rt, [$Rn, $offset]", (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; // Pre-fetch. 
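// Note that for PRFM/PRFUM the Rt field is not a destination register: it
// encodes the prefetch operation (type, target cache level and policy, e.g.
// pldl1keep = 0b00000), which is why the selection pattern below uses a plain
// imm:$Rt operand.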
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum", [(AArch64Prefetch imm:$Rt, (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; //--- // (unscaled immediate, unprivileged) defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">; defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">; defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">; defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">; // load sign-extended half-word defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">; defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">; // load sign-extended byte defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">; defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">; // load sign-extended word defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">; //--- // (immediate pre-indexed) def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">; def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">; def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">; def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">; def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">; def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">; def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">; // load sign-extended half-word def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">; def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">; // load sign-extended byte def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">; def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">; // load zero-extended byte def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">; def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">; // load sign-extended word def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">; //--- // (immediate post-indexed) def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">; def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">; def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">; def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">; def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">; def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">; def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">; // load sign-extended half-word def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">; def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">; // load sign-extended byte def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">; def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">; // load zero-extended byte def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">; def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">; // load sign-extended word def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">; //===----------------------------------------------------------------------===// // Store instructions. //===----------------------------------------------------------------------===// // Pair (indexed, offset) // FIXME: Use dedicated range-checked addressing mode operand here. 
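// The simm7sN pair-offset operands used below are 7-bit signed immediates
// scaled by the access size: simm7s4 covers [-256, 252] in steps of 4,
// simm7s8 covers [-512, 504] in steps of 8, and simm7s16 covers [-1024, 1008]
// in steps of 16.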
defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">; defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">; defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">; defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">; defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">; // Pair (pre-indexed) def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">; def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">; def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">; def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">; def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">; // Pair (post-indexed) def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">; def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">; def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">; def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">; def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">; // Pair (no allocate) defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">; defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">; defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">; defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">; defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">; //--- // (Register offset) // Integer defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>; defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>; defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>; defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>; // Floating-point defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>; defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>; defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>; defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>; defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>; multiclass TruncStoreFrom64ROPat { def : Pat<(storeop GPR64:$Rt, (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; def : Pat<(storeop GPR64:$Rt, (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; } let AddedComplexity = 10 in { // truncstore i64 defm : TruncStoreFrom64ROPat; defm : TruncStoreFrom64ROPat; defm : TruncStoreFrom64ROPat; } multiclass VecROStorePat { def : Pat<(store (VecTy FPR:$Rt), (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; def : Pat<(store (VecTy FPR:$Rt), (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; } let AddedComplexity = 10 in { // Match all store 64 bits width whose type is compatible with FPR64 let Predicates = [IsLE] in { // We must use ST1 to store vectors in big-endian. defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; } defm : VecROStorePat; defm : VecROStorePat; // Match all store 128 bits width whose type is compatible with FPR128 let Predicates = [IsLE] in { // We must use ST1 to store vectors in big-endian. defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; defm : VecROStorePat; } } // AddedComplexity = 10 // Match stores from lane 0 to the appropriate subreg's store.
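// Element 0 of a vector register occupies its least significant bits, so
// storing lane 0 is byte-for-byte the same as storing the corresponding
// B/H/S/D sub-register; e.g. a store of (vector_extract (v4i32 Vt), 0) can be
// emitted as an ordinary 32-bit STR of Vt's ssub sub-register rather than an
// ST1 lane store.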
multiclass VecROStoreLane0Pat { def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)), (STRW (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>; def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vt), 0)), (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)), (STRX (EXTRACT_SUBREG VecListOne128:$Vt, SubRegIdx), GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>; } let AddedComplexity = 19 in { defm : VecROStoreLane0Pat; defm : VecROStoreLane0Pat; defm : VecROStoreLane0Pat; defm : VecROStoreLane0Pat; defm : VecROStoreLane0Pat; defm : VecROStoreLane0Pat; defm : VecROStoreLane0Pat; } //--- // (unsigned immediate) defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str", [(store GPR64:$Rt, (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str", [(store GPR32:$Rt, (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str", [(store FPR8:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str", [(store (f16 FPR16:$Rt), (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>; defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str", [(store (f32 FPR32:$Rt), (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>; defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str", [(store (f64 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>; defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>; defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh", [(truncstorei16 GPR32:$Rt, (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>; defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb", [(truncstorei8 GPR32:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>; // Match all store 64 bits width whose type is compatible with FPR64 let AddedComplexity = 10 in { let Predicates = [IsLE] in { // We must use ST1 to store vectors in big-endian. def : Pat<(store (v2f32 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(store (v8i8 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(store (v4i16 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(store (v2i32 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(store (v4f16 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; } def : Pat<(store (v1f64 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; def : Pat<(store (v1i64 FPR64:$Rt), (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)), (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>; // Match all store 128 bits width whose type is compatible with FPR128 let Predicates = [IsLE] in { // We must use ST1 to store vectors in big-endian. 
def : Pat<(store (v4f32 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(store (v2f64 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(store (v16i8 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(store (v8i16 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(store (v4i32 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(store (v2i64 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; def : Pat<(store (v8f16 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; } def : Pat<(store (f128 FPR128:$Rt), (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)), (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>; // truncstore i64 def : Pat<(truncstorei32 GPR64:$Rt, (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)), (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>; def : Pat<(truncstorei16 GPR64:$Rt, (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)), (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>; def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)), (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>; } // AddedComplexity = 10 //--- // (unscaled immediate) defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur", [(store GPR64:$Rt, (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur", [(store GPR32:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur", [(store FPR8:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur", [(store (f16 FPR16:$Rt), (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur", [(store (f32 FPR32:$Rt), (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>; defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur", [(store (f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>; defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur", [(store (f128 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>; defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh", [(truncstorei16 GPR32:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>; defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb", [(truncstorei8 GPR32:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>; // Match all store 64 bits width whose type is compatible with FPR64 let Predicates = [IsLE] in { // We must use ST1 to store vectors in big-endian. 
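// (A plain STR/STUR writes the register's bytes to memory starting from the
// least significant byte, which matches the element order of the IR vector
// type only on little-endian targets; big-endian compilations go through ST1
// instead so that element order is preserved, hence the IsLE predicate on
// these patterns.)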
def : Pat<(store (v2f32 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v8i8 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v4i16 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v2i32 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v4f16 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; } def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)), (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>; // Match all store 128 bits width whose type is compatible with FPR128 let Predicates = [IsLE] in { // We must use ST1 to store vectors in big-endian. def : Pat<(store (v4f32 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v2f64 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v16i8 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v8i16 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v4i32 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v2i64 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v2f64 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; def : Pat<(store (v8f16 FPR128:$Rt), (am_unscaled128 GPR64sp:$Rn, simm9:$offset)), (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>; } // unscaled i64 truncating stores def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)), (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)), (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)), (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>; //--- // STR mnemonics fall back to STUR for negative or unaligned offsets. 
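// As with the LDR/LDUR aliases earlier, the trailing 0 marks these as
// parse-only aliases: the assembler accepts e.g. "str x0, [x1, #-4]" and
// encodes the STUR form, but the printer and disassembler keep the canonical
// "stur" spelling.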
def : InstAlias<"str $Rt, [$Rn, $offset]", (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; def : InstAlias<"str $Rt, [$Rn, $offset]", (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; def : InstAlias<"str $Rt, [$Rn, $offset]", (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; def : InstAlias<"str $Rt, [$Rn, $offset]", (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; def : InstAlias<"str $Rt, [$Rn, $offset]", (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>; def : InstAlias<"str $Rt, [$Rn, $offset]", (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>; def : InstAlias<"str $Rt, [$Rn, $offset]", (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>; def : InstAlias<"strb $Rt, [$Rn, $offset]", (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>; def : InstAlias<"strh $Rt, [$Rn, $offset]", (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>; //--- // (unscaled immediate, unprivileged) defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">; defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">; defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">; defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">; //--- // (immediate pre-indexed) def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>; def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>; def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>; def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>; def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>; def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>; def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>; def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>; def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>; // truncstore i64 def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, 
GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; //--- // (immediate post-indexed) def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>; def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>; def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>; def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>; def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>; def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>; def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>; def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>; def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>; // truncstore i64 def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off), (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, simm9:$off)>; def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off), (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, simm9:$off)>; def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off), (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off), (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off), (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>; //===----------------------------------------------------------------------===// // Load/store exclusive instructions. 
//===----------------------------------------------------------------------===// def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">; def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">; def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">; def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">; def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">; def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">; def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">; def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">; def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">; def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">; def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">; def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">; def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">; def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">; def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">; def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">; def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">; def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">; def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">; def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">; def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">; def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">; def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">; def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">; def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">; def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">; def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">; def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">; def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">; def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">; def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; //===----------------------------------------------------------------------===// // Scaled floating point to integer conversion instructions. 
//===----------------------------------------------------------------------===// defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>; defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>; defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>; defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>; defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>; defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>; defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>; defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>; defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>; defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>; defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>; defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>; let isCodeGenOnly = 1 in { defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>; defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>; } //===----------------------------------------------------------------------===// // Scaled integer to floating point conversion instructions. //===----------------------------------------------------------------------===// defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>; defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>; //===----------------------------------------------------------------------===// // Unscaled integer to floating point conversion instruction. //===----------------------------------------------------------------------===// defm FMOV : UnscaledConversion<"fmov">; def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>; def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>; //===----------------------------------------------------------------------===// // Floating point conversion instruction. //===----------------------------------------------------------------------===// defm FCVT : FPConversion<"fcvt">; //===----------------------------------------------------------------------===// // Floating point single operand instructions. //===----------------------------------------------------------------------===// defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>; defm FMOV : SingleOperandFPData<0b0000, "fmov">; defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>; defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>; defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>; defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>; defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>; defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>; def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))), (FRINTNDr FPR64:$Rn)>; // FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior // in the C spec. Setting hasSideEffects ensures it is not DCE'd. // // TODO: We should really model the FPSR flags correctly. This is really ugly. 
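// (FRINTX is the "round to integral, signaling inexact" form: unlike FRINTI it
// may raise the Inexact exception when the result differs from the input, so
// for example a C call such as (void)rint(x) under #pragma STDC FENV_ACCESS ON
// still has an observable effect on FPSR even though its result is unused;
// hasSideEffects = 1 below keeps such instructions from being dead-code
// eliminated.)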
let hasSideEffects = 1 in { defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>; } defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>; let SchedRW = [WriteFDiv] in { defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>; } //===----------------------------------------------------------------------===// // Floating point two operand instructions. //===----------------------------------------------------------------------===// defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>; let SchedRW = [WriteFDiv] in { defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>; } defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>; defm FMAX : TwoOperandFPData<0b0100, "fmax", AArch64fmax>; defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>; defm FMIN : TwoOperandFPData<0b0101, "fmin", AArch64fmin>; let SchedRW = [WriteFMul] in { defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>; defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>; } defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>; def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXDrr FPR64:$Rn, FPR64:$Rm)>; def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINDrr FPR64:$Rn, FPR64:$Rm)>; def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>; def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>; //===----------------------------------------------------------------------===// // Floating point three operand instructions. //===----------------------------------------------------------------------===// defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>; defm FMSUB : ThreeOperandFPData<0, 1, "fmsub", TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >; defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd", TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >; defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub", TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >; // The following def pats catch the case where the LHS of an FMA is negated. // The TriOpFrag above catches the case where the middle operand is negated. // N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike // the NEON variant. def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)), (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)), (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; // We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and // "(-a) + b*(-c)". def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))), (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))), (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))), (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>; def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))), (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>; //===----------------------------------------------------------------------===// // Floating point comparison instructions. //===----------------------------------------------------------------------===// defm FCMPE : FPComparison<1, "fcmpe">; defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>; //===----------------------------------------------------------------------===// // Floating point conditional comparison instructions. 
//===----------------------------------------------------------------------===// defm FCCMPE : FPCondComparison<1, "fccmpe">; defm FCCMP : FPCondComparison<0, "fccmp">; //===----------------------------------------------------------------------===// // Floating point conditional select instruction. //===----------------------------------------------------------------------===// defm FCSEL : FPCondSelect<"fcsel">; // CSEL instructions providing f128 types need to be handled by a // pseudo-instruction since the eventual code will need to introduce basic // blocks and control flow. def F128CSEL : Pseudo<(outs FPR128:$Rd), (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond), [(set (f128 FPR128:$Rd), (AArch64csel FPR128:$Rn, FPR128:$Rm, (i32 imm:$cond), NZCV))]> { let Uses = [NZCV]; let usesCustomInserter = 1; } //===----------------------------------------------------------------------===// // Floating point immediate move. //===----------------------------------------------------------------------===// let isReMaterializable = 1 in { defm FMOV : FPMoveImmediate<"fmov">; } //===----------------------------------------------------------------------===// // Advanced SIMD two vector instructions. //===----------------------------------------------------------------------===// defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>; def : Pat<(xor (v8i8 (AArch64vashr V64:$src, (i32 7))), (v8i8 (add V64:$src, (AArch64vashr V64:$src, (i32 7))))), (ABSv8i8 V64:$src)>; def : Pat<(xor (v4i16 (AArch64vashr V64:$src, (i32 15))), (v4i16 (add V64:$src, (AArch64vashr V64:$src, (i32 15))))), (ABSv4i16 V64:$src)>; def : Pat<(xor (v2i32 (AArch64vashr V64:$src, (i32 31))), (v2i32 (add V64:$src, (AArch64vashr V64:$src, (i32 31))))), (ABSv2i32 V64:$src)>; def : Pat<(xor (v16i8 (AArch64vashr V128:$src, (i32 7))), (v16i8 (add V128:$src, (AArch64vashr V128:$src, (i32 7))))), (ABSv16i8 V128:$src)>; def : Pat<(xor (v8i16 (AArch64vashr V128:$src, (i32 15))), (v8i16 (add V128:$src, (AArch64vashr V128:$src, (i32 15))))), (ABSv8i16 V128:$src)>; def : Pat<(xor (v4i32 (AArch64vashr V128:$src, (i32 31))), (v4i32 (add V128:$src, (AArch64vashr V128:$src, (i32 31))))), (ABSv4i32 V128:$src)>; def : Pat<(xor (v2i64 (AArch64vashr V128:$src, (i32 63))), (v2i64 (add V128:$src, (AArch64vashr V128:$src, (i32 63))))), (ABSv2i64 V128:$src)>; defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>; defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>; defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>; defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>; defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>; defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>; defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>; defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>; defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>; defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>; defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>; defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>; defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>; defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">; def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))), 
(FCVTLv4i16 V64:$Rn)>; def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn), (i64 4)))), (FCVTLv8i16 V128:$Rn)>; def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn), (i64 2))))), (FCVTLv4i32 V128:$Rn)>; def : Pat<(v4f32 (fextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; def : Pat<(v4f32 (fextend (v4f16 (extract_subvector (v8f16 V128:$Rn), (i64 4))))), (FCVTLv8i16 V128:$Rn)>; defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>; defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>; defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">; def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; def : Pat<(concat_vectors V64:$Rd, (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))), (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>; def : Pat<(v4f16 (fround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>; def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))), (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>; defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>; defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn", int_aarch64_neon_fcvtxn>; defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>; defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>; let isCodeGenOnly = 1 in { defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", int_aarch64_neon_fcvtzs>; defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", int_aarch64_neon_fcvtzu>; } defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>; defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>; defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>; defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>; defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>; defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>; defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>; defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>; defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>; defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>; defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>; defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>; // Aliases for MVN -> NOT. 
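// MVN is purely an alternate mnemonic here: "mvn v0.8b, v1.8b" assembles to
// exactly the same encoding as "not v0.8b, v1.8b", so it is handled with
// InstAliases rather than separate instruction definitions.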
def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}", (NOTv8i8 V64:$Vd, V64:$Vn)>; def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}", (NOTv16i8 V128:$Vd, V128:$Vn)>; def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>; def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>; def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>; def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>; def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>; def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>; def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>; def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>; def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>; def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>; def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>; def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>; defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>; defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>; defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>; defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>; defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp", BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >; defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>; defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>; defm SHLL : SIMDVectorLShiftLongBySizeBHS; defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>; defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>; defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>; defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp", BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >; defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp", int_aarch64_neon_uaddlp>; defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>; defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>; defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>; defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>; defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>; defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>; def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>; def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>; def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>; def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>; def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>; def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>; // Patterns for vector long shift (by element width). 
These need to match all // three of zext, sext and anyext so it's easier to pull the patterns out of the // definition. multiclass SIMDVectorLShiftLongBySizeBHSPats { def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)), (SHLLv8i8 V64:$Rn)>; def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)), (SHLLv16i8 V128:$Rn)>; def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)), (SHLLv4i16 V64:$Rn)>; def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)), (SHLLv8i16 V128:$Rn)>; def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)), (SHLLv2i32 V64:$Rn)>; def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)), (SHLLv4i32 V128:$Rn)>; } defm : SIMDVectorLShiftLongBySizeBHSPats; defm : SIMDVectorLShiftLongBySizeBHSPats; defm : SIMDVectorLShiftLongBySizeBHSPats; //===----------------------------------------------------------------------===// // Advanced SIMD three vector instructions. //===----------------------------------------------------------------------===// defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>; defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>; defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>; defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>; defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>; defm FABD : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>; defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>; defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>; defm FADDP : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>; defm FADD : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>; defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; defm FDIV : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>; defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>; defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>; defm FMAXP : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>; defm FMAX : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>; defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>; defm FMINNM : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>; defm FMINP : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>; defm FMIN : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>; // NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the // instruction expects the addend first, while the fma intrinsic puts it last. defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla", TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls", TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >; // The following def pats catch the case where the LHS of an FMA is negated. // The TriOpFrag above catches the case where the middle operand is negated. 
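// (fma (fneg a), b, c) and (fma a, (fneg b), c) both compute c - a*b, so either
// form can select FMLS; the accumulator c stays in the tied $Rd operand while
// a and b feed $Rn and $Rm.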
def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)), (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>; def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>; def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)), (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>; defm FMULX : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>; defm FMUL : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>; defm FRECPS : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>; defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>; defm FSUB : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>; defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >; defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>; defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>; defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>; defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>; defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>; defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>; defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>; defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>; defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>; defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>; defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>; defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>; defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>; defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>; defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>; defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba", TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >; defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>; defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>; defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>; defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>; defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>; defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>; defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>; defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>; defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>; defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>; defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>; defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", 
int_aarch64_neon_urhadd>; defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">; defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>; defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl", TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>; defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>; defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn", BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >; defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>; def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm), (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>; def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm), (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm), (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm), (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm), (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>; def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}", (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>; def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}", (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}", (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}", (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>; def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}", (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>; def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}", (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}", (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}", (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>; def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" # "|cmls.8b\t$dst, $src1, $src2}", (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" # "|cmls.16b\t$dst, $src1, $src2}", (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" # "|cmls.4h\t$dst, $src1, $src2}", (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" # "|cmls.8h\t$dst, $src1, $src2}", (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" # "|cmls.2s\t$dst, $src1, $src2}", (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" # "|cmls.4s\t$dst, $src1, $src2}", (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" # "|cmls.2d\t$dst, $src1, $src2}", (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" # "|cmlo.8b\t$dst, $src1, $src2}", (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; def : 
InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" # "|cmlo.16b\t$dst, $src1, $src2}", (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" # "|cmlo.4h\t$dst, $src1, $src2}", (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" # "|cmlo.8h\t$dst, $src1, $src2}", (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" # "|cmlo.2s\t$dst, $src1, $src2}", (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" # "|cmlo.4s\t$dst, $src1, $src2}", (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" # "|cmlo.2d\t$dst, $src1, $src2}", (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" # "|cmle.8b\t$dst, $src1, $src2}", (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" # "|cmle.16b\t$dst, $src1, $src2}", (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" # "|cmle.4h\t$dst, $src1, $src2}", (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" # "|cmle.8h\t$dst, $src1, $src2}", (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" # "|cmle.2s\t$dst, $src1, $src2}", (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" # "|cmle.4s\t$dst, $src1, $src2}", (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" # "|cmle.2d\t$dst, $src1, $src2}", (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" # "|cmlt.8b\t$dst, $src1, $src2}", (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" # "|cmlt.16b\t$dst, $src1, $src2}", (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" # "|cmlt.4h\t$dst, $src1, $src2}", (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" # "|cmlt.8h\t$dst, $src1, $src2}", (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" # "|cmlt.2s\t$dst, $src1, $src2}", (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" # "|cmlt.4s\t$dst, $src1, $src2}", (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" # "|cmlt.2d\t$dst, $src1, $src2}", (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" # "|fcmle.2s\t$dst, $src1, $src2}", (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" # "|fcmle.4s\t$dst, $src1, $src2}", (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" # "|fcmle.2d\t$dst, $src1, $src2}", (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" # "|fcmlt.2s\t$dst, $src1, $src2}", (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" # "|fcmlt.4s\t$dst, $src1, $src2}", (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" # "|fcmlt.2d\t$dst, $src1, $src2}", 
(FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" # "|facle.2s\t$dst, $src1, $src2}", (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" # "|facle.4s\t$dst, $src1, $src2}", (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" # "|facle.2d\t$dst, $src1, $src2}", (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" # "|faclt.2s\t$dst, $src1, $src2}", (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>; def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" # "|faclt.4s\t$dst, $src1, $src2}", (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>; def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" # "|faclt.2d\t$dst, $src1, $src2}", (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>; //===----------------------------------------------------------------------===// // Advanced SIMD three scalar instructions. //===----------------------------------------------------------------------===// defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>; defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>; defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>; defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>; defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>; defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>; defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>; defm FABD : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>; def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))), (FABD64 FPR64:$Rn, FPR64:$Rm)>; defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge", int_aarch64_neon_facge>; defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt", int_aarch64_neon_facgt>; defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>; defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>; defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>; defm FMULX : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>; defm FRECPS : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>; defm FRSQRTS : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>; defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>; defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>; defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>; defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>; defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>; defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>; defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>; defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>; defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>; defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>; defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>; defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>; defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; def : InstAlias<"cmls $dst, 
$src1, $src2", (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; def : InstAlias<"cmle $dst, $src1, $src2", (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; def : InstAlias<"cmlo $dst, $src1, $src2", (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; def : InstAlias<"cmlt $dst, $src1, $src2", (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; def : InstAlias<"fcmle $dst, $src1, $src2", (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; def : InstAlias<"fcmle $dst, $src1, $src2", (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; def : InstAlias<"fcmlt $dst, $src1, $src2", (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; def : InstAlias<"fcmlt $dst, $src1, $src2", (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; def : InstAlias<"facle $dst, $src1, $src2", (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; def : InstAlias<"facle $dst, $src1, $src2", (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; def : InstAlias<"faclt $dst, $src1, $src2", (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>; def : InstAlias<"faclt $dst, $src1, $src2", (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; //===----------------------------------------------------------------------===// // Advanced SIMD three scalar instructions (mixed operands). //===----------------------------------------------------------------------===// defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull", int_aarch64_neon_sqdmulls_scalar>; defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">; defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">; def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd), (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), (i32 FPR32:$Rm))))), (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd), (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), (i32 FPR32:$Rm))))), (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>; //===----------------------------------------------------------------------===// // Advanced SIMD two scalar instructions. 
//===----------------------------------------------------------------------===// defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", int_aarch64_neon_abs>; defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>; defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>; defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>; defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>; defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>; defm FCMEQ : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>; defm FCMGE : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>; defm FCMGT : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>; defm FCMLE : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>; defm FCMLT : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>; defm FCVTAS : SIMDTwoScalarSD< 0, 0, 0b11100, "fcvtas">; defm FCVTAU : SIMDTwoScalarSD< 1, 0, 0b11100, "fcvtau">; defm FCVTMS : SIMDTwoScalarSD< 0, 0, 0b11011, "fcvtms">; defm FCVTMU : SIMDTwoScalarSD< 1, 0, 0b11011, "fcvtmu">; defm FCVTNS : SIMDTwoScalarSD< 0, 0, 0b11010, "fcvtns">; defm FCVTNU : SIMDTwoScalarSD< 1, 0, 0b11010, "fcvtnu">; defm FCVTPS : SIMDTwoScalarSD< 0, 1, 0b11010, "fcvtps">; defm FCVTPU : SIMDTwoScalarSD< 1, 1, 0b11010, "fcvtpu">; def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">; defm FCVTZS : SIMDTwoScalarSD< 0, 1, 0b11011, "fcvtzs">; defm FCVTZU : SIMDTwoScalarSD< 1, 1, 0b11011, "fcvtzu">; defm FRECPE : SIMDTwoScalarSD< 0, 1, 0b11101, "frecpe">; defm FRECPX : SIMDTwoScalarSD< 0, 1, 0b11111, "frecpx">; defm FRSQRTE : SIMDTwoScalarSD< 1, 1, 0b11101, "frsqrte">; defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg", UnOpFrag<(sub immAllZerosV, node:$LHS)> >; defm SCVTF : SIMDTwoScalarCVTSD< 0, 0, 0b11101, "scvtf", AArch64sitof>; defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>; defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>; defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>; defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>; defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd", int_aarch64_neon_suqadd>; defm UCVTF : SIMDTwoScalarCVTSD< 1, 0, 0b11101, "ucvtf", AArch64uitof>; defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>; defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd", int_aarch64_neon_usqadd>; def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))), (FCVTASv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))), (FCVTAUv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))), (FCVTMSv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))), (FCVTMUv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))), (FCVTNSv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))), (FCVTNUv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))), (FCVTPSv1i64 FPR64:$Rn)>; def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))), (FCVTPUv1i64 FPR64:$Rn)>; def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))), (FRECPEv1i32 FPR32:$Rn)>; def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))), (FRECPEv1i64 FPR64:$Rn)>; def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))), (FRECPEv1i64 FPR64:$Rn)>; def : Pat<(f32 (int_aarch64_neon_frecpx (f32 
FPR32:$Rn))), (FRECPXv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
          (FRECPXv1i64 FPR64:$Rn)>;
def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
          (FRSQRTEv1i32 FPR32:$Rn)>;
def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
          (FRSQRTEv1i64 FPR64:$Rn)>;
def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
          (FRSQRTEv1i64 FPR64:$Rn)>;

// If an integer is about to be converted to a floating point value,
// just load it on the floating point unit.
// Here are the patterns for 8 and 16-bits to float.
// 8-bits -> float.
multiclass UIntToFPROLoadPat<ValueType DstTy, ValueType SrcTy,
                             SDPatternOperator loadop, Instruction UCVTF,
                             ROAddrMode ro, Instruction LDRW, Instruction LDRX,
                             SubRegIndex sub> {
  def : Pat<(DstTy (uint_to_fp (SrcTy
                     (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm,
                                      ro.Wext:$extend))))),
           (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
                                 (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
                                 sub))>;
  def : Pat<(DstTy (uint_to_fp (SrcTy
                     (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm,
                                      ro.Wext:$extend))))),
           (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
                                 (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
                                 sub))>;
}

defm : UIntToFPROLoadPat<f32, i32, zextloadi8,
                         UCVTFv1i32, ro8, LDRBroW, LDRBroX, bsub>;
def : Pat <(f32 (uint_to_fp (i32
               (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset),
                          bsub))>;
def : Pat <(f32 (uint_to_fp (i32
                    (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
                          (LDURBi GPR64sp:$Rn, simm9:$offset),
                          bsub))>;
// 16-bits -> float.
defm : UIntToFPROLoadPat<f32, i32, zextloadi16,
                         UCVTFv1i32, ro16, LDRHroW, LDRHroX, hsub>;
def : Pat <(f32 (uint_to_fp (i32
               (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset),
                          hsub))>;
def : Pat <(f32 (uint_to_fp (i32
                   (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
                          (LDURHi GPR64sp:$Rn, simm9:$offset),
                          hsub))>;
// 32-bits are handled in target specific dag combine:
// performIntToFpCombine.
// 64-bits integer to 32-bits floating point, not possible with
// UCVTF on floating point registers (both source and destination
// must have the same size).

// Here are the patterns for 8, 16, 32, and 64-bits to double.
// 8-bits -> double.
defm : UIntToFPROLoadPat<f64, i32, zextloadi8,
                         UCVTFv1i64, ro8, LDRBroW, LDRBroX, bsub>;
def : Pat <(f64 (uint_to_fp (i32
               (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset),
                          bsub))>;
def : Pat <(f64 (uint_to_fp (i32
                  (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
                          (LDURBi GPR64sp:$Rn, simm9:$offset),
                          bsub))>;
// 16-bits -> double.
defm : UIntToFPROLoadPat<f64, i32, zextloadi16,
                         UCVTFv1i64, ro16, LDRHroW, LDRHroX, hsub>;
def : Pat <(f64 (uint_to_fp (i32
               (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset),
                          hsub))>;
def : Pat <(f64 (uint_to_fp (i32
                  (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
                          (LDURHi GPR64sp:$Rn, simm9:$offset),
                          hsub))>;
// 32-bits -> double.
defm : UIntToFPROLoadPat<f64, i32, load,
                         UCVTFv1i64, ro32, LDRSroW, LDRSroX, ssub>;
def : Pat <(f64 (uint_to_fp (i32
               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset),
                          ssub))>;
def : Pat <(f64 (uint_to_fp (i32
                  (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
                          (LDURSi GPR64sp:$Rn, simm9:$offset),
                          ssub))>;
// 64-bits -> double are handled in target specific dag combine:
// performIntToFpCombine.
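// Editor's note (illustrative sketch, not part of this change): the patterns
// above let a small unsigned integer that is loaded from memory be converted
// entirely on the FP/SIMD unit, avoiding a GPR-to-FPR transfer. The function
// name and the exact codegen below are assumptions for illustration only:
//
//   float load_u8_to_f32(const unsigned char *p) {
//       return (float)*p;   /* i32 (zextloadi8 ...) feeding uint_to_fp */
//   }
//
// is expected to select roughly:
//
//   ldr   b0, [x0]      // zero-extending load straight into a SIMD register
//   ucvtf s0, s0        // convert in place; no fmov from a W register
//
// rather than "ldrb w8, [x0]; ucvtf s0, w8".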
//===----------------------------------------------------------------------===//
// Advanced SIMD three different-sized vector instructions.
//===----------------------------------------------------------------------===//

defm ADDHN  : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>;
defm SUBHN  : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
defm PMULL  : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
defm SABAL  : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
                                             int_aarch64_neon_sabd>;
defm SABDL  : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
                                         int_aarch64_neon_sabd>;
defm SADDL  : SIMDLongThreeVectorBHS< 0, 0b0000, "saddl",
                BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
defm SADDW  : SIMDWideThreeVectorBHS< 0, 0b0001, "saddw",
                BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
defm SMLAL  : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
defm SMLSL  : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
defm SMULL  : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
                                               int_aarch64_neon_sqadd>;
defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
                                               int_aarch64_neon_sqsub>;
defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
                                     int_aarch64_neon_sqdmull>;
defm SSUBL  : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
                BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
defm SSUBW  : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
                BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
defm UABAL  : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
                                             int_aarch64_neon_uabd>;
defm UABDL  : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
                                         int_aarch64_neon_uabd>;
defm UADDL  : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
                BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
defm UADDW  : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
                BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
defm UMLAL  : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
    TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
defm UMLSL  : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
defm UMULL  : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
defm USUBL  : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
                BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
defm USUBW  : SIMDWideThreeVectorBHS< 1, 0b0011, "usubw",
                BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;

// Additional patterns for SMULL and UMULL
multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
                                   Instruction INST8B,
                                   Instruction INST4H,
                                   Instruction INST2S> {
  def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
            (INST8B V64:$Rn, V64:$Rm)>;
  def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
            (INST4H V64:$Rn, V64:$Rm)>;
  def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
            (INST2S V64:$Rn, V64:$Rm)>;
}

defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
                               SMULLv4i16_v4i32, SMULLv2i32_v2i64>;
defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
                               UMULLv4i16_v4i32, UMULLv2i32_v2i64>;

// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
                                      Instruction INST8B,
                                      Instruction INST4H,
                                      Instruction INST2S> {
  def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
            (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
  def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
            (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
  def :
Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))), (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>; } defm : Neon_mulacc_widen_patterns< TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>; defm : Neon_mulacc_widen_patterns< TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>; defm : Neon_mulacc_widen_patterns< TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>, SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>; defm : Neon_mulacc_widen_patterns< TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>, UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>; // Patterns for 64-bit pmull def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm), (PMULLv1i64 V64:$Rn, V64:$Rm)>; def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)), (vector_extract (v2i64 V128:$Rm), (i64 1))), (PMULLv2i64 V128:$Rn, V128:$Rm)>; // CodeGen patterns for addhn and subhn instructions, which can actually be // written in LLVM IR without too much difficulty. // ADDHN def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 16))))), (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 32))))), (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; def : Pat<(concat_vectors (v8i8 V64:$Rd), (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))), (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; def : Pat<(concat_vectors (v4i16 V64:$Rd), (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 16))))), (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; def : Pat<(concat_vectors (v2i32 V64:$Rd), (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 32))))), (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; // SUBHN def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))), (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>; def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 16))))), (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>; def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 32))))), (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>; def : Pat<(concat_vectors (v8i8 V64:$Rd), (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))), (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; def : Pat<(concat_vectors (v4i16 V64:$Rd), (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 16))))), (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; def : Pat<(concat_vectors (v2i32 V64:$Rd), (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 32))))), (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub), V128:$Rn, V128:$Rm)>; //---------------------------------------------------------------------------- // AdvSIMD bitwise extract from vector instruction. 
//---------------------------------------------------------------------------- defm EXT : SIMDBitwiseExtract<"ext">; def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))), (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>; def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))), (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>; // We use EXT to handle extract_subvector to copy the upper 64-bits of a // 128-bit vector. def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))), (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>; //---------------------------------------------------------------------------- // AdvSIMD zip vector //---------------------------------------------------------------------------- defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>; defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>; defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>; defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>; defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>; defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>; //---------------------------------------------------------------------------- // AdvSIMD TBL/TBX instructions //---------------------------------------------------------------------------- defm TBL : SIMDTableLookup< 0, "tbl">; defm TBX : SIMDTableLookupTied<1, "tbx">; def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>; def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))), (TBLv16i8One V128:$Ri, V128:$Rn)>; def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd), (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))), (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>; def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd), (v16i8 V128:$Ri), (v16i8 V128:$Rn))), (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>; //---------------------------------------------------------------------------- // AdvSIMD scalar CPY instruction 
//---------------------------------------------------------------------------- defm CPY : SIMDScalarCPY<"cpy">; //---------------------------------------------------------------------------- // AdvSIMD scalar pairwise instructions //---------------------------------------------------------------------------- defm ADDP : SIMDPairwiseScalarD<0, 0b11011, "addp">; defm FADDP : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">; defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">; defm FMAXP : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">; defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">; defm FMINP : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">; def : Pat<(i64 (int_aarch64_neon_saddv (v2i64 V128:$Rn))), (ADDPv2i64p V128:$Rn)>; def : Pat<(i64 (int_aarch64_neon_uaddv (v2i64 V128:$Rn))), (ADDPv2i64p V128:$Rn)>; def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))), (FADDPv2i32p V64:$Rn)>; def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))), (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>; def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))), (FADDPv2i64p V128:$Rn)>; def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))), (FMAXNMPv2i32p V64:$Rn)>; def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))), (FMAXNMPv2i64p V128:$Rn)>; def : Pat<(f32 (int_aarch64_neon_fmaxv (v2f32 V64:$Rn))), (FMAXPv2i32p V64:$Rn)>; def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))), (FMAXPv2i64p V128:$Rn)>; def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))), (FMINNMPv2i32p V64:$Rn)>; def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))), (FMINNMPv2i64p V128:$Rn)>; def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))), (FMINPv2i32p V64:$Rn)>; def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))), (FMINPv2i64p V128:$Rn)>; //---------------------------------------------------------------------------- // AdvSIMD INS/DUP instructions //---------------------------------------------------------------------------- def DUPv8i8gpr : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>; def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>; def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>; def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>; def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>; def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>; def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>; def DUPv2i64lane : SIMDDup64FromElement; def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>; def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>; def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>; def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>; def DUPv8i8lane : SIMDDup8FromElement <0, ".8b", v8i8, V64>; def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>; def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))), (v2f32 (DUPv2i32lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), (i64 0)))>; def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))), (v4f32 (DUPv4i32lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub), (i64 0)))>; def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))), (v2f64 (DUPv2i64lane (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub), (i64 0)))>; def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))), (v4f16 (DUPv4i16lane (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), (i64 0)))>; def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))), (v8f16 (DUPv8i16lane 
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub), (i64 0)))>;

def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
          (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
          (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
          (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
          (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
          (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;

// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
// instruction even if the types don't match: we just have to remap the lane
// carefully. N.b. this trick only applies to truncations.
def VecIndex_x2 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(2 * N->getZExtValue(), MVT::i64);
}]>;
def VecIndex_x4 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(4 * N->getZExtValue(), MVT::i64);
}]>;
def VecIndex_x8 : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(8 * N->getZExtValue(), MVT::i64);
}]>;

multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
                            ValueType Src128VT, ValueType ScalVT,
                            Instruction DUP, SDNodeXForm IdxXFORM> {
  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
                                                        imm:$idx)))),
            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;

  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
                                                        imm:$idx)))),
            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}

defm : DUPWithTruncPats<v8i8,   v4i16, v8i16, i32, DUPv8i8lane,  VecIndex_x2>;
defm : DUPWithTruncPats<v8i8,   v2i32, v4i32, i32, DUPv8i8lane,  VecIndex_x4>;
defm : DUPWithTruncPats<v4i16,  v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;

defm : DUPWithTruncPats<v16i8,  v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
defm : DUPWithTruncPats<v16i8,  v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
defm : DUPWithTruncPats<v8i16,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;

multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
                               SDNodeXForm IdxXFORM> {
  def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn),
                                                            imm:$idx))))),
            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;

  def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn),
                                                            imm:$idx))))),
            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
}

defm : DUPWithTrunci64Pats<v8i8,  DUPv8i8lane,   VecIndex_x8>;
defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane,  VecIndex_x4>;
defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane,  VecIndex_x2>;

defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane,  VecIndex_x8>;
defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane,  VecIndex_x4>;
defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane,  VecIndex_x2>;

// SMOV and UMOV definitions, with some extra patterns for convenience
defm SMOV : SMov;
defm UMOV : UMov;

def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;

// Extracting i8 or i16 elements will have the zero-extend transformed to
// an 'and' mask by type legalization since neither i8 nor i16 are legal types
// for AArch64. Match these patterns here since UMOV already zeroes out the high
// bits of the destination register.
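// Editor's note (illustrative, an assumption about typical codegen, not part
// of this change): in C-with-intrinsics terms, a zero-extending lane read
// such as
//
//   unsigned get_lane3(uint8x16_t v) { return v[3]; }
//
// legalizes to (and (vector_extract ...), 0xff), and the patterns below fold
// the whole expression into a single "umov w0, v0.b[3]", relying on UMOV
// clearing the upper bits for free.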
def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), (i32 0xff)), (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>; def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx), (i32 0xffff)), (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>; defm INS : SIMDIns; def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)), (SUBREG_TO_REG (i32 0), (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>; def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))), (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))), (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), (i32 FPR32:$Rn), ssub))>; def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))), (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), (i64 FPR64:$Rn), dsub))>; def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))), (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>; def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))), (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>; def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn), (f16 FPR16:$Rm), (i64 VectorIndexS:$imm))), (EXTRACT_SUBREG (INSvi16lane (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), (i64 0)), dsub)>; def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn), (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))), (INSvi16lane V128:$Rn, VectorIndexH:$imm, (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)), (i64 0))>; def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn), (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), (EXTRACT_SUBREG (INSvi32lane (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)), VectorIndexS:$imm, (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), (i64 0)), dsub)>; def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn), (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))), (INSvi32lane V128:$Rn, VectorIndexS:$imm, (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)), (i64 0))>; def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn), (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))), (INSvi64lane V128:$Rn, VectorIndexD:$imm, (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)), (i64 0))>; // Copy an element at a constant index in one vector into a constant indexed // element of another. 
// FIXME: refactor to a shared class/def parameterized on vector type, vector
// index type and INS extension
def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane
                   (v16i8 V128:$Vd), VectorIndexB:$idx,
                   (v16i8 V128:$Vs), VectorIndexB:$idx2)),
          (v16i8 (INSvi8lane
                   V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
          )>;
def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane
                   (v8i16 V128:$Vd), VectorIndexH:$idx,
                   (v8i16 V128:$Vs), VectorIndexH:$idx2)),
          (v8i16 (INSvi16lane
                   V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
          )>;
def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane
                   (v4i32 V128:$Vd), VectorIndexS:$idx,
                   (v4i32 V128:$Vs), VectorIndexS:$idx2)),
          (v4i32 (INSvi32lane
                   V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
          )>;
def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
                   (v2i64 V128:$Vd), VectorIndexD:$idx,
                   (v2i64 V128:$Vs), VectorIndexD:$idx2)),
          (v2i64 (INSvi64lane
                   V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
          )>;

multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
                                ValueType VTScal, Instruction INS> {
  def : Pat<(VT128 (vector_insert V128:$src,
                        (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
                        imm:$Immd)),
            (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;

  def : Pat<(VT128 (vector_insert V128:$src,
                        (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
                        imm:$Immd)),
            (INS V128:$src, imm:$Immd,
                 (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;

  def : Pat<(VT64 (vector_insert V64:$src,
                        (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
                        imm:$Immd)),
            (EXTRACT_SUBREG
                (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
                     V128:$Rn, imm:$Immn),
                dsub)>;

  def : Pat<(VT64 (vector_insert V64:$src,
                        (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
                        imm:$Immd)),
            (EXTRACT_SUBREG
                (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
                     (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn),
                dsub)>;
}

defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;

defm : Neon_INS_elt_pattern<v16i8, v8i8,  i32, INSvi8lane>;
defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, INSvi16lane>;
defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi64lane>;

// Floating point vector extractions are codegen'd as a sequence of
// subregister extractions, possibly fed by an INS if the lane number is
// anything other than zero.
def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
          (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
          (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
          (f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
          (f64 (EXTRACT_SUBREG
            (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
                         V128:$Rn, VectorIndexD:$idx),
            dsub))>;
def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
          (f32 (EXTRACT_SUBREG
            (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
                         V128:$Rn, VectorIndexS:$idx),
            ssub))>;
def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
          (f16 (EXTRACT_SUBREG
            (INSvi16lane (v8f16 (IMPLICIT_DEF)), 0,
                         V128:$Rn, VectorIndexH:$idx),
            hsub))>;

// All concat_vectors operations are canonicalised to act on i64 vectors for
// AArch64. In the general case we need an instruction, which might just as
// well be INS.
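// Editor's note (illustrative, assumed codegen, not part of this change):
// combining two 64-bit halves, as vcombine_s8(lo, hi) does, becomes two lane
// inserts into one 128-bit register:
//
//   ins v0.d[0], v1.d[0]   // low half
//   ins v0.d[1], v2.d[0]   // high half
//
// and when the high half is undef, the ConcatUndefPat class below drops the
// second INS entirely, leaving just a subregister insert.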
class ConcatPat<ValueType DstTy, ValueType SrcTy>
  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
        (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
                     (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;

def : ConcatPat<v2i64, v1i64>;
def : ConcatPat<v2f64, v1f64>;
def : ConcatPat<v4i32, v2i32>;
def : ConcatPat<v4f32, v2f32>;
def : ConcatPat<v8i16, v4i16>;
def : ConcatPat<v8f16, v4f16>;
def : ConcatPat<v16i8, v8i8>;

// If the high lanes are undef, though, we can just ignore them:
class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
        (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;

def : ConcatUndefPat<v2i64, v1i64>;
def : ConcatUndefPat<v2f64, v1f64>;
def : ConcatUndefPat<v4i32, v2i32>;
def : ConcatUndefPat<v4f32, v2f32>;
def : ConcatUndefPat<v8i16, v4i16>;
def : ConcatUndefPat<v16i8, v8i8>;

//----------------------------------------------------------------------------
// AdvSIMD across lanes instructions
//----------------------------------------------------------------------------

defm ADDV    : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
defm SMAXV   : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
defm SMINV   : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
defm UMAXV   : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
defm UMINV   : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
defm SADDLV  : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
defm UADDLV  : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
defm FMAXV   : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
defm FMINV   : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>;

multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc, Intrinsic intOp> {
// If there is a sign extension after this intrinsic, consume it as smov already
// performed it
  def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)),
        (i32 (SMOVvi8to32
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
          (i64 0)))>;
  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
        (i32 (SMOVvi8to32
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
          (i64 0)))>;
// If there is a sign extension after this intrinsic, consume it as smov already
// performed it
  def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)),
        (i32 (SMOVvi8to32
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
          (i64 0)))>;
  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
        (i32 (SMOVvi8to32
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
          (i64 0)))>;
// If there is a sign extension after this intrinsic, consume it as smov already
// performed it
  def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)),
        (i32 (SMOVvi16to32
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
          (i64 0)))>;
  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
        (i32 (SMOVvi16to32
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
          (i64 0)))>;
// If there is a sign extension after this intrinsic, consume it as smov already
// performed it
  def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)),
        (i32 (SMOVvi16to32
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
          (i64 0)))>;
  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
        (i32 (SMOVvi16to32
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
          (i64 0)))>;

  def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
          ssub))>;
}

multiclass
SIMDAcrossLanesUnsignedIntrinsic<string baseOpc, Intrinsic intOp> {
// If there is a masking operation keeping only what has been actually
// generated, consume it.
  def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
          ssub))>;
  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
          ssub))>;
// If there is a masking operation keeping only what has been actually
// generated, consume it.
  def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
          ssub))>;
  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
          ssub))>;
// If there is a masking operation keeping only what has been actually
// generated, consume it.
  def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
          ssub))>;
  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
          ssub))>;
// If there is a masking operation keeping only what has been actually
// generated, consume it.
  def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
          ssub))>;
  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
          ssub))>;

  def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
          ssub))>;
}

multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
        (i32 (SMOVvi16to32
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
          (i64 0)))>;
  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
        (i32 (SMOVvi16to32
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
          (i64 0)))>;
  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
          ssub))>;
  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
          ssub))>;
  def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
        (i64 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
          dsub))>;
}

multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
                                                Intrinsic intOp> {
  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
          ssub))>;
  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
          ssub))>;
  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
          ssub))>;
  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
        (i32 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8
(IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
          ssub))>;
  def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
        (i64 (EXTRACT_SUBREG
          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
          dsub))>;
}

defm : SIMDAcrossLanesSignedIntrinsic<"ADDV",  int_aarch64_neon_saddv>;
// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))),
          (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;

defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", int_aarch64_neon_uaddv>;
// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))),
          (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;

defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_aarch64_neon_smaxv>;
def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))),
          (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;

defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_aarch64_neon_sminv>;
def : Pat<(i32 (int_aarch64_neon_sminv (v2i32 V64:$Rn))),
          (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;

defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_aarch64_neon_umaxv>;
def : Pat<(i32 (int_aarch64_neon_umaxv (v2i32 V64:$Rn))),
          (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;

defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_aarch64_neon_uminv>;
def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))),
          (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;

defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;

// The vaddlv_s32 intrinsic gets mapped to SADDLP.
def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
          (i64 (EXTRACT_SUBREG
            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
              (SADDLPv2i32_v1i64 V64:$Rn), dsub),
            dsub))>;
// The vaddlv_u32 intrinsic gets mapped to UADDLP.
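// Editor's note (illustrative, assumed, not part of this change): with only
// two lanes, these "across lanes" reductions degenerate into one widening
// pairwise add; vaddlv_u32(v) is expected to become simply
//
//   uaddlp v0.1d, v0.2s   // one pairwise add, 64-bit result in d0
//
// which is why the pattern below selects UADDLPv2i32_v1i64 rather than a
// true across-vector UADDLV.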
def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))), (i64 (EXTRACT_SUBREG (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), (UADDLPv2i32_v1i64 V64:$Rn), dsub), dsub))>; //------------------------------------------------------------------------------ // AdvSIMD modified immediate instructions //------------------------------------------------------------------------------ // AdvSIMD BIC defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>; // AdvSIMD ORR defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>; def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>; def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; // AdvSIMD FMOV def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8, "fmov", ".2d", [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64, fpimm8, "fmov", ".2s", [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>; def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8, "fmov", ".4s", [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>; // AdvSIMD MOVI // EDIT byte mask: scalar let isReMaterializable = 1, isAsCheapAsAMove = 1 in def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi", [(set FPR64:$Rd, simdimmtype10:$imm8)]>; // The movi_edit node has the immediate value already encoded, so we use // a plain imm0_255 here. def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)), (MOVID imm0_255:$shift)>; def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>; def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>; def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>; def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>; def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>; def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>; def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>; def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>; // EDIT byte mask: 2d // The movi_edit node has the immediate value already encoded, so we use // a plain imm0_255 in the pattern let isReMaterializable = 1, isAsCheapAsAMove = 1 in def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128, simdimmtype10, "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; // Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. // Complexity is added to break a tie with a plain MOVI. 
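// Editor's note (illustrative, not part of this change): on subtargets with
// zero-cycle zeroing (HasZCZ, e.g. Apple Cyclone), materializing 0.0 as
//
//   movi v0.2d, #0000000000000000
//
// is preferred over "fmov s0, wzr" / "fmov d0, xzr", because the movi form
// is the one the hardware recognizes and eliminates at register rename; the
// AddedComplexity on the patterns below merely breaks the tie with a plain
// MOVI so that these zeroing patterns win.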
let AddedComplexity = 1 in { def : Pat<(f32 fpimm0), (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, Requires<[HasZCZ]>; def : Pat<(f64 fpimm0), (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, Requires<[HasZCZ]>; } def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>; def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>; def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>; def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>; def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>; def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>; // EDIT per word & halfword: 2s, 4h, 4s, & 8h defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">; def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), (MOVIv2i32 imm0_255:$imm8, imm:$shift)>; def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), (MOVIv4i32 imm0_255:$imm8, imm:$shift)>; def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), (MOVIv4i16 imm0_255:$imm8, imm:$shift)>; def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))), (MOVIv8i16 imm0_255:$imm8, imm:$shift)>; // EDIT per word: 2s & 4s with MSL shifter def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s", [(set (v2i32 V64:$Rd), (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s", [(set (v4i32 V128:$Rd), (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>; // Per byte: 8b & 16b def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64, imm0_255, "movi", ".8b", [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>; def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255, "movi", ".16b", [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>; // AdvSIMD MVNI // EDIT per word & halfword: 2s, 4h, 4s, & 8h defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">; def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>; def : InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>; def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))), (MVNIv2i32 
imm0_255:$imm8, imm:$shift)>;
def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
          (MVNIv4i32 imm0_255:$imm8, imm:$shift)>;
def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
          (MVNIv4i16 imm0_255:$imm8, imm:$shift)>;
def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
          (MVNIv8i16 imm0_255:$imm8, imm:$shift)>;

// EDIT per word: 2s & 4s with MSL shifter
def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
                    [(set (v2i32 V64:$Rd),
                          (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
                    [(set (v4i32 V128:$Rd),
                          (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;

//----------------------------------------------------------------------------
// AdvSIMD indexed element
//----------------------------------------------------------------------------

let hasSideEffects = 0 in {
  defm FMLA : SIMDFPIndexedSDTied<0, 0b0001, "fmla">;
  defm FMLS : SIMDFPIndexedSDTied<0, 0b0101, "fmls">;
}

// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
// instruction expects the addend first, while the intrinsic expects it last.
// On the other hand, there are quite a few valid combinatorial options due to
// the commutativity of multiplication and the fact that (-x) * y = x * (-y).

defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;

defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
           TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
           TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
           TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
           TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;

multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
  // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
  // and DUP scalar.
  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
                           (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
                                             VectorIndexS:$idx))),
            (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
                           (v2f32 (AArch64duplane32
                                      (v4f32 (insert_subvector undef,
                                                 (v2f32 (fneg V64:$Rm)),
                                                 (i32 0))),
                                      VectorIndexS:$idx)))),
            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
                               VectorIndexS:$idx)>;
  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
                           (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
                               (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub),
                               (i64 0))>;

  // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit
  // and DUP scalar.
def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (AArch64duplane32 (v4f32 (fneg V128:$Rm)), VectorIndexS:$idx))), (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 (AArch64duplane32 (v4f32 (insert_subvector undef, (v2f32 (fneg V64:$Rm)), (i32 0))), VectorIndexS:$idx)))), (FMLSv4i32_indexed V128:$Rd, V128:$Rn, (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (AArch64dup (f32 (fneg FPR32Op:$Rm))))), (FMLSv4i32_indexed V128:$Rd, V128:$Rn, (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>; // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar // (DUPLANE from 64-bit would be trivial). def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (AArch64duplane64 (v2f64 (fneg V128:$Rm)), VectorIndexD:$idx))), (FMLSv2i64_indexed V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>; def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (AArch64dup (f64 (fneg FPR64Op:$Rm))))), (FMLSv2i64_indexed V128:$Rd, V128:$Rn, (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>; // 2 variants for 32-bit scalar version: extract from .2s or from .4s def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), (vector_extract (v4f32 (fneg V128:$Rm)), VectorIndexS:$idx))), (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, V128:$Rm, VectorIndexS:$idx)>; def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn), (vector_extract (v2f32 (fneg V64:$Rm)), VectorIndexS:$idx))), (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn, (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>; // 1 variant for 64-bit scalar version: extract from .1d or from .2d def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn), (vector_extract (v2f64 (fneg V128:$Rm)), VectorIndexS:$idx))), (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn, V128:$Rm, VectorIndexS:$idx)>; } defm : FMLSIndexedAfterNegPatterns< TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >; defm : FMLSIndexedAfterNegPatterns< TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >; defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>; defm FMUL : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>; def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv2i32_indexed V64:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), (i64 0))>; def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))), (FMULv4i32_indexed V128:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub), (i64 0))>; def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), (FMULv2i64_indexed V128:$Rn, (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub), (i64 0))>; defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>; defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl", TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", int_aarch64_neon_smull>; defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, 
"sqdmlal", int_aarch64_neon_sqadd>; defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", int_aarch64_neon_sqsub>; defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl", TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull", int_aarch64_neon_umull>; // A scalar sqdmull with the second operand being a vector lane can be // handled directly with the indexed instruction encoding. def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn), (vector_extract (v4i32 V128:$Vm), VectorIndexS:$idx)), (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>; //---------------------------------------------------------------------------- // AdvSIMD scalar shift instructions //---------------------------------------------------------------------------- defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">; defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">; defm SCVTF : SIMDScalarRShiftSD<0, 0b11100, "scvtf">; defm UCVTF : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">; // Codegen patterns for the above. We don't put these directly on the // instructions because TableGen's type inference can't handle the truth. // Having the same base pattern for fp <--> int totally freaks it out. def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm), (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>; def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm), (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>; def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)), (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)), (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn), vecshiftR64:$imm)), (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>; def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn), vecshiftR64:$imm)), (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>; def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm), (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>; def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm), (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>; def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)), (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn), vecshiftR64:$imm)), (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>; def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn), vecshiftR64:$imm)), (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>; defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>; defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">; defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn", int_aarch64_neon_sqrshrn>; defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun", int_aarch64_neon_sqrshrun>; defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; defm SQSHL : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn", int_aarch64_neon_sqshrn>; defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun", int_aarch64_neon_sqshrun>; defm SRI : 
SIMDScalarRShiftDTied< 1, 0b01000, "sri">; defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", AArch64srshri>; defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra", TriOpFrag<(add node:$LHS, (AArch64srshri node:$MHS, node:$RHS))>>; defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra", TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn", int_aarch64_neon_uqrshrn>; defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn", int_aarch64_neon_uqshrn>; defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", AArch64urshri>; defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra", TriOpFrag<(add node:$LHS, (AArch64urshri node:$MHS, node:$RHS))>>; defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>; defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra", TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))>>; //---------------------------------------------------------------------------- // AdvSIMD vector shift instructions //---------------------------------------------------------------------------- defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>; defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>; defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf", int_aarch64_neon_vcvtfxs2fp>; defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn", int_aarch64_neon_rshrn>; defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>; defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn", BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>; defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>; def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))), (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>; defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn", int_aarch64_neon_sqrshrn>; defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun", int_aarch64_neon_sqrshrun>; defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>; defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>; defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn", int_aarch64_neon_sqshrn>; defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun", int_aarch64_neon_sqshrun>; defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>; def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))), (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>; defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>; defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra", TriOpFrag<(add node:$LHS, (AArch64srshri node:$MHS, node:$RHS))> >; defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll", BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>; defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>; defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra", TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>; defm UCVTF : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf", int_aarch64_neon_vcvtfxu2fp>; defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn", int_aarch64_neon_uqrshrn>; defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>; defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn", 
int_aarch64_neon_uqshrn>;
defm URSHR   : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>;
defm URSRA   : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
                TriOpFrag<(add node:$LHS, (AArch64urshri node:$MHS, node:$RHS))> >;
defm USHLL   : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
                BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
defm USHR    : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
defm USRA    : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
                TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;

// SHRN patterns for when a logical right shift was used instead of arithmetic
// (the immediate guarantees no sign bits actually end up in the result so it
// doesn't matter).
def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
          (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
          (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
          (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;

def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
                                 (trunc (AArch64vlshr (v8i16 V128:$Rn),
                                                      vecshiftR16Narrow:$imm)))),
          (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
                           V128:$Rn, vecshiftR16Narrow:$imm)>;
def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
                                 (trunc (AArch64vlshr (v4i32 V128:$Rn),
                                                      vecshiftR32Narrow:$imm)))),
          (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
                           V128:$Rn, vecshiftR32Narrow:$imm)>;
def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
                                 (trunc (AArch64vlshr (v2i64 V128:$Rn),
                                                      vecshiftR64Narrow:$imm)))),
          (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
                           V128:$Rn, vecshiftR64Narrow:$imm)>;

// Vector sign and zero extensions are implemented with SSHLL and USHLL.
// Anyexts are implemented as zexts.
def : Pat<(v8i16 (sext   (v8i8 V64:$Rn))),  (SSHLLv8i8_shift  V64:$Rn, (i32 0))>;
def : Pat<(v8i16 (zext   (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
def : Pat<(v4i32 (sext   (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
def : Pat<(v4i32 (zext   (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
def : Pat<(v2i64 (sext   (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
def : Pat<(v2i64 (zext   (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;

// Also match an extend from the upper half of a 128-bit source register.
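// For example (illustrative), the dag
//   (v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)))))
// sign-extends the top eight bytes of a 128-bit register, and the
// SSHLLv16i8_shift instruction selected below prints as the second-half form:
//   sshll2 v0.8h, v0.16b, #0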
def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), (USHLLv16i8_shift V128:$Rn, (i32 0))>; def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), (USHLLv16i8_shift V128:$Rn, (i32 0))>; def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))), (SSHLLv16i8_shift V128:$Rn, (i32 0))>; def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), (USHLLv8i16_shift V128:$Rn, (i32 0))>; def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), (USHLLv8i16_shift V128:$Rn, (i32 0))>; def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))), (SSHLLv8i16_shift V128:$Rn, (i32 0))>; def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), (USHLLv4i32_shift V128:$Rn, (i32 0))>; def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), (USHLLv4i32_shift V128:$Rn, (i32 0))>; def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))), (SSHLLv4i32_shift V128:$Rn, (i32 0))>; // Vector shift sxtl aliases def : InstAlias<"sxtl.8h $dst, $src1", (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; def : InstAlias<"sxtl $dst.8h, $src1.8b", (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>; def : InstAlias<"sxtl.4s $dst, $src1", (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; def : InstAlias<"sxtl $dst.4s, $src1.4h", (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>; def : InstAlias<"sxtl.2d $dst, $src1", (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; def : InstAlias<"sxtl $dst.2d, $src1.2s", (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>; // Vector shift sxtl2 aliases def : InstAlias<"sxtl2.8h $dst, $src1", (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; def : InstAlias<"sxtl2 $dst.8h, $src1.16b", (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>; def : InstAlias<"sxtl2.4s $dst, $src1", (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; def : InstAlias<"sxtl2 $dst.4s, $src1.8h", (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>; def : InstAlias<"sxtl2.2d $dst, $src1", (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; def : InstAlias<"sxtl2 $dst.2d, $src1.4s", (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>; // Vector shift uxtl aliases def : InstAlias<"uxtl.8h $dst, $src1", (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; def : InstAlias<"uxtl $dst.8h, $src1.8b", (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>; def : InstAlias<"uxtl.4s $dst, $src1", (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; def : InstAlias<"uxtl $dst.4s, $src1.4h", (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>; def : InstAlias<"uxtl.2d $dst, $src1", (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; def : InstAlias<"uxtl $dst.2d, $src1.2s", (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>; // Vector shift uxtl2 aliases def : InstAlias<"uxtl2.8h $dst, $src1", (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; def : InstAlias<"uxtl2 $dst.8h, $src1.16b", (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>; def : InstAlias<"uxtl2.4s $dst, $src1", (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; def : InstAlias<"uxtl2 $dst.4s, $src1.8h", (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>; def : InstAlias<"uxtl2.2d $dst, $src1", (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; def : InstAlias<"uxtl2 $dst.2d, $src1.4s", (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; // If an integer is about to be converted to a floating point value, // just load it on the floating point unit. // These patterns are more complex because floating point loads do not // support sign extension. // The sign extension has to be explicitly added and is only supported for // one step: byte-to-half, half-to-word, word-to-doubleword. 
// SCVTF GPR -> FPR is 9 cycles.
// SCVTF FPR -> FPR is 4 cycles.
// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
// and still be faster.
// However, this is not good for code size.
// 8-bits -> float. 2 size step-ups.
class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
                            (SSHLLv4i16_shift
                              (f64 (EXTRACT_SUBREG
                                     (SSHLLv8i8_shift
                                       (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
                                                      INST, bsub),
                                       0),
                                     dsub)),
                              0),
                            ssub)))>,
    Requires<[NotForCodeSize, IsCyclone]>;

def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
                          (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
                          (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
                          (LDURBi GPR64sp:$Rn, simm9:$offset)>;

// 16-bits -> float. 1 size step-up.
class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
                            (SSHLLv4i16_shift
                              (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
                                             INST, hsub),
                              0),
                            ssub)))>,
    Requires<[NotForCodeSize]>;

def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;

// 32-bits to 32-bits is handled in the target-specific dag combine
// performIntToFpCombine.
// 64-bit integer to 32-bit floating point is not possible with SCVTF on
// floating point registers (both source and destination must have the same
// size).

// Here are the patterns for 8, 16, 32, and 64-bits to double.
// 8-bits -> double. 3 size step-ups: give up.
// 16-bits -> double. 2 size step-ups.
class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
  : Pat<(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
        (SCVTFv1i64 (f64 (EXTRACT_SUBREG
                            (SSHLLv2i32_shift
                              (f64 (EXTRACT_SUBREG
                                     (SSHLLv4i16_shift
                                       (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
                                                      INST, hsub),
                                       0),
                                     dsub)),
                              0),
                            dsub)))>,
    Requires<[NotForCodeSize, IsCyclone]>;

def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;

// 32-bits -> double. 1 size step-up.
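// Sketch of what the 32-bit pattern below expands to under the scaled
// immediate addressing form (register names illustrative):
//   ldr   s0, [x0]           // FP load, no sign extension needed
//   sshll v0.2d, v0.2s, #0   // lengthen on the FP unit
//   scvtf d0, d0             // FPR -> FPR convert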
class SExtLoadi32CVTf64Pat : Pat <(f64 (sint_to_fp (i32 (load addrmode)))), (SCVTFv1i64 (f64 (EXTRACT_SUBREG (SSHLLv2i32_shift (INSERT_SUBREG (f64 (IMPLICIT_DEF)), INST, ssub), 0), dsub)))>, Requires<[NotForCodeSize]>; def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext), (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>; def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext), (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>; def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>; def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset), (LDURSi GPR64sp:$Rn, simm9:$offset)>; // 64-bits -> double are handled in target specific dag combine: // performIntToFpCombine. //---------------------------------------------------------------------------- // AdvSIMD Load-Store Structure //---------------------------------------------------------------------------- defm LD1 : SIMDLd1Multiple<"ld1">; defm LD2 : SIMDLd2Multiple<"ld2">; defm LD3 : SIMDLd3Multiple<"ld3">; defm LD4 : SIMDLd4Multiple<"ld4">; defm ST1 : SIMDSt1Multiple<"st1">; defm ST2 : SIMDSt2Multiple<"st2">; defm ST3 : SIMDSt3Multiple<"st3">; defm ST4 : SIMDSt4Multiple<"st4">; class Ld1Pat : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>; def : Ld1Pat; def : Ld1Pat; def : Ld1Pat; def : Ld1Pat; def : Ld1Pat; def : Ld1Pat; def : Ld1Pat; def : Ld1Pat; class St1Pat : Pat<(store ty:$Vt, GPR64sp:$Rn), (INST ty:$Vt, GPR64sp:$Rn)>; def : St1Pat; def : St1Pat; def : St1Pat; def : St1Pat; def : St1Pat; def : St1Pat; def : St1Pat; def : St1Pat; //--- // Single-element //--- defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>; defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>; defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>; defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>; let mayLoad = 1, hasSideEffects = 0 in { defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>; defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>; defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>; defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>; defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>; defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>; defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>; defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>; defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>; defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>; defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>; defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>; defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>; defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>; defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>; defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>; } def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), (LD1Rv8b GPR64sp:$Rn)>; def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))), (LD1Rv16b GPR64sp:$Rn)>; def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), (LD1Rv4h GPR64sp:$Rn)>; def : Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))), (LD1Rv8h GPR64sp:$Rn)>; def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), 
(LD1Rv2s GPR64sp:$Rn)>; def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))), (LD1Rv4s GPR64sp:$Rn)>; def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), (LD1Rv2d GPR64sp:$Rn)>; def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))), (LD1Rv1d GPR64sp:$Rn)>; // Grab the floating point version too def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), (LD1Rv2s GPR64sp:$Rn)>; def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))), (LD1Rv4s GPR64sp:$Rn)>; def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), (LD1Rv2d GPR64sp:$Rn)>; def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))), (LD1Rv1d GPR64sp:$Rn)>; def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), (LD1Rv4h GPR64sp:$Rn)>; def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))), (LD1Rv8h GPR64sp:$Rn)>; class Ld1Lane128Pat : Pat<(vector_insert (VTy VecListOne128:$Rd), (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>; def : Ld1Lane128Pat; def : Ld1Lane128Pat; def : Ld1Lane128Pat; def : Ld1Lane128Pat; def : Ld1Lane128Pat; def : Ld1Lane128Pat; def : Ld1Lane128Pat; class Ld1Lane64Pat : Pat<(vector_insert (VTy VecListOne64:$Rd), (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx), (EXTRACT_SUBREG (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub), VecIndex:$idx, GPR64sp:$Rn), dsub)>; def : Ld1Lane64Pat; def : Ld1Lane64Pat; def : Ld1Lane64Pat; def : Ld1Lane64Pat; def : Ld1Lane64Pat; defm LD1 : SIMDLdSt1SingleAliases<"ld1">; defm LD2 : SIMDLdSt2SingleAliases<"ld2">; defm LD3 : SIMDLdSt3SingleAliases<"ld3">; defm LD4 : SIMDLdSt4SingleAliases<"ld4">; // Stores defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>; defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>; defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>; defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>; let AddedComplexity = 19 in class St1Lane128Pat : Pat<(scalar_store (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), GPR64sp:$Rn), (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>; def : St1Lane128Pat; def : St1Lane128Pat; def : St1Lane128Pat; def : St1Lane128Pat; def : St1Lane128Pat; def : St1Lane128Pat; def : St1Lane128Pat; let AddedComplexity = 19 in class St1Lane64Pat : Pat<(scalar_store (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), GPR64sp:$Rn), (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), VecIndex:$idx, GPR64sp:$Rn)>; def : St1Lane64Pat; def : St1Lane64Pat; def : St1Lane64Pat; def : St1Lane64Pat; def : St1Lane64Pat; multiclass St1LanePost64Pat { def : Pat<(scalar_store (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), GPR64sp:$Rn, offset), (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), VecIndex:$idx, GPR64sp:$Rn, XZR)>; def : Pat<(scalar_store (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)), GPR64sp:$Rn, GPR64:$Rm), (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub), VecIndex:$idx, GPR64sp:$Rn, $Rm)>; } defm : St1LanePost64Pat; defm : St1LanePost64Pat; defm : St1LanePost64Pat; defm : St1LanePost64Pat; defm : St1LanePost64Pat; defm : St1LanePost64Pat; defm : St1LanePost64Pat; multiclass St1LanePost128Pat { def : Pat<(scalar_store (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), GPR64sp:$Rn, offset), (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>; def : Pat<(scalar_store (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)), GPR64sp:$Rn, GPR64:$Rm), (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, $Rm)>; } defm : 
St1LanePost128Pat;
defm : St1LanePost128Pat;
defm : St1LanePost128Pat;
defm : St1LanePost128Pat;
defm : St1LanePost128Pat;
defm : St1LanePost128Pat;
defm : St1LanePost128Pat;

let mayStore = 1, hasSideEffects = 0 in {
defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>;
defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>;
defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>;
defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>;
defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>;
defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>;
defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>;
defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>;
defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>;
}

defm ST1 : SIMDLdSt1SingleAliases<"st1">;
defm ST2 : SIMDLdSt2SingleAliases<"st2">;
defm ST3 : SIMDLdSt3SingleAliases<"st3">;
defm ST4 : SIMDLdSt4SingleAliases<"st4">;

//----------------------------------------------------------------------------
// Crypto extensions
//----------------------------------------------------------------------------

def AESErr   : AESTiedInst<0b0100, "aese",   int_aarch64_crypto_aese>;
def AESDrr   : AESTiedInst<0b0101, "aesd",   int_aarch64_crypto_aesd>;
def AESMCrr  : AESInst<    0b0110, "aesmc",  int_aarch64_crypto_aesmc>;
def AESIMCrr : AESInst<    0b0111, "aesimc", int_aarch64_crypto_aesimc>;

def SHA1Crrr   : SHATiedInstQSV<0b000, "sha1c",   int_aarch64_crypto_sha1c>;
def SHA1Prrr   : SHATiedInstQSV<0b001, "sha1p",   int_aarch64_crypto_sha1p>;
def SHA1Mrrr   : SHATiedInstQSV<0b010, "sha1m",   int_aarch64_crypto_sha1m>;
def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>;
def SHA256Hrrr   : SHATiedInstQQV<0b100, "sha256h",   int_aarch64_crypto_sha256h>;
def SHA256H2rrr  : SHATiedInstQQV<0b101, "sha256h2",  int_aarch64_crypto_sha256h2>;
def SHA256SU1rrr : SHATiedInstVVV<0b110, "sha256su1", int_aarch64_crypto_sha256su1>;

def SHA1Hrr     : SHAInstSS<    0b0000, "sha1h",     int_aarch64_crypto_sha1h>;
def SHA1SU1rr   : SHATiedInstVV<0b0001, "sha1su1",   int_aarch64_crypto_sha1su1>;
def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0", int_aarch64_crypto_sha256su0>;

//----------------------------------------------------------------------------
// Compiler-pseudos
//----------------------------------------------------------------------------
// FIXME: Like for X86, these should go in their own separate .td file.

// Any instruction that defines a 32-bit result zeroes the high half of the
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
// be copying from a truncate. But any other 32-bit operation will zero-extend
// up to 64 bits.
// FIXME: X86 also checks for CMOV here. Do we need something similar?
def def32 : PatLeaf<(i32 GPR32:$src), [{
  return N->getOpcode() != ISD::TRUNCATE &&
         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
         N->getOpcode() != ISD::CopyFromReg;
}]>;

// In the case of a 32-bit def that is known to implicitly zero-extend,
// we can use a SUBREG_TO_REG.
def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;

// For an anyext, we don't care what the high bits are, so we can perform an
// INSERT_SUBREG into an IMPLICIT_DEF.
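// E.g. (illustrative) for (i64 (anyext GPR32:$src)) the INSERT_SUBREG below
// typically costs no instruction at all: the register coalescer resolves the
// IMPLICIT_DEF + INSERT_SUBREG pair into a plain subregister copy.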
def : Pat<(i64 (anyext GPR32:$src)), (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>; // When we need to explicitly zero-extend, we use an unsigned bitfield move // instruction (UBFM) on the enclosing super-reg. def : Pat<(i64 (zext GPR32:$src)), (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; // To sign extend, we use a signed bitfield move instruction (SBFM) on the // containing super-reg. def : Pat<(i64 (sext GPR32:$src)), (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>; def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>; def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>; def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>; def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>; def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>; def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>; def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>; def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)), (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), (i64 (i32shift_sext_i8 imm0_31:$imm)))>; def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)), (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), (i64 (i64shift_sext_i8 imm0_63:$imm)))>; def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)), (SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)), (i64 (i32shift_sext_i16 imm0_31:$imm)))>; def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)), (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)), (i64 (i64shift_sext_i16 imm0_63:$imm)))>; def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)), (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), (i64 (i64shift_a imm0_63:$imm)), (i64 (i64shift_sext_i32 imm0_63:$imm)))>; // sra patterns have an AddedComplexity of 10, so make sure we have a higher // AddedComplexity for the following patterns since we want to match sext + sra // patterns before we attempt to match a single sra node. let AddedComplexity = 20 in { // We support all sext + sra combinations which preserve at least one bit of the // original value which is to be sign extended. E.g. we support shifts up to // bitwidth-1 bits. def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)), (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>; def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)), (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>; def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)), (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>; def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)), (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>; def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)), (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32), (i64 imm0_31:$imm), 31)>; } // AddedComplexity = 20 // To truncate, we can simply extract from a subregister. def : Pat<(i32 (trunc GPR64sp:$src)), (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>; // __builtin_trap() uses the BRK instruction on AArch64. def : Pat<(trap), (BRK 1)>; // Conversions within AdvSIMD types in the same register size are free. // But because we need a consistent lane ordering, in big endian many // conversions require one or more REV instructions. // // Consider a simple memory load followed by a bitconvert then a store. 
//     v0 = load v2i32
//     v1 = BITCAST v2i32 v0 to v4i16
//     store v4i16 v1
//
// In big endian mode every memory access has an implicit byte swap. LDR and
// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that
// is, they treat the vector as a sequence of elements to be byte-swapped.
// The two pairs of instructions are fundamentally incompatible. We've decided
// to use LD1/ST1 only to simplify compiler implementation.
//
// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes
// the original code sequence:
//     v0 = load v2i32
//     v1 = REV v2i32                  (implicit)
//     v2 = BITCAST v2i32 v1 to v4i16
//     v3 = REV v4i16 v2               (implicit)
//     store v4i16 v3
//
// But this is now broken - the value stored is different from the value
// loaded due to lane reordering. To fix this, on every BITCAST we must
// perform two other REVs:
//     v0 = load v2i32
//     v1 = REV v2i32                  (implicit)
//     v2 = REV v2i32 v1
//     v3 = BITCAST v2i32 v2 to v4i16
//     v4 = REV v4i16 v3
//     v5 = REV v4i16 v4               (implicit)
//     store v4i16 v5
//
// This means an extra two instructions, but actually in most cases the two REV
// instructions can be combined into one. For example:
//     (REV64_2s (REV64_4h X)) === (REV32_4h X)
//
// There is also no 128-bit REV instruction. This must be synthesized with an
// EXT instruction.
//
// Most bitconverts require some sort of conversion. The only exceptions are:
//     a) Identity conversions -  vNfX <-> vNiX
//     b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
//

// Natural vector casts (64 bit)
def : Pat<(v8i8  (AArch64NvCast (v2i32 FPR64:$src))), (v8i8  FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;

def : Pat<(v8i8  (AArch64NvCast (v4i16 FPR64:$src))), (v8i8  FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;

def : Pat<(v8i8  (AArch64NvCast (v8i8 FPR64:$src))), (v8i8  FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;

def : Pat<(v8i8  (AArch64NvCast (f64 FPR64:$src))), (v8i8  FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>;

def : Pat<(v8i8  (AArch64NvCast (v2f32 FPR64:$src))), (v8i8  FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;

// Natural vector casts (128 bit)
def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32
FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; } let Predicates = [IsBE] in { def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>; def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))), (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert 
(v4i16 V64:$Vn))), (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))), (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>; } def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>; def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>; def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>; def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>; def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>; def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), (COPY_TO_REGCLASS V64:$Vn, GPR64)>; let Predicates = [IsLE] in { def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 (REV64v2i32 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 (REV64v4i16 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 (REV64v8i8 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 (REV64v4i16 FPR64:$src))>; def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 (REV64v2i32 FPR64:$src))>; } def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>; def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>; def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 (REV32v4i16 FPR64:$src))>; def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 (REV32v8i8 FPR64:$src))>; def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 (REV64v4i16 FPR64:$src))>; } def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 
FPR64:$src)>; def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>; def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 (REV64v4i16 FPR64:$src))>; def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 (REV32v4i16 FPR64:$src))>; def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 (REV16v8i8 FPR64:$src))>; def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 (REV64v4i16 FPR64:$src))>; def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 (REV32v4i16 FPR64:$src))>; def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 (REV32v4i16 FPR64:$src))>; def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 (REV64v4i16 FPR64:$src))>; } let Predicates = [IsLE] in { def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>; def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 (REV64v4i16 FPR64:$src))>; def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 (REV64v4i16 FPR64:$src))>; def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 (REV64v4i16 FPR64:$src))>; def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 (REV16v8i8 FPR64:$src))>; def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 (REV64v4i16 FPR64:$src))>; def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 (REV64v4i16 FPR64:$src))>; def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 (REV64v4i16 FPR64:$src))>; } let Predicates = [IsLE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>; def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 (REV64v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 (REV32v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 (REV16v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 (REV64v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 (REV32v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 (REV64v8i8 FPR64:$src))>; def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 (REV16v8i8 FPR64:$src))>; } let Predicates = [IsLE] in { def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 
FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 (REV64v2i32 FPR64:$src))>; def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 (REV64v4i16 FPR64:$src))>; def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 (REV64v2i32 FPR64:$src))>; def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 (REV64v8i8 FPR64:$src))>; def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>; def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 (REV64v2i32 FPR64:$src))>; def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 (REV64v4i16 FPR64:$src))>; def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 (REV64v8i8 FPR64:$src))>; def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 (REV64v2i32 FPR64:$src))>; def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 (REV64v4i16 FPR64:$src))>; } def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>; def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 (REV32v4i16 FPR64:$src))>; def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 (REV32v8i8 FPR64:$src))>; def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 (REV64v2i32 FPR64:$src))>; def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 (REV64v4i16 FPR64:$src))>; } def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>; let Predicates = [IsLE] in { def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>; def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), (REV64v4i32 FPR128:$src), (i32 8)))>; def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), (REV64v8i16 
FPR128:$src), (i32 8)))>; def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 (EXTv16i8 (REV64v8i16 FPR128:$src), (REV64v8i16 FPR128:$src), (i32 8)))>; def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 (EXTv16i8 (REV64v4i32 FPR128:$src), (REV64v4i32 FPR128:$src), (i32 8)))>; def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 (EXTv16i8 (REV64v16i8 FPR128:$src), (REV64v16i8 FPR128:$src), (i32 8)))>; } let Predicates = [IsLE] in { def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 (REV64v4i32 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 (REV64v8i16 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 (REV64v8i16 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 (REV64v16i8 FPR128:$src))>; def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 (REV64v4i32 FPR128:$src))>; } def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src), (REV64v4i32 FPR128:$src), (i32 8)))>; def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 (REV32v16i8 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 (REV64v4i32 FPR128:$src))>; def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 (REV64v4i32 FPR128:$src))>; } def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>; def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>; def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 (REV64v4i32 FPR128:$src))>; def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 
(REV64v8i16 FPR128:$src))>; def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 (REV64v16i8 FPR128:$src))>; def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 (REV64v4i32 FPR128:$src))>; def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 (REV64v8i16 FPR128:$src))>; } def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>; def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src), (REV64v4i32 FPR128:$src), (i32 8)))>; def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 (REV64v4i32 FPR128:$src))>; def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 (REV32v8i16 FPR128:$src))>; def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 (REV32v16i8 FPR128:$src))>; def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 (REV64v4i32 FPR128:$src))>; def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 (REV32v8i16 FPR128:$src))>; } def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>; let Predicates = [IsLE] in { def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>; def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src), (REV64v8i16 FPR128:$src), (i32 8)))>; def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 (REV64v8i16 FPR128:$src))>; def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 (REV32v8i16 FPR128:$src))>; def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 (REV16v16i8 FPR128:$src))>; def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 (REV64v8i16 FPR128:$src))>; def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 (REV32v8i16 FPR128:$src))>; def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 (REV32v8i16 FPR128:$src))>; } let Predicates = [IsLE] in { def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>; def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src), (REV64v8i16 FPR128:$src), (i32 8)))>; def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 (REV64v8i16 FPR128:$src))>; def : 
Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 (REV32v8i16 FPR128:$src))>; def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 (REV64v8i16 FPR128:$src))>; def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 (REV16v16i8 FPR128:$src))>; def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 (REV64v8i16 FPR128:$src))>; def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 (REV32v8i16 FPR128:$src))>; } let Predicates = [IsLE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>; def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>; } let Predicates = [IsBE] in { def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src), (REV64v16i8 FPR128:$src), (i32 8)))>; def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 (REV64v16i8 FPR128:$src))>; def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 (REV32v16i8 FPR128:$src))>; def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 (REV16v16i8 FPR128:$src))>; def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 (REV64v16i8 FPR128:$src))>; def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 (REV32v16i8 FPR128:$src))>; def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 (REV16v16i8 FPR128:$src))>; } def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))), (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))), (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))), (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))), (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>; // A 64-bit subvector insert to the first 128-bit vector position // is a subregister copy that needs no instruction. def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)), (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)), (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)), (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)), (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)), (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)), (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>; def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)), (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>; // Use pair-wise add instructions when summing up the lanes for v2f64, v2i64 // or v2f32. 
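// E.g. (illustrative) the v2i64 pattern below folds "v[0] + v[1]" into a
// single scalar pairwise add, roughly:
//   addp d0, v0.2d
//   fmov x0, d0    // only if the i64 result is then needed in a GPR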
def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)), (vector_extract (v2i64 FPR128:$Rn), (i64 1)))), (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>; def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)), (vector_extract (v2f64 FPR128:$Rn), (i64 1)))), (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>; // vector_extract on 64-bit vectors gets promoted to a 128 bit vector, // so we match on v4f32 here, not v2f32. This will also catch adding // the low two lanes of a true v4f32 vector. def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)), (vector_extract (v4f32 FPR128:$Rn), (i64 1))), (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>; // Scalar 64-bit shifts in FPR64 registers. def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>; def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))), (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>; // Tail call return handling. These are all compiler pseudo-instructions, // so no encoding information or anything like that. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in { def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>; def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>; } def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)), (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>; def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)), (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>; include "AArch64InstrAtomics.td" Index: head/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp =================================================================== --- head/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp (revision 280864) +++ head/contrib/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp (revision 280865) @@ -1,201 +1,212 @@ //==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains code to lower AArch64 MachineInstrs to their corresponding // MCInst records. 
// //===----------------------------------------------------------------------===// #include "AArch64MCInstLower.h" #include "MCTargetDesc/AArch64MCExpr.h" #include "Utils/AArch64BaseInfo.h" #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/IR/Mangler.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; +extern cl::opt EnableAArch64ELFLocalDynamicTLSGeneration; + AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer) : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {} MCSymbol * AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { return Printer.getSymbol(MO.getGlobal()); } MCSymbol * AArch64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const { return Printer.GetExternalSymbolSymbol(MO.getSymbolName()); } MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO, MCSymbol *Sym) const { // FIXME: We would like an efficient form for this, so we don't have to do a // lot of extra uniquing. MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None; if ((MO.getTargetFlags() & AArch64II::MO_GOT) != 0) { if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) RefKind = MCSymbolRefExpr::VK_GOTPAGE; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGEOFF) RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF; else llvm_unreachable("Unexpected target flags with MO_GOT on GV operand"); } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) { if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) RefKind = MCSymbolRefExpr::VK_TLVPPAGE; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGEOFF) RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF; else llvm_unreachable("Unexpected target flags with MO_TLS on GV operand"); } else { if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) RefKind = MCSymbolRefExpr::VK_PAGE; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGEOFF) RefKind = MCSymbolRefExpr::VK_PAGEOFF; } const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx); if (!MO.isJTI() && MO.getOffset()) Expr = MCBinaryExpr::CreateAdd( Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); return MCOperand::CreateExpr(Expr); } MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO, MCSymbol *Sym) const { uint32_t RefFlags = 0; if (MO.getTargetFlags() & AArch64II::MO_GOT) RefFlags |= AArch64MCExpr::VK_GOT; else if (MO.getTargetFlags() & AArch64II::MO_TLS) { TLSModel::Model Model; if (MO.isGlobal()) { const GlobalValue *GV = MO.getGlobal(); Model = Printer.TM.getTLSModel(GV); + if (!EnableAArch64ELFLocalDynamicTLSGeneration && + Model == TLSModel::LocalDynamic) + Model = TLSModel::GeneralDynamic; + } else { assert(MO.isSymbol() && StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" && "unexpected external TLS symbol"); + // The general dynamic access sequence is used to get the + // address of _TLS_MODULE_BASE_. 
Model = TLSModel::GeneralDynamic; } switch (Model) { case TLSModel::InitialExec: RefFlags |= AArch64MCExpr::VK_GOTTPREL; break; case TLSModel::LocalExec: RefFlags |= AArch64MCExpr::VK_TPREL; break; case TLSModel::LocalDynamic: RefFlags |= AArch64MCExpr::VK_DTPREL; break; case TLSModel::GeneralDynamic: RefFlags |= AArch64MCExpr::VK_TLSDESC; break; } } else { // No modifier means this is a generic reference, classified as absolute for // the cases where it matters (:abs_g0: etc). RefFlags |= AArch64MCExpr::VK_ABS; } if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE) RefFlags |= AArch64MCExpr::VK_PAGE; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGEOFF) RefFlags |= AArch64MCExpr::VK_PAGEOFF; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3) RefFlags |= AArch64MCExpr::VK_G3; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G2) RefFlags |= AArch64MCExpr::VK_G2; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G1) RefFlags |= AArch64MCExpr::VK_G1; else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0) RefFlags |= AArch64MCExpr::VK_G0; + else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_HI12) + RefFlags |= AArch64MCExpr::VK_HI12; if (MO.getTargetFlags() & AArch64II::MO_NC) RefFlags |= AArch64MCExpr::VK_NC; const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx); if (!MO.isJTI() && MO.getOffset()) Expr = MCBinaryExpr::CreateAdd( Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx); AArch64MCExpr::VariantKind RefKind; RefKind = static_cast(RefFlags); Expr = AArch64MCExpr::Create(Expr, RefKind, Ctx); return MCOperand::CreateExpr(Expr); } MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const { if (TargetTriple.isOSDarwin()) return lowerSymbolOperandDarwin(MO, Sym); assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target"); return lowerSymbolOperandELF(MO, Sym); } bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { switch (MO.getType()) { default: llvm_unreachable("unknown operand type"); case MachineOperand::MO_Register: // Ignore all implicit register operands. if (MO.isImplicit()) return false; MCOp = MCOperand::CreateReg(MO.getReg()); break; case MachineOperand::MO_RegisterMask: // Regmasks are like implicit defs. 
return false; case MachineOperand::MO_Immediate: MCOp = MCOperand::CreateImm(MO.getImm()); break; case MachineOperand::MO_MachineBasicBlock: MCOp = MCOperand::CreateExpr( MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx)); break; case MachineOperand::MO_GlobalAddress: MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO)); break; case MachineOperand::MO_ExternalSymbol: MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO)); break; case MachineOperand::MO_JumpTableIndex: MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex())); break; case MachineOperand::MO_ConstantPoolIndex: MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex())); break; case MachineOperand::MO_BlockAddress: MCOp = LowerSymbolOperand( MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress())); break; } return true; } void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.setOpcode(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MCOperand MCOp; if (lowerOperand(MI->getOperand(i), MCOp)) OutMI.addOperand(MCOp); } } Index: head/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h =================================================================== --- head/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h (revision 280864) +++ head/contrib/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h (revision 280865) @@ -1,1285 +1,1290 @@ //===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains small standalone helper functions and enum definitions for // the AArch64 target useful for the compiler back-end and the MC libraries. // As such, it deliberately does not include references to LLVM core // code gen types, passes, etc.. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H #define LLVM_LIB_TARGET_AARCH64_UTILS_AARCH64BASEINFO_H // FIXME: Is it easiest to fix this layering violation by moving the .inc // #includes from AArch64MCTargetDesc.h to here? #include "MCTargetDesc/AArch64MCTargetDesc.h" // For AArch64::X0 and friends. 
#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" namespace llvm { inline static unsigned getWRegFromXReg(unsigned Reg) { switch (Reg) { case AArch64::X0: return AArch64::W0; case AArch64::X1: return AArch64::W1; case AArch64::X2: return AArch64::W2; case AArch64::X3: return AArch64::W3; case AArch64::X4: return AArch64::W4; case AArch64::X5: return AArch64::W5; case AArch64::X6: return AArch64::W6; case AArch64::X7: return AArch64::W7; case AArch64::X8: return AArch64::W8; case AArch64::X9: return AArch64::W9; case AArch64::X10: return AArch64::W10; case AArch64::X11: return AArch64::W11; case AArch64::X12: return AArch64::W12; case AArch64::X13: return AArch64::W13; case AArch64::X14: return AArch64::W14; case AArch64::X15: return AArch64::W15; case AArch64::X16: return AArch64::W16; case AArch64::X17: return AArch64::W17; case AArch64::X18: return AArch64::W18; case AArch64::X19: return AArch64::W19; case AArch64::X20: return AArch64::W20; case AArch64::X21: return AArch64::W21; case AArch64::X22: return AArch64::W22; case AArch64::X23: return AArch64::W23; case AArch64::X24: return AArch64::W24; case AArch64::X25: return AArch64::W25; case AArch64::X26: return AArch64::W26; case AArch64::X27: return AArch64::W27; case AArch64::X28: return AArch64::W28; case AArch64::FP: return AArch64::W29; case AArch64::LR: return AArch64::W30; case AArch64::SP: return AArch64::WSP; case AArch64::XZR: return AArch64::WZR; } // For anything else, return it unchanged. return Reg; } inline static unsigned getXRegFromWReg(unsigned Reg) { switch (Reg) { case AArch64::W0: return AArch64::X0; case AArch64::W1: return AArch64::X1; case AArch64::W2: return AArch64::X2; case AArch64::W3: return AArch64::X3; case AArch64::W4: return AArch64::X4; case AArch64::W5: return AArch64::X5; case AArch64::W6: return AArch64::X6; case AArch64::W7: return AArch64::X7; case AArch64::W8: return AArch64::X8; case AArch64::W9: return AArch64::X9; case AArch64::W10: return AArch64::X10; case AArch64::W11: return AArch64::X11; case AArch64::W12: return AArch64::X12; case AArch64::W13: return AArch64::X13; case AArch64::W14: return AArch64::X14; case AArch64::W15: return AArch64::X15; case AArch64::W16: return AArch64::X16; case AArch64::W17: return AArch64::X17; case AArch64::W18: return AArch64::X18; case AArch64::W19: return AArch64::X19; case AArch64::W20: return AArch64::X20; case AArch64::W21: return AArch64::X21; case AArch64::W22: return AArch64::X22; case AArch64::W23: return AArch64::X23; case AArch64::W24: return AArch64::X24; case AArch64::W25: return AArch64::X25; case AArch64::W26: return AArch64::X26; case AArch64::W27: return AArch64::X27; case AArch64::W28: return AArch64::X28; case AArch64::W29: return AArch64::FP; case AArch64::W30: return AArch64::LR; case AArch64::WSP: return AArch64::SP; case AArch64::WZR: return AArch64::XZR; } // For anything else, return it unchanged. 
return Reg; } static inline unsigned getBRegFromDReg(unsigned Reg) { switch (Reg) { case AArch64::D0: return AArch64::B0; case AArch64::D1: return AArch64::B1; case AArch64::D2: return AArch64::B2; case AArch64::D3: return AArch64::B3; case AArch64::D4: return AArch64::B4; case AArch64::D5: return AArch64::B5; case AArch64::D6: return AArch64::B6; case AArch64::D7: return AArch64::B7; case AArch64::D8: return AArch64::B8; case AArch64::D9: return AArch64::B9; case AArch64::D10: return AArch64::B10; case AArch64::D11: return AArch64::B11; case AArch64::D12: return AArch64::B12; case AArch64::D13: return AArch64::B13; case AArch64::D14: return AArch64::B14; case AArch64::D15: return AArch64::B15; case AArch64::D16: return AArch64::B16; case AArch64::D17: return AArch64::B17; case AArch64::D18: return AArch64::B18; case AArch64::D19: return AArch64::B19; case AArch64::D20: return AArch64::B20; case AArch64::D21: return AArch64::B21; case AArch64::D22: return AArch64::B22; case AArch64::D23: return AArch64::B23; case AArch64::D24: return AArch64::B24; case AArch64::D25: return AArch64::B25; case AArch64::D26: return AArch64::B26; case AArch64::D27: return AArch64::B27; case AArch64::D28: return AArch64::B28; case AArch64::D29: return AArch64::B29; case AArch64::D30: return AArch64::B30; case AArch64::D31: return AArch64::B31; } // For anything else, return it unchanged. return Reg; } static inline unsigned getDRegFromBReg(unsigned Reg) { switch (Reg) { case AArch64::B0: return AArch64::D0; case AArch64::B1: return AArch64::D1; case AArch64::B2: return AArch64::D2; case AArch64::B3: return AArch64::D3; case AArch64::B4: return AArch64::D4; case AArch64::B5: return AArch64::D5; case AArch64::B6: return AArch64::D6; case AArch64::B7: return AArch64::D7; case AArch64::B8: return AArch64::D8; case AArch64::B9: return AArch64::D9; case AArch64::B10: return AArch64::D10; case AArch64::B11: return AArch64::D11; case AArch64::B12: return AArch64::D12; case AArch64::B13: return AArch64::D13; case AArch64::B14: return AArch64::D14; case AArch64::B15: return AArch64::D15; case AArch64::B16: return AArch64::D16; case AArch64::B17: return AArch64::D17; case AArch64::B18: return AArch64::D18; case AArch64::B19: return AArch64::D19; case AArch64::B20: return AArch64::D20; case AArch64::B21: return AArch64::D21; case AArch64::B22: return AArch64::D22; case AArch64::B23: return AArch64::D23; case AArch64::B24: return AArch64::D24; case AArch64::B25: return AArch64::D25; case AArch64::B26: return AArch64::D26; case AArch64::B27: return AArch64::D27; case AArch64::B28: return AArch64::D28; case AArch64::B29: return AArch64::D29; case AArch64::B30: return AArch64::D30; case AArch64::B31: return AArch64::D31; } // For anything else, return it unchanged. return Reg; } namespace AArch64CC { // The CondCodes constants map directly to the 4-bit encoding of the condition // field for predicated instructions. 
enum CondCode { // Meaning (integer) Meaning (floating-point) EQ = 0x0, // Equal Equal NE = 0x1, // Not equal Not equal, or unordered HS = 0x2, // Unsigned higher or same >, ==, or unordered LO = 0x3, // Unsigned lower Less than MI = 0x4, // Minus, negative Less than PL = 0x5, // Plus, positive or zero >, ==, or unordered VS = 0x6, // Overflow Unordered VC = 0x7, // No overflow Not unordered HI = 0x8, // Unsigned higher Greater than, or unordered LS = 0x9, // Unsigned lower or same Less than or equal GE = 0xa, // Greater than or equal Greater than or equal LT = 0xb, // Less than Less than, or unordered GT = 0xc, // Greater than Greater than LE = 0xd, // Less than or equal <, ==, or unordered AL = 0xe, // Always (unconditional) Always (unconditional) NV = 0xf, // Always (unconditional) Always (unconditional) // Note the NV exists purely to disassemble 0b1111. Execution is "always". Invalid }; inline static const char *getCondCodeName(CondCode Code) { switch (Code) { default: llvm_unreachable("Unknown condition code"); case EQ: return "eq"; case NE: return "ne"; case HS: return "hs"; case LO: return "lo"; case MI: return "mi"; case PL: return "pl"; case VS: return "vs"; case VC: return "vc"; case HI: return "hi"; case LS: return "ls"; case GE: return "ge"; case LT: return "lt"; case GT: return "gt"; case LE: return "le"; case AL: return "al"; case NV: return "nv"; } } inline static CondCode getInvertedCondCode(CondCode Code) { // To reverse a condition it's necessary to only invert the low bit: return static_cast(static_cast(Code) ^ 0x1); } /// Given a condition code, return NZCV flags that would satisfy that condition. /// The flag bits are in the format expected by the ccmp instructions. /// Note that many different flag settings can satisfy a given condition code, /// this function just returns one of them. inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) { // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7. enum { N = 8, Z = 4, C = 2, V = 1 }; switch (Code) { default: llvm_unreachable("Unknown condition code"); case EQ: return Z; // Z == 1 case NE: return 0; // Z == 0 case HS: return C; // C == 1 case LO: return 0; // C == 0 case MI: return N; // N == 1 case PL: return 0; // N == 0 case VS: return V; // V == 1 case VC: return 0; // V == 0 case HI: return C; // C == 1 && Z == 0 case LS: return 0; // C == 0 || Z == 1 case GE: return 0; // N == V case LT: return N; // N != V case GT: return 0; // Z == 0 && N == V case LE: return Z; // Z == 1 || N != V } } } // end namespace AArch64CC /// Instances of this class can perform bidirectional mapping from random /// identifier strings to operand encodings. For example "MSR" takes a named /// system-register which must be encoded somehow and decoded for printing. This /// central location means that the information for those transformations is not /// duplicated and remains in sync. /// /// FIXME: currently the algorithm is a completely unoptimised linear /// search. Obviously this could be improved, but we would probably want to work /// out just how often these instructions are emitted before working on it. It /// might even be optimal to just reorder the tables for the common instructions /// rather than changing the algorithm. 
struct AArch64NamedImmMapper { struct Mapping { const char *Name; uint32_t Value; }; template AArch64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {} StringRef toString(uint32_t Value, bool &Valid) const; uint32_t fromString(StringRef Name, bool &Valid) const; /// Many of the instructions allow an alternative assembly form consisting of /// a simple immediate. Currently the only valid forms are ranges [0, N) where /// N being 0 indicates no immediate syntax-form is allowed. bool validImm(uint32_t Value) const; protected: const Mapping *Pairs; size_t NumPairs; uint32_t TooBigImm; }; namespace AArch64AT { enum ATValues { Invalid = -1, // Op0 Op1 CRn CRm Op2 S1E1R = 0x43c0, // 01 000 0111 1000 000 S1E2R = 0x63c0, // 01 100 0111 1000 000 S1E3R = 0x73c0, // 01 110 0111 1000 000 S1E1W = 0x43c1, // 01 000 0111 1000 001 S1E2W = 0x63c1, // 01 100 0111 1000 001 S1E3W = 0x73c1, // 01 110 0111 1000 001 S1E0R = 0x43c2, // 01 000 0111 1000 010 S1E0W = 0x43c3, // 01 000 0111 1000 011 S12E1R = 0x63c4, // 01 100 0111 1000 100 S12E1W = 0x63c5, // 01 100 0111 1000 101 S12E0R = 0x63c6, // 01 100 0111 1000 110 S12E0W = 0x63c7 // 01 100 0111 1000 111 }; struct ATMapper : AArch64NamedImmMapper { const static Mapping ATPairs[]; ATMapper(); }; } namespace AArch64DB { enum DBValues { Invalid = -1, OSHLD = 0x1, OSHST = 0x2, OSH = 0x3, NSHLD = 0x5, NSHST = 0x6, NSH = 0x7, ISHLD = 0x9, ISHST = 0xa, ISH = 0xb, LD = 0xd, ST = 0xe, SY = 0xf }; struct DBarrierMapper : AArch64NamedImmMapper { const static Mapping DBarrierPairs[]; DBarrierMapper(); }; } namespace AArch64DC { enum DCValues { Invalid = -1, // Op1 CRn CRm Op2 ZVA = 0x5ba1, // 01 011 0111 0100 001 IVAC = 0x43b1, // 01 000 0111 0110 001 ISW = 0x43b2, // 01 000 0111 0110 010 CVAC = 0x5bd1, // 01 011 0111 1010 001 CSW = 0x43d2, // 01 000 0111 1010 010 CVAU = 0x5bd9, // 01 011 0111 1011 001 CIVAC = 0x5bf1, // 01 011 0111 1110 001 CISW = 0x43f2 // 01 000 0111 1110 010 }; struct DCMapper : AArch64NamedImmMapper { const static Mapping DCPairs[]; DCMapper(); }; } namespace AArch64IC { enum ICValues { Invalid = -1, // Op1 CRn CRm Op2 IALLUIS = 0x0388, // 000 0111 0001 000 IALLU = 0x03a8, // 000 0111 0101 000 IVAU = 0x1ba9 // 011 0111 0101 001 }; struct ICMapper : AArch64NamedImmMapper { const static Mapping ICPairs[]; ICMapper(); }; static inline bool NeedsRegister(ICValues Val) { return Val == IVAU; } } namespace AArch64ISB { enum ISBValues { Invalid = -1, SY = 0xf }; struct ISBMapper : AArch64NamedImmMapper { const static Mapping ISBPairs[]; ISBMapper(); }; } namespace AArch64PRFM { enum PRFMValues { Invalid = -1, PLDL1KEEP = 0x00, PLDL1STRM = 0x01, PLDL2KEEP = 0x02, PLDL2STRM = 0x03, PLDL3KEEP = 0x04, PLDL3STRM = 0x05, PLIL1KEEP = 0x08, PLIL1STRM = 0x09, PLIL2KEEP = 0x0a, PLIL2STRM = 0x0b, PLIL3KEEP = 0x0c, PLIL3STRM = 0x0d, PSTL1KEEP = 0x10, PSTL1STRM = 0x11, PSTL2KEEP = 0x12, PSTL2STRM = 0x13, PSTL3KEEP = 0x14, PSTL3STRM = 0x15 }; struct PRFMMapper : AArch64NamedImmMapper { const static Mapping PRFMPairs[]; PRFMMapper(); }; } namespace AArch64PState { enum PStateValues { Invalid = -1, SPSel = 0x05, DAIFSet = 0x1e, DAIFClr = 0x1f }; struct PStateMapper : AArch64NamedImmMapper { const static Mapping PStatePairs[]; PStateMapper(); }; } namespace AArch64SE { enum ShiftExtSpecifiers { Invalid = -1, LSL, MSL, LSR, ASR, ROR, UXTB, UXTH, UXTW, UXTX, SXTB, SXTH, SXTW, SXTX }; } namespace AArch64Layout { enum VectorLayout { Invalid = -1, VL_8B, VL_4H, VL_2S, VL_1D, VL_16B, VL_8H, VL_4S, VL_2D, // Bare 
layout for the 128-bit vector // (only show ".b", ".h", ".s", ".d" without vector number) VL_B, VL_H, VL_S, VL_D }; } inline static const char * AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) { switch (Layout) { case AArch64Layout::VL_8B: return ".8b"; case AArch64Layout::VL_4H: return ".4h"; case AArch64Layout::VL_2S: return ".2s"; case AArch64Layout::VL_1D: return ".1d"; case AArch64Layout::VL_16B: return ".16b"; case AArch64Layout::VL_8H: return ".8h"; case AArch64Layout::VL_4S: return ".4s"; case AArch64Layout::VL_2D: return ".2d"; case AArch64Layout::VL_B: return ".b"; case AArch64Layout::VL_H: return ".h"; case AArch64Layout::VL_S: return ".s"; case AArch64Layout::VL_D: return ".d"; default: llvm_unreachable("Unknown Vector Layout"); } } inline static AArch64Layout::VectorLayout AArch64StringToVectorLayout(StringRef LayoutStr) { return StringSwitch(LayoutStr) .Case(".8b", AArch64Layout::VL_8B) .Case(".4h", AArch64Layout::VL_4H) .Case(".2s", AArch64Layout::VL_2S) .Case(".1d", AArch64Layout::VL_1D) .Case(".16b", AArch64Layout::VL_16B) .Case(".8h", AArch64Layout::VL_8H) .Case(".4s", AArch64Layout::VL_4S) .Case(".2d", AArch64Layout::VL_2D) .Case(".b", AArch64Layout::VL_B) .Case(".h", AArch64Layout::VL_H) .Case(".s", AArch64Layout::VL_S) .Case(".d", AArch64Layout::VL_D) .Default(AArch64Layout::Invalid); } namespace AArch64SysReg { enum SysRegROValues { MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000 DBGDTRRX_EL0 = 0x9828, // 10 011 0000 0101 000 MDRAR_EL1 = 0x8080, // 10 000 0001 0000 000 OSLSR_EL1 = 0x808c, // 10 000 0001 0001 100 DBGAUTHSTATUS_EL1 = 0x83f6, // 10 000 0111 1110 110 PMCEID0_EL0 = 0xdce6, // 11 011 1001 1100 110 PMCEID1_EL0 = 0xdce7, // 11 011 1001 1100 111 MIDR_EL1 = 0xc000, // 11 000 0000 0000 000 CCSIDR_EL1 = 0xc800, // 11 001 0000 0000 000 CLIDR_EL1 = 0xc801, // 11 001 0000 0000 001 CTR_EL0 = 0xd801, // 11 011 0000 0000 001 MPIDR_EL1 = 0xc005, // 11 000 0000 0000 101 REVIDR_EL1 = 0xc006, // 11 000 0000 0000 110 AIDR_EL1 = 0xc807, // 11 001 0000 0000 111 DCZID_EL0 = 0xd807, // 11 011 0000 0000 111 ID_PFR0_EL1 = 0xc008, // 11 000 0000 0001 000 ID_PFR1_EL1 = 0xc009, // 11 000 0000 0001 001 ID_DFR0_EL1 = 0xc00a, // 11 000 0000 0001 010 ID_AFR0_EL1 = 0xc00b, // 11 000 0000 0001 011 ID_MMFR0_EL1 = 0xc00c, // 11 000 0000 0001 100 ID_MMFR1_EL1 = 0xc00d, // 11 000 0000 0001 101 ID_MMFR2_EL1 = 0xc00e, // 11 000 0000 0001 110 ID_MMFR3_EL1 = 0xc00f, // 11 000 0000 0001 111 ID_ISAR0_EL1 = 0xc010, // 11 000 0000 0010 000 ID_ISAR1_EL1 = 0xc011, // 11 000 0000 0010 001 ID_ISAR2_EL1 = 0xc012, // 11 000 0000 0010 010 ID_ISAR3_EL1 = 0xc013, // 11 000 0000 0010 011 ID_ISAR4_EL1 = 0xc014, // 11 000 0000 0010 100 ID_ISAR5_EL1 = 0xc015, // 11 000 0000 0010 101 ID_A64PFR0_EL1 = 0xc020, // 11 000 0000 0100 000 ID_A64PFR1_EL1 = 0xc021, // 11 000 0000 0100 001 ID_A64DFR0_EL1 = 0xc028, // 11 000 0000 0101 000 ID_A64DFR1_EL1 = 0xc029, // 11 000 0000 0101 001 ID_A64AFR0_EL1 = 0xc02c, // 11 000 0000 0101 100 ID_A64AFR1_EL1 = 0xc02d, // 11 000 0000 0101 101 ID_A64ISAR0_EL1 = 0xc030, // 11 000 0000 0110 000 ID_A64ISAR1_EL1 = 0xc031, // 11 000 0000 0110 001 ID_A64MMFR0_EL1 = 0xc038, // 11 000 0000 0111 000 ID_A64MMFR1_EL1 = 0xc039, // 11 000 0000 0111 001 MVFR0_EL1 = 0xc018, // 11 000 0000 0011 000 MVFR1_EL1 = 0xc019, // 11 000 0000 0011 001 MVFR2_EL1 = 0xc01a, // 11 000 0000 0011 010 RVBAR_EL1 = 0xc601, // 11 000 1100 0000 001 RVBAR_EL2 = 0xe601, // 11 100 1100 0000 001 RVBAR_EL3 = 0xf601, // 11 110 1100 0000 001 ISR_EL1 = 0xc608, // 11 000 1100 0001 000 CNTPCT_EL0 = 0xdf01, // 11 011 
1110 0000 001 CNTVCT_EL0 = 0xdf02, // 11 011 1110 0000 010 // Trace registers TRCSTATR = 0x8818, // 10 001 0000 0011 000 TRCIDR8 = 0x8806, // 10 001 0000 0000 110 TRCIDR9 = 0x880e, // 10 001 0000 0001 110 TRCIDR10 = 0x8816, // 10 001 0000 0010 110 TRCIDR11 = 0x881e, // 10 001 0000 0011 110 TRCIDR12 = 0x8826, // 10 001 0000 0100 110 TRCIDR13 = 0x882e, // 10 001 0000 0101 110 TRCIDR0 = 0x8847, // 10 001 0000 1000 111 TRCIDR1 = 0x884f, // 10 001 0000 1001 111 TRCIDR2 = 0x8857, // 10 001 0000 1010 111 TRCIDR3 = 0x885f, // 10 001 0000 1011 111 TRCIDR4 = 0x8867, // 10 001 0000 1100 111 TRCIDR5 = 0x886f, // 10 001 0000 1101 111 TRCIDR6 = 0x8877, // 10 001 0000 1110 111 TRCIDR7 = 0x887f, // 10 001 0000 1111 111 TRCOSLSR = 0x888c, // 10 001 0001 0001 100 TRCPDSR = 0x88ac, // 10 001 0001 0101 100 TRCDEVAFF0 = 0x8bd6, // 10 001 0111 1010 110 TRCDEVAFF1 = 0x8bde, // 10 001 0111 1011 110 TRCLSR = 0x8bee, // 10 001 0111 1101 110 TRCAUTHSTATUS = 0x8bf6, // 10 001 0111 1110 110 TRCDEVARCH = 0x8bfe, // 10 001 0111 1111 110 TRCDEVID = 0x8b97, // 10 001 0111 0010 111 TRCDEVTYPE = 0x8b9f, // 10 001 0111 0011 111 TRCPIDR4 = 0x8ba7, // 10 001 0111 0100 111 TRCPIDR5 = 0x8baf, // 10 001 0111 0101 111 TRCPIDR6 = 0x8bb7, // 10 001 0111 0110 111 TRCPIDR7 = 0x8bbf, // 10 001 0111 0111 111 TRCPIDR0 = 0x8bc7, // 10 001 0111 1000 111 TRCPIDR1 = 0x8bcf, // 10 001 0111 1001 111 TRCPIDR2 = 0x8bd7, // 10 001 0111 1010 111 TRCPIDR3 = 0x8bdf, // 10 001 0111 1011 111 TRCCIDR0 = 0x8be7, // 10 001 0111 1100 111 TRCCIDR1 = 0x8bef, // 10 001 0111 1101 111 TRCCIDR2 = 0x8bf7, // 10 001 0111 1110 111 TRCCIDR3 = 0x8bff, // 10 001 0111 1111 111 // GICv3 registers ICC_IAR1_EL1 = 0xc660, // 11 000 1100 1100 000 ICC_IAR0_EL1 = 0xc640, // 11 000 1100 1000 000 ICC_HPPIR1_EL1 = 0xc662, // 11 000 1100 1100 010 ICC_HPPIR0_EL1 = 0xc642, // 11 000 1100 1000 010 ICC_RPR_EL1 = 0xc65b, // 11 000 1100 1011 011 ICH_VTR_EL2 = 0xe659, // 11 100 1100 1011 001 ICH_EISR_EL2 = 0xe65b, // 11 100 1100 1011 011 ICH_ELSR_EL2 = 0xe65d // 11 100 1100 1011 101 }; enum SysRegWOValues { DBGDTRTX_EL0 = 0x9828, // 10 011 0000 0101 000 OSLAR_EL1 = 0x8084, // 10 000 0001 0000 100 PMSWINC_EL0 = 0xdce4, // 11 011 1001 1100 100 // Trace Registers TRCOSLAR = 0x8884, // 10 001 0001 0000 100 TRCLAR = 0x8be6, // 10 001 0111 1100 110 // GICv3 registers ICC_EOIR1_EL1 = 0xc661, // 11 000 1100 1100 001 ICC_EOIR0_EL1 = 0xc641, // 11 000 1100 1000 001 ICC_DIR_EL1 = 0xc659, // 11 000 1100 1011 001 ICC_SGI1R_EL1 = 0xc65d, // 11 000 1100 1011 101 ICC_ASGI1R_EL1 = 0xc65e, // 11 000 1100 1011 110 ICC_SGI0R_EL1 = 0xc65f // 11 000 1100 1011 111 }; enum SysRegValues { Invalid = -1, // Op0 Op1 CRn CRm Op2 OSDTRRX_EL1 = 0x8002, // 10 000 0000 0000 010 OSDTRTX_EL1 = 0x801a, // 10 000 0000 0011 010 TEECR32_EL1 = 0x9000, // 10 010 0000 0000 000 MDCCINT_EL1 = 0x8010, // 10 000 0000 0010 000 MDSCR_EL1 = 0x8012, // 10 000 0000 0010 010 DBGDTR_EL0 = 0x9820, // 10 011 0000 0100 000 OSECCR_EL1 = 0x8032, // 10 000 0000 0110 010 DBGVCR32_EL2 = 0xa038, // 10 100 0000 0111 000 DBGBVR0_EL1 = 0x8004, // 10 000 0000 0000 100 DBGBVR1_EL1 = 0x800c, // 10 000 0000 0001 100 DBGBVR2_EL1 = 0x8014, // 10 000 0000 0010 100 DBGBVR3_EL1 = 0x801c, // 10 000 0000 0011 100 DBGBVR4_EL1 = 0x8024, // 10 000 0000 0100 100 DBGBVR5_EL1 = 0x802c, // 10 000 0000 0101 100 DBGBVR6_EL1 = 0x8034, // 10 000 0000 0110 100 DBGBVR7_EL1 = 0x803c, // 10 000 0000 0111 100 DBGBVR8_EL1 = 0x8044, // 10 000 0000 1000 100 DBGBVR9_EL1 = 0x804c, // 10 000 0000 1001 100 DBGBVR10_EL1 = 0x8054, // 10 000 0000 1010 100 DBGBVR11_EL1 = 0x805c, // 10 
000 0000 1011 100 DBGBVR12_EL1 = 0x8064, // 10 000 0000 1100 100 DBGBVR13_EL1 = 0x806c, // 10 000 0000 1101 100 DBGBVR14_EL1 = 0x8074, // 10 000 0000 1110 100 DBGBVR15_EL1 = 0x807c, // 10 000 0000 1111 100 DBGBCR0_EL1 = 0x8005, // 10 000 0000 0000 101 DBGBCR1_EL1 = 0x800d, // 10 000 0000 0001 101 DBGBCR2_EL1 = 0x8015, // 10 000 0000 0010 101 DBGBCR3_EL1 = 0x801d, // 10 000 0000 0011 101 DBGBCR4_EL1 = 0x8025, // 10 000 0000 0100 101 DBGBCR5_EL1 = 0x802d, // 10 000 0000 0101 101 DBGBCR6_EL1 = 0x8035, // 10 000 0000 0110 101 DBGBCR7_EL1 = 0x803d, // 10 000 0000 0111 101 DBGBCR8_EL1 = 0x8045, // 10 000 0000 1000 101 DBGBCR9_EL1 = 0x804d, // 10 000 0000 1001 101 DBGBCR10_EL1 = 0x8055, // 10 000 0000 1010 101 DBGBCR11_EL1 = 0x805d, // 10 000 0000 1011 101 DBGBCR12_EL1 = 0x8065, // 10 000 0000 1100 101 DBGBCR13_EL1 = 0x806d, // 10 000 0000 1101 101 DBGBCR14_EL1 = 0x8075, // 10 000 0000 1110 101 DBGBCR15_EL1 = 0x807d, // 10 000 0000 1111 101 DBGWVR0_EL1 = 0x8006, // 10 000 0000 0000 110 DBGWVR1_EL1 = 0x800e, // 10 000 0000 0001 110 DBGWVR2_EL1 = 0x8016, // 10 000 0000 0010 110 DBGWVR3_EL1 = 0x801e, // 10 000 0000 0011 110 DBGWVR4_EL1 = 0x8026, // 10 000 0000 0100 110 DBGWVR5_EL1 = 0x802e, // 10 000 0000 0101 110 DBGWVR6_EL1 = 0x8036, // 10 000 0000 0110 110 DBGWVR7_EL1 = 0x803e, // 10 000 0000 0111 110 DBGWVR8_EL1 = 0x8046, // 10 000 0000 1000 110 DBGWVR9_EL1 = 0x804e, // 10 000 0000 1001 110 DBGWVR10_EL1 = 0x8056, // 10 000 0000 1010 110 DBGWVR11_EL1 = 0x805e, // 10 000 0000 1011 110 DBGWVR12_EL1 = 0x8066, // 10 000 0000 1100 110 DBGWVR13_EL1 = 0x806e, // 10 000 0000 1101 110 DBGWVR14_EL1 = 0x8076, // 10 000 0000 1110 110 DBGWVR15_EL1 = 0x807e, // 10 000 0000 1111 110 DBGWCR0_EL1 = 0x8007, // 10 000 0000 0000 111 DBGWCR1_EL1 = 0x800f, // 10 000 0000 0001 111 DBGWCR2_EL1 = 0x8017, // 10 000 0000 0010 111 DBGWCR3_EL1 = 0x801f, // 10 000 0000 0011 111 DBGWCR4_EL1 = 0x8027, // 10 000 0000 0100 111 DBGWCR5_EL1 = 0x802f, // 10 000 0000 0101 111 DBGWCR6_EL1 = 0x8037, // 10 000 0000 0110 111 DBGWCR7_EL1 = 0x803f, // 10 000 0000 0111 111 DBGWCR8_EL1 = 0x8047, // 10 000 0000 1000 111 DBGWCR9_EL1 = 0x804f, // 10 000 0000 1001 111 DBGWCR10_EL1 = 0x8057, // 10 000 0000 1010 111 DBGWCR11_EL1 = 0x805f, // 10 000 0000 1011 111 DBGWCR12_EL1 = 0x8067, // 10 000 0000 1100 111 DBGWCR13_EL1 = 0x806f, // 10 000 0000 1101 111 DBGWCR14_EL1 = 0x8077, // 10 000 0000 1110 111 DBGWCR15_EL1 = 0x807f, // 10 000 0000 1111 111 TEEHBR32_EL1 = 0x9080, // 10 010 0001 0000 000 OSDLR_EL1 = 0x809c, // 10 000 0001 0011 100 DBGPRCR_EL1 = 0x80a4, // 10 000 0001 0100 100 DBGCLAIMSET_EL1 = 0x83c6, // 10 000 0111 1000 110 DBGCLAIMCLR_EL1 = 0x83ce, // 10 000 0111 1001 110 CSSELR_EL1 = 0xd000, // 11 010 0000 0000 000 VPIDR_EL2 = 0xe000, // 11 100 0000 0000 000 VMPIDR_EL2 = 0xe005, // 11 100 0000 0000 101 CPACR_EL1 = 0xc082, // 11 000 0001 0000 010 SCTLR_EL1 = 0xc080, // 11 000 0001 0000 000 SCTLR_EL2 = 0xe080, // 11 100 0001 0000 000 SCTLR_EL3 = 0xf080, // 11 110 0001 0000 000 ACTLR_EL1 = 0xc081, // 11 000 0001 0000 001 ACTLR_EL2 = 0xe081, // 11 100 0001 0000 001 ACTLR_EL3 = 0xf081, // 11 110 0001 0000 001 HCR_EL2 = 0xe088, // 11 100 0001 0001 000 SCR_EL3 = 0xf088, // 11 110 0001 0001 000 MDCR_EL2 = 0xe089, // 11 100 0001 0001 001 SDER32_EL3 = 0xf089, // 11 110 0001 0001 001 CPTR_EL2 = 0xe08a, // 11 100 0001 0001 010 CPTR_EL3 = 0xf08a, // 11 110 0001 0001 010 HSTR_EL2 = 0xe08b, // 11 100 0001 0001 011 HACR_EL2 = 0xe08f, // 11 100 0001 0001 111 MDCR_EL3 = 0xf099, // 11 110 0001 0011 001 TTBR0_EL1 = 0xc100, // 11 000 0010 0000 000 TTBR0_EL2 = 
0xe100, // 11 100 0010 0000 000 TTBR0_EL3 = 0xf100, // 11 110 0010 0000 000 TTBR1_EL1 = 0xc101, // 11 000 0010 0000 001 TCR_EL1 = 0xc102, // 11 000 0010 0000 010 TCR_EL2 = 0xe102, // 11 100 0010 0000 010 TCR_EL3 = 0xf102, // 11 110 0010 0000 010 VTTBR_EL2 = 0xe108, // 11 100 0010 0001 000 VTCR_EL2 = 0xe10a, // 11 100 0010 0001 010 DACR32_EL2 = 0xe180, // 11 100 0011 0000 000 SPSR_EL1 = 0xc200, // 11 000 0100 0000 000 SPSR_EL2 = 0xe200, // 11 100 0100 0000 000 SPSR_EL3 = 0xf200, // 11 110 0100 0000 000 ELR_EL1 = 0xc201, // 11 000 0100 0000 001 ELR_EL2 = 0xe201, // 11 100 0100 0000 001 ELR_EL3 = 0xf201, // 11 110 0100 0000 001 SP_EL0 = 0xc208, // 11 000 0100 0001 000 SP_EL1 = 0xe208, // 11 100 0100 0001 000 SP_EL2 = 0xf208, // 11 110 0100 0001 000 SPSel = 0xc210, // 11 000 0100 0010 000 NZCV = 0xda10, // 11 011 0100 0010 000 DAIF = 0xda11, // 11 011 0100 0010 001 CurrentEL = 0xc212, // 11 000 0100 0010 010 SPSR_irq = 0xe218, // 11 100 0100 0011 000 SPSR_abt = 0xe219, // 11 100 0100 0011 001 SPSR_und = 0xe21a, // 11 100 0100 0011 010 SPSR_fiq = 0xe21b, // 11 100 0100 0011 011 FPCR = 0xda20, // 11 011 0100 0100 000 FPSR = 0xda21, // 11 011 0100 0100 001 DSPSR_EL0 = 0xda28, // 11 011 0100 0101 000 DLR_EL0 = 0xda29, // 11 011 0100 0101 001 IFSR32_EL2 = 0xe281, // 11 100 0101 0000 001 AFSR0_EL1 = 0xc288, // 11 000 0101 0001 000 AFSR0_EL2 = 0xe288, // 11 100 0101 0001 000 AFSR0_EL3 = 0xf288, // 11 110 0101 0001 000 AFSR1_EL1 = 0xc289, // 11 000 0101 0001 001 AFSR1_EL2 = 0xe289, // 11 100 0101 0001 001 AFSR1_EL3 = 0xf289, // 11 110 0101 0001 001 ESR_EL1 = 0xc290, // 11 000 0101 0010 000 ESR_EL2 = 0xe290, // 11 100 0101 0010 000 ESR_EL3 = 0xf290, // 11 110 0101 0010 000 FPEXC32_EL2 = 0xe298, // 11 100 0101 0011 000 FAR_EL1 = 0xc300, // 11 000 0110 0000 000 FAR_EL2 = 0xe300, // 11 100 0110 0000 000 FAR_EL3 = 0xf300, // 11 110 0110 0000 000 HPFAR_EL2 = 0xe304, // 11 100 0110 0000 100 PAR_EL1 = 0xc3a0, // 11 000 0111 0100 000 PMCR_EL0 = 0xdce0, // 11 011 1001 1100 000 PMCNTENSET_EL0 = 0xdce1, // 11 011 1001 1100 001 PMCNTENCLR_EL0 = 0xdce2, // 11 011 1001 1100 010 PMOVSCLR_EL0 = 0xdce3, // 11 011 1001 1100 011 PMSELR_EL0 = 0xdce5, // 11 011 1001 1100 101 PMCCNTR_EL0 = 0xdce8, // 11 011 1001 1101 000 PMXEVTYPER_EL0 = 0xdce9, // 11 011 1001 1101 001 PMXEVCNTR_EL0 = 0xdcea, // 11 011 1001 1101 010 PMUSERENR_EL0 = 0xdcf0, // 11 011 1001 1110 000 PMINTENSET_EL1 = 0xc4f1, // 11 000 1001 1110 001 PMINTENCLR_EL1 = 0xc4f2, // 11 000 1001 1110 010 PMOVSSET_EL0 = 0xdcf3, // 11 011 1001 1110 011 MAIR_EL1 = 0xc510, // 11 000 1010 0010 000 MAIR_EL2 = 0xe510, // 11 100 1010 0010 000 MAIR_EL3 = 0xf510, // 11 110 1010 0010 000 AMAIR_EL1 = 0xc518, // 11 000 1010 0011 000 AMAIR_EL2 = 0xe518, // 11 100 1010 0011 000 AMAIR_EL3 = 0xf518, // 11 110 1010 0011 000 VBAR_EL1 = 0xc600, // 11 000 1100 0000 000 VBAR_EL2 = 0xe600, // 11 100 1100 0000 000 VBAR_EL3 = 0xf600, // 11 110 1100 0000 000 RMR_EL1 = 0xc602, // 11 000 1100 0000 010 RMR_EL2 = 0xe602, // 11 100 1100 0000 010 RMR_EL3 = 0xf602, // 11 110 1100 0000 010 CONTEXTIDR_EL1 = 0xc681, // 11 000 1101 0000 001 TPIDR_EL0 = 0xde82, // 11 011 1101 0000 010 TPIDR_EL2 = 0xe682, // 11 100 1101 0000 010 TPIDR_EL3 = 0xf682, // 11 110 1101 0000 010 TPIDRRO_EL0 = 0xde83, // 11 011 1101 0000 011 TPIDR_EL1 = 0xc684, // 11 000 1101 0000 100 CNTFRQ_EL0 = 0xdf00, // 11 011 1110 0000 000 CNTVOFF_EL2 = 0xe703, // 11 100 1110 0000 011 CNTKCTL_EL1 = 0xc708, // 11 000 1110 0001 000 CNTHCTL_EL2 = 0xe708, // 11 100 1110 0001 000 CNTP_TVAL_EL0 = 0xdf10, // 11 011 1110 0010 000 CNTHP_TVAL_EL2 = 
0xe710, // 11 100 1110 0010 000 CNTPS_TVAL_EL1 = 0xff10, // 11 111 1110 0010 000 CNTP_CTL_EL0 = 0xdf11, // 11 011 1110 0010 001 CNTHP_CTL_EL2 = 0xe711, // 11 100 1110 0010 001 CNTPS_CTL_EL1 = 0xff11, // 11 111 1110 0010 001 CNTP_CVAL_EL0 = 0xdf12, // 11 011 1110 0010 010 CNTHP_CVAL_EL2 = 0xe712, // 11 100 1110 0010 010 CNTPS_CVAL_EL1 = 0xff12, // 11 111 1110 0010 010 CNTV_TVAL_EL0 = 0xdf18, // 11 011 1110 0011 000 CNTV_CTL_EL0 = 0xdf19, // 11 011 1110 0011 001 CNTV_CVAL_EL0 = 0xdf1a, // 11 011 1110 0011 010 PMEVCNTR0_EL0 = 0xdf40, // 11 011 1110 1000 000 PMEVCNTR1_EL0 = 0xdf41, // 11 011 1110 1000 001 PMEVCNTR2_EL0 = 0xdf42, // 11 011 1110 1000 010 PMEVCNTR3_EL0 = 0xdf43, // 11 011 1110 1000 011 PMEVCNTR4_EL0 = 0xdf44, // 11 011 1110 1000 100 PMEVCNTR5_EL0 = 0xdf45, // 11 011 1110 1000 101 PMEVCNTR6_EL0 = 0xdf46, // 11 011 1110 1000 110 PMEVCNTR7_EL0 = 0xdf47, // 11 011 1110 1000 111 PMEVCNTR8_EL0 = 0xdf48, // 11 011 1110 1001 000 PMEVCNTR9_EL0 = 0xdf49, // 11 011 1110 1001 001 PMEVCNTR10_EL0 = 0xdf4a, // 11 011 1110 1001 010 PMEVCNTR11_EL0 = 0xdf4b, // 11 011 1110 1001 011 PMEVCNTR12_EL0 = 0xdf4c, // 11 011 1110 1001 100 PMEVCNTR13_EL0 = 0xdf4d, // 11 011 1110 1001 101 PMEVCNTR14_EL0 = 0xdf4e, // 11 011 1110 1001 110 PMEVCNTR15_EL0 = 0xdf4f, // 11 011 1110 1001 111 PMEVCNTR16_EL0 = 0xdf50, // 11 011 1110 1010 000 PMEVCNTR17_EL0 = 0xdf51, // 11 011 1110 1010 001 PMEVCNTR18_EL0 = 0xdf52, // 11 011 1110 1010 010 PMEVCNTR19_EL0 = 0xdf53, // 11 011 1110 1010 011 PMEVCNTR20_EL0 = 0xdf54, // 11 011 1110 1010 100 PMEVCNTR21_EL0 = 0xdf55, // 11 011 1110 1010 101 PMEVCNTR22_EL0 = 0xdf56, // 11 011 1110 1010 110 PMEVCNTR23_EL0 = 0xdf57, // 11 011 1110 1010 111 PMEVCNTR24_EL0 = 0xdf58, // 11 011 1110 1011 000 PMEVCNTR25_EL0 = 0xdf59, // 11 011 1110 1011 001 PMEVCNTR26_EL0 = 0xdf5a, // 11 011 1110 1011 010 PMEVCNTR27_EL0 = 0xdf5b, // 11 011 1110 1011 011 PMEVCNTR28_EL0 = 0xdf5c, // 11 011 1110 1011 100 PMEVCNTR29_EL0 = 0xdf5d, // 11 011 1110 1011 101 PMEVCNTR30_EL0 = 0xdf5e, // 11 011 1110 1011 110 PMCCFILTR_EL0 = 0xdf7f, // 11 011 1110 1111 111 PMEVTYPER0_EL0 = 0xdf60, // 11 011 1110 1100 000 PMEVTYPER1_EL0 = 0xdf61, // 11 011 1110 1100 001 PMEVTYPER2_EL0 = 0xdf62, // 11 011 1110 1100 010 PMEVTYPER3_EL0 = 0xdf63, // 11 011 1110 1100 011 PMEVTYPER4_EL0 = 0xdf64, // 11 011 1110 1100 100 PMEVTYPER5_EL0 = 0xdf65, // 11 011 1110 1100 101 PMEVTYPER6_EL0 = 0xdf66, // 11 011 1110 1100 110 PMEVTYPER7_EL0 = 0xdf67, // 11 011 1110 1100 111 PMEVTYPER8_EL0 = 0xdf68, // 11 011 1110 1101 000 PMEVTYPER9_EL0 = 0xdf69, // 11 011 1110 1101 001 PMEVTYPER10_EL0 = 0xdf6a, // 11 011 1110 1101 010 PMEVTYPER11_EL0 = 0xdf6b, // 11 011 1110 1101 011 PMEVTYPER12_EL0 = 0xdf6c, // 11 011 1110 1101 100 PMEVTYPER13_EL0 = 0xdf6d, // 11 011 1110 1101 101 PMEVTYPER14_EL0 = 0xdf6e, // 11 011 1110 1101 110 PMEVTYPER15_EL0 = 0xdf6f, // 11 011 1110 1101 111 PMEVTYPER16_EL0 = 0xdf70, // 11 011 1110 1110 000 PMEVTYPER17_EL0 = 0xdf71, // 11 011 1110 1110 001 PMEVTYPER18_EL0 = 0xdf72, // 11 011 1110 1110 010 PMEVTYPER19_EL0 = 0xdf73, // 11 011 1110 1110 011 PMEVTYPER20_EL0 = 0xdf74, // 11 011 1110 1110 100 PMEVTYPER21_EL0 = 0xdf75, // 11 011 1110 1110 101 PMEVTYPER22_EL0 = 0xdf76, // 11 011 1110 1110 110 PMEVTYPER23_EL0 = 0xdf77, // 11 011 1110 1110 111 PMEVTYPER24_EL0 = 0xdf78, // 11 011 1110 1111 000 PMEVTYPER25_EL0 = 0xdf79, // 11 011 1110 1111 001 PMEVTYPER26_EL0 = 0xdf7a, // 11 011 1110 1111 010 PMEVTYPER27_EL0 = 0xdf7b, // 11 011 1110 1111 011 PMEVTYPER28_EL0 = 0xdf7c, // 11 011 1110 1111 100 PMEVTYPER29_EL0 = 0xdf7d, // 11 011 1110 
1111 101 PMEVTYPER30_EL0 = 0xdf7e, // 11 011 1110 1111 110 // Trace registers TRCPRGCTLR = 0x8808, // 10 001 0000 0001 000 TRCPROCSELR = 0x8810, // 10 001 0000 0010 000 TRCCONFIGR = 0x8820, // 10 001 0000 0100 000 TRCAUXCTLR = 0x8830, // 10 001 0000 0110 000 TRCEVENTCTL0R = 0x8840, // 10 001 0000 1000 000 TRCEVENTCTL1R = 0x8848, // 10 001 0000 1001 000 TRCSTALLCTLR = 0x8858, // 10 001 0000 1011 000 TRCTSCTLR = 0x8860, // 10 001 0000 1100 000 TRCSYNCPR = 0x8868, // 10 001 0000 1101 000 TRCCCCTLR = 0x8870, // 10 001 0000 1110 000 TRCBBCTLR = 0x8878, // 10 001 0000 1111 000 TRCTRACEIDR = 0x8801, // 10 001 0000 0000 001 TRCQCTLR = 0x8809, // 10 001 0000 0001 001 TRCVICTLR = 0x8802, // 10 001 0000 0000 010 TRCVIIECTLR = 0x880a, // 10 001 0000 0001 010 TRCVISSCTLR = 0x8812, // 10 001 0000 0010 010 TRCVIPCSSCTLR = 0x881a, // 10 001 0000 0011 010 TRCVDCTLR = 0x8842, // 10 001 0000 1000 010 TRCVDSACCTLR = 0x884a, // 10 001 0000 1001 010 TRCVDARCCTLR = 0x8852, // 10 001 0000 1010 010 TRCSEQEVR0 = 0x8804, // 10 001 0000 0000 100 TRCSEQEVR1 = 0x880c, // 10 001 0000 0001 100 TRCSEQEVR2 = 0x8814, // 10 001 0000 0010 100 TRCSEQRSTEVR = 0x8834, // 10 001 0000 0110 100 TRCSEQSTR = 0x883c, // 10 001 0000 0111 100 TRCEXTINSELR = 0x8844, // 10 001 0000 1000 100 TRCCNTRLDVR0 = 0x8805, // 10 001 0000 0000 101 TRCCNTRLDVR1 = 0x880d, // 10 001 0000 0001 101 TRCCNTRLDVR2 = 0x8815, // 10 001 0000 0010 101 TRCCNTRLDVR3 = 0x881d, // 10 001 0000 0011 101 TRCCNTCTLR0 = 0x8825, // 10 001 0000 0100 101 TRCCNTCTLR1 = 0x882d, // 10 001 0000 0101 101 TRCCNTCTLR2 = 0x8835, // 10 001 0000 0110 101 TRCCNTCTLR3 = 0x883d, // 10 001 0000 0111 101 TRCCNTVR0 = 0x8845, // 10 001 0000 1000 101 TRCCNTVR1 = 0x884d, // 10 001 0000 1001 101 TRCCNTVR2 = 0x8855, // 10 001 0000 1010 101 TRCCNTVR3 = 0x885d, // 10 001 0000 1011 101 TRCIMSPEC0 = 0x8807, // 10 001 0000 0000 111 TRCIMSPEC1 = 0x880f, // 10 001 0000 0001 111 TRCIMSPEC2 = 0x8817, // 10 001 0000 0010 111 TRCIMSPEC3 = 0x881f, // 10 001 0000 0011 111 TRCIMSPEC4 = 0x8827, // 10 001 0000 0100 111 TRCIMSPEC5 = 0x882f, // 10 001 0000 0101 111 TRCIMSPEC6 = 0x8837, // 10 001 0000 0110 111 TRCIMSPEC7 = 0x883f, // 10 001 0000 0111 111 TRCRSCTLR2 = 0x8890, // 10 001 0001 0010 000 TRCRSCTLR3 = 0x8898, // 10 001 0001 0011 000 TRCRSCTLR4 = 0x88a0, // 10 001 0001 0100 000 TRCRSCTLR5 = 0x88a8, // 10 001 0001 0101 000 TRCRSCTLR6 = 0x88b0, // 10 001 0001 0110 000 TRCRSCTLR7 = 0x88b8, // 10 001 0001 0111 000 TRCRSCTLR8 = 0x88c0, // 10 001 0001 1000 000 TRCRSCTLR9 = 0x88c8, // 10 001 0001 1001 000 TRCRSCTLR10 = 0x88d0, // 10 001 0001 1010 000 TRCRSCTLR11 = 0x88d8, // 10 001 0001 1011 000 TRCRSCTLR12 = 0x88e0, // 10 001 0001 1100 000 TRCRSCTLR13 = 0x88e8, // 10 001 0001 1101 000 TRCRSCTLR14 = 0x88f0, // 10 001 0001 1110 000 TRCRSCTLR15 = 0x88f8, // 10 001 0001 1111 000 TRCRSCTLR16 = 0x8881, // 10 001 0001 0000 001 TRCRSCTLR17 = 0x8889, // 10 001 0001 0001 001 TRCRSCTLR18 = 0x8891, // 10 001 0001 0010 001 TRCRSCTLR19 = 0x8899, // 10 001 0001 0011 001 TRCRSCTLR20 = 0x88a1, // 10 001 0001 0100 001 TRCRSCTLR21 = 0x88a9, // 10 001 0001 0101 001 TRCRSCTLR22 = 0x88b1, // 10 001 0001 0110 001 TRCRSCTLR23 = 0x88b9, // 10 001 0001 0111 001 TRCRSCTLR24 = 0x88c1, // 10 001 0001 1000 001 TRCRSCTLR25 = 0x88c9, // 10 001 0001 1001 001 TRCRSCTLR26 = 0x88d1, // 10 001 0001 1010 001 TRCRSCTLR27 = 0x88d9, // 10 001 0001 1011 001 TRCRSCTLR28 = 0x88e1, // 10 001 0001 1100 001 TRCRSCTLR29 = 0x88e9, // 10 001 0001 1101 001 TRCRSCTLR30 = 0x88f1, // 10 001 0001 1110 001 TRCRSCTLR31 = 0x88f9, // 10 001 0001 1111 001 TRCSSCCR0 = 
0x8882, // 10 001 0001 0000 010 TRCSSCCR1 = 0x888a, // 10 001 0001 0001 010 TRCSSCCR2 = 0x8892, // 10 001 0001 0010 010 TRCSSCCR3 = 0x889a, // 10 001 0001 0011 010 TRCSSCCR4 = 0x88a2, // 10 001 0001 0100 010 TRCSSCCR5 = 0x88aa, // 10 001 0001 0101 010 TRCSSCCR6 = 0x88b2, // 10 001 0001 0110 010 TRCSSCCR7 = 0x88ba, // 10 001 0001 0111 010 TRCSSCSR0 = 0x88c2, // 10 001 0001 1000 010 TRCSSCSR1 = 0x88ca, // 10 001 0001 1001 010 TRCSSCSR2 = 0x88d2, // 10 001 0001 1010 010 TRCSSCSR3 = 0x88da, // 10 001 0001 1011 010 TRCSSCSR4 = 0x88e2, // 10 001 0001 1100 010 TRCSSCSR5 = 0x88ea, // 10 001 0001 1101 010 TRCSSCSR6 = 0x88f2, // 10 001 0001 1110 010 TRCSSCSR7 = 0x88fa, // 10 001 0001 1111 010 TRCSSPCICR0 = 0x8883, // 10 001 0001 0000 011 TRCSSPCICR1 = 0x888b, // 10 001 0001 0001 011 TRCSSPCICR2 = 0x8893, // 10 001 0001 0010 011 TRCSSPCICR3 = 0x889b, // 10 001 0001 0011 011 TRCSSPCICR4 = 0x88a3, // 10 001 0001 0100 011 TRCSSPCICR5 = 0x88ab, // 10 001 0001 0101 011 TRCSSPCICR6 = 0x88b3, // 10 001 0001 0110 011 TRCSSPCICR7 = 0x88bb, // 10 001 0001 0111 011 TRCPDCR = 0x88a4, // 10 001 0001 0100 100 TRCACVR0 = 0x8900, // 10 001 0010 0000 000 TRCACVR1 = 0x8910, // 10 001 0010 0010 000 TRCACVR2 = 0x8920, // 10 001 0010 0100 000 TRCACVR3 = 0x8930, // 10 001 0010 0110 000 TRCACVR4 = 0x8940, // 10 001 0010 1000 000 TRCACVR5 = 0x8950, // 10 001 0010 1010 000 TRCACVR6 = 0x8960, // 10 001 0010 1100 000 TRCACVR7 = 0x8970, // 10 001 0010 1110 000 TRCACVR8 = 0x8901, // 10 001 0010 0000 001 TRCACVR9 = 0x8911, // 10 001 0010 0010 001 TRCACVR10 = 0x8921, // 10 001 0010 0100 001 TRCACVR11 = 0x8931, // 10 001 0010 0110 001 TRCACVR12 = 0x8941, // 10 001 0010 1000 001 TRCACVR13 = 0x8951, // 10 001 0010 1010 001 TRCACVR14 = 0x8961, // 10 001 0010 1100 001 TRCACVR15 = 0x8971, // 10 001 0010 1110 001 TRCACATR0 = 0x8902, // 10 001 0010 0000 010 TRCACATR1 = 0x8912, // 10 001 0010 0010 010 TRCACATR2 = 0x8922, // 10 001 0010 0100 010 TRCACATR3 = 0x8932, // 10 001 0010 0110 010 TRCACATR4 = 0x8942, // 10 001 0010 1000 010 TRCACATR5 = 0x8952, // 10 001 0010 1010 010 TRCACATR6 = 0x8962, // 10 001 0010 1100 010 TRCACATR7 = 0x8972, // 10 001 0010 1110 010 TRCACATR8 = 0x8903, // 10 001 0010 0000 011 TRCACATR9 = 0x8913, // 10 001 0010 0010 011 TRCACATR10 = 0x8923, // 10 001 0010 0100 011 TRCACATR11 = 0x8933, // 10 001 0010 0110 011 TRCACATR12 = 0x8943, // 10 001 0010 1000 011 TRCACATR13 = 0x8953, // 10 001 0010 1010 011 TRCACATR14 = 0x8963, // 10 001 0010 1100 011 TRCACATR15 = 0x8973, // 10 001 0010 1110 011 TRCDVCVR0 = 0x8904, // 10 001 0010 0000 100 TRCDVCVR1 = 0x8924, // 10 001 0010 0100 100 TRCDVCVR2 = 0x8944, // 10 001 0010 1000 100 TRCDVCVR3 = 0x8964, // 10 001 0010 1100 100 TRCDVCVR4 = 0x8905, // 10 001 0010 0000 101 TRCDVCVR5 = 0x8925, // 10 001 0010 0100 101 TRCDVCVR6 = 0x8945, // 10 001 0010 1000 101 TRCDVCVR7 = 0x8965, // 10 001 0010 1100 101 TRCDVCMR0 = 0x8906, // 10 001 0010 0000 110 TRCDVCMR1 = 0x8926, // 10 001 0010 0100 110 TRCDVCMR2 = 0x8946, // 10 001 0010 1000 110 TRCDVCMR3 = 0x8966, // 10 001 0010 1100 110 TRCDVCMR4 = 0x8907, // 10 001 0010 0000 111 TRCDVCMR5 = 0x8927, // 10 001 0010 0100 111 TRCDVCMR6 = 0x8947, // 10 001 0010 1000 111 TRCDVCMR7 = 0x8967, // 10 001 0010 1100 111 TRCCIDCVR0 = 0x8980, // 10 001 0011 0000 000 TRCCIDCVR1 = 0x8990, // 10 001 0011 0010 000 TRCCIDCVR2 = 0x89a0, // 10 001 0011 0100 000 TRCCIDCVR3 = 0x89b0, // 10 001 0011 0110 000 TRCCIDCVR4 = 0x89c0, // 10 001 0011 1000 000 TRCCIDCVR5 = 0x89d0, // 10 001 0011 1010 000 TRCCIDCVR6 = 0x89e0, // 10 001 0011 1100 000 TRCCIDCVR7 = 0x89f0, // 10 
001 0011 1110 000 TRCVMIDCVR0 = 0x8981, // 10 001 0011 0000 001 TRCVMIDCVR1 = 0x8991, // 10 001 0011 0010 001 TRCVMIDCVR2 = 0x89a1, // 10 001 0011 0100 001 TRCVMIDCVR3 = 0x89b1, // 10 001 0011 0110 001 TRCVMIDCVR4 = 0x89c1, // 10 001 0011 1000 001 TRCVMIDCVR5 = 0x89d1, // 10 001 0011 1010 001 TRCVMIDCVR6 = 0x89e1, // 10 001 0011 1100 001 TRCVMIDCVR7 = 0x89f1, // 10 001 0011 1110 001 TRCCIDCCTLR0 = 0x8982, // 10 001 0011 0000 010 TRCCIDCCTLR1 = 0x898a, // 10 001 0011 0001 010 TRCVMIDCCTLR0 = 0x8992, // 10 001 0011 0010 010 TRCVMIDCCTLR1 = 0x899a, // 10 001 0011 0011 010 TRCITCTRL = 0x8b84, // 10 001 0111 0000 100 TRCCLAIMSET = 0x8bc6, // 10 001 0111 1000 110 TRCCLAIMCLR = 0x8bce, // 10 001 0111 1001 110 // GICv3 registers ICC_BPR1_EL1 = 0xc663, // 11 000 1100 1100 011 ICC_BPR0_EL1 = 0xc643, // 11 000 1100 1000 011 ICC_PMR_EL1 = 0xc230, // 11 000 0100 0110 000 ICC_CTLR_EL1 = 0xc664, // 11 000 1100 1100 100 ICC_CTLR_EL3 = 0xf664, // 11 110 1100 1100 100 ICC_SRE_EL1 = 0xc665, // 11 000 1100 1100 101 ICC_SRE_EL2 = 0xe64d, // 11 100 1100 1001 101 ICC_SRE_EL3 = 0xf665, // 11 110 1100 1100 101 ICC_IGRPEN0_EL1 = 0xc666, // 11 000 1100 1100 110 ICC_IGRPEN1_EL1 = 0xc667, // 11 000 1100 1100 111 ICC_IGRPEN1_EL3 = 0xf667, // 11 110 1100 1100 111 ICC_SEIEN_EL1 = 0xc668, // 11 000 1100 1101 000 ICC_AP0R0_EL1 = 0xc644, // 11 000 1100 1000 100 ICC_AP0R1_EL1 = 0xc645, // 11 000 1100 1000 101 ICC_AP0R2_EL1 = 0xc646, // 11 000 1100 1000 110 ICC_AP0R3_EL1 = 0xc647, // 11 000 1100 1000 111 ICC_AP1R0_EL1 = 0xc648, // 11 000 1100 1001 000 ICC_AP1R1_EL1 = 0xc649, // 11 000 1100 1001 001 ICC_AP1R2_EL1 = 0xc64a, // 11 000 1100 1001 010 ICC_AP1R3_EL1 = 0xc64b, // 11 000 1100 1001 011 ICH_AP0R0_EL2 = 0xe640, // 11 100 1100 1000 000 ICH_AP0R1_EL2 = 0xe641, // 11 100 1100 1000 001 ICH_AP0R2_EL2 = 0xe642, // 11 100 1100 1000 010 ICH_AP0R3_EL2 = 0xe643, // 11 100 1100 1000 011 ICH_AP1R0_EL2 = 0xe648, // 11 100 1100 1001 000 ICH_AP1R1_EL2 = 0xe649, // 11 100 1100 1001 001 ICH_AP1R2_EL2 = 0xe64a, // 11 100 1100 1001 010 ICH_AP1R3_EL2 = 0xe64b, // 11 100 1100 1001 011 ICH_HCR_EL2 = 0xe658, // 11 100 1100 1011 000 ICH_MISR_EL2 = 0xe65a, // 11 100 1100 1011 010 ICH_VMCR_EL2 = 0xe65f, // 11 100 1100 1011 111 ICH_VSEIR_EL2 = 0xe64c, // 11 100 1100 1001 100 ICH_LR0_EL2 = 0xe660, // 11 100 1100 1100 000 ICH_LR1_EL2 = 0xe661, // 11 100 1100 1100 001 ICH_LR2_EL2 = 0xe662, // 11 100 1100 1100 010 ICH_LR3_EL2 = 0xe663, // 11 100 1100 1100 011 ICH_LR4_EL2 = 0xe664, // 11 100 1100 1100 100 ICH_LR5_EL2 = 0xe665, // 11 100 1100 1100 101 ICH_LR6_EL2 = 0xe666, // 11 100 1100 1100 110 ICH_LR7_EL2 = 0xe667, // 11 100 1100 1100 111 ICH_LR8_EL2 = 0xe668, // 11 100 1100 1101 000 ICH_LR9_EL2 = 0xe669, // 11 100 1100 1101 001 ICH_LR10_EL2 = 0xe66a, // 11 100 1100 1101 010 ICH_LR11_EL2 = 0xe66b, // 11 100 1100 1101 011 ICH_LR12_EL2 = 0xe66c, // 11 100 1100 1101 100 ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 }; // Cyclone specific system registers enum CycloneSysRegValues { CPM_IOACC_CTL_EL3 = 0xff90 }; // Note that these do not inherit from AArch64NamedImmMapper. This class is // sufficiently different in its behaviour that I don't believe it's worth // burdening the common AArch64NamedImmMapper with abstractions only needed in // this one case. 
struct SysRegMapper { static const AArch64NamedImmMapper::Mapping SysRegPairs[]; static const AArch64NamedImmMapper::Mapping CycloneSysRegPairs[]; const AArch64NamedImmMapper::Mapping *InstPairs; size_t NumInstPairs; uint64_t FeatureBits; SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { } uint32_t fromString(StringRef Name, bool &Valid) const; std::string toString(uint32_t Bits) const; }; struct MSRMapper : SysRegMapper { static const AArch64NamedImmMapper::Mapping MSRPairs[]; MSRMapper(uint64_t FeatureBits); }; struct MRSMapper : SysRegMapper { static const AArch64NamedImmMapper::Mapping MRSPairs[]; MRSMapper(uint64_t FeatureBits); }; uint32_t ParseGenericRegister(StringRef Name, bool &Valid); } namespace AArch64TLBI { enum TLBIValues { Invalid = -1, // Op0 Op1 CRn CRm Op2 IPAS2E1IS = 0x6401, // 01 100 1000 0000 001 IPAS2LE1IS = 0x6405, // 01 100 1000 0000 101 VMALLE1IS = 0x4418, // 01 000 1000 0011 000 ALLE2IS = 0x6418, // 01 100 1000 0011 000 ALLE3IS = 0x7418, // 01 110 1000 0011 000 VAE1IS = 0x4419, // 01 000 1000 0011 001 VAE2IS = 0x6419, // 01 100 1000 0011 001 VAE3IS = 0x7419, // 01 110 1000 0011 001 ASIDE1IS = 0x441a, // 01 000 1000 0011 010 VAAE1IS = 0x441b, // 01 000 1000 0011 011 ALLE1IS = 0x641c, // 01 100 1000 0011 100 VALE1IS = 0x441d, // 01 000 1000 0011 101 VALE2IS = 0x641d, // 01 100 1000 0011 101 VALE3IS = 0x741d, // 01 110 1000 0011 101 VMALLS12E1IS = 0x641e, // 01 100 1000 0011 110 VAALE1IS = 0x441f, // 01 000 1000 0011 111 IPAS2E1 = 0x6421, // 01 100 1000 0100 001 IPAS2LE1 = 0x6425, // 01 100 1000 0100 101 VMALLE1 = 0x4438, // 01 000 1000 0111 000 ALLE2 = 0x6438, // 01 100 1000 0111 000 ALLE3 = 0x7438, // 01 110 1000 0111 000 VAE1 = 0x4439, // 01 000 1000 0111 001 VAE2 = 0x6439, // 01 100 1000 0111 001 VAE3 = 0x7439, // 01 110 1000 0111 001 ASIDE1 = 0x443a, // 01 000 1000 0111 010 VAAE1 = 0x443b, // 01 000 1000 0111 011 ALLE1 = 0x643c, // 01 100 1000 0111 100 VALE1 = 0x443d, // 01 000 1000 0111 101 VALE2 = 0x643d, // 01 100 1000 0111 101 VALE3 = 0x743d, // 01 110 1000 0111 101 VMALLS12E1 = 0x643e, // 01 100 1000 0111 110 VAALE1 = 0x443f // 01 000 1000 0111 111 }; struct TLBIMapper : AArch64NamedImmMapper { const static Mapping TLBIPairs[]; TLBIMapper(); }; static inline bool NeedsRegister(TLBIValues Val) { switch (Val) { case VMALLE1IS: case ALLE2IS: case ALLE3IS: case ALLE1IS: case VMALLS12E1IS: case VMALLE1: case ALLE2: case ALLE3: case ALLE1: case VMALLS12E1: return false; default: return true; } } } namespace AArch64II { /// Target Operand Flag enum. enum TOF { //===------------------------------------------------------------------===// // AArch64 Specific MachineOperand flags. MO_NO_FLAG, - MO_FRAGMENT = 0x7, + MO_FRAGMENT = 0xf, /// MO_PAGE - A symbol operand with this flag represents the pc-relative /// offset of the 4K page containing the symbol. This is used with the /// ADRP instruction. MO_PAGE = 1, /// MO_PAGEOFF - A symbol operand with this flag represents the offset of /// that symbol within a 4K page. This offset is added to the page address /// to produce the complete address. 
MO_PAGEOFF = 2, /// MO_G3 - A symbol operand with this flag (granule 3) represents the high /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction MO_G3 = 3, /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction MO_G2 = 4, /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction MO_G1 = 5, /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction MO_G0 = 6, + /// MO_HI12 - This flag indicates that a symbol operand represents the bits + /// 13-24 of a 64-bit address, used in an arithmetic immediate-shifted-left- + /// by-12-bits instruction. + MO_HI12 = 7, + /// MO_GOT - This flag indicates that a symbol operand represents the /// address of the GOT entry for the symbol, rather than the address of /// the symbol itself. - MO_GOT = 8, + MO_GOT = 0x10, /// MO_NC - Indicates whether the linker is expected to check the symbol /// reference for overflow. For example in an ADRP/ADD pair of relocations /// the ADRP usually does check, but not the ADD. - MO_NC = 0x10, + MO_NC = 0x20, /// MO_TLS - Indicates that the operand being accessed is some kind of /// thread-local symbol. On Darwin, only one type of thread-local access /// exists (pre linker-relaxation), but on ELF the TLSModel used for the /// referee will affect interpretation. - MO_TLS = 0x20, + MO_TLS = 0x40, /// MO_CONSTPOOL - This flag indicates that a symbol operand represents /// the address of a constant pool entry for the symbol, rather than the /// address of the symbol itself. - MO_CONSTPOOL = 0x40 + MO_CONSTPOOL = 0x80 }; } // end namespace AArch64II } // end namespace llvm #endif
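For illustration only (not part of the committed patch): a minimal C++ sketch of how the re-packed AArch64II target flags above are meant to be decoded once MO_FRAGMENT is widened to 0xf and the modifier bits (MO_GOT, MO_NC, MO_TLS, MO_CONSTPOOL) move above it. The helper names and the sample flag value are hypothetical; only the AArch64II enumerators come from this header, and the include path is assumed to match the one used in AArch64MCInstLower.cpp.

#include "Utils/AArch64BaseInfo.h" // assumed include path, as in AArch64MCInstLower.cpp
#include <cassert>

// With MO_FRAGMENT = 0xf the low four bits select the address fragment
// (MO_PAGE .. MO_HI12) and no longer overlap the modifier bits, so the two
// groups can be masked independently.
static bool isHi12Fragment(unsigned TargetFlags) {
  return (TargetFlags & llvm::AArch64II::MO_FRAGMENT) == llvm::AArch64II::MO_HI12;
}

static void exampleUsage() {
  // A hypothetical :hi12:-style operand: the new fragment value combined with
  // MO_NC, since the linker is not expected to range-check this piece.
  unsigned TF = llvm::AArch64II::MO_HI12 | llvm::AArch64II::MO_NC;
  assert(isHi12Fragment(TF) && (TF & llvm::AArch64II::MO_GOT) == 0);
}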