diff --git a/contrib/llvm/include/llvm/Target/TargetFrameLowering.h b/contrib/llvm/include/llvm/Target/TargetFrameLowering.h
index 277bd98d371c..f17640f71e93 100644
--- a/contrib/llvm/include/llvm/Target/TargetFrameLowering.h
+++ b/contrib/llvm/include/llvm/Target/TargetFrameLowering.h
@@ -1,251 +1,256 @@
 //===-- llvm/Target/TargetFrameLowering.h ---------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // Interface to describe the layout of a stack frame on the target machine.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_TARGET_TARGETFRAMELOWERING_H
 #define LLVM_TARGET_TARGETFRAMELOWERING_H
 
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include <utility>
 #include <vector>
 
 namespace llvm {
   class CalleeSavedInfo;
   class MachineFunction;
   class RegScavenger;
 
 /// Information about stack frame layout on the target.  It holds the direction
 /// of stack growth, the known stack alignment on entry to each function, and
 /// the offset to the locals area.
 ///
 /// The offset to the local area is the offset from the stack pointer on
 /// function entry to the first location where function data (local variables,
 /// spill locations) can be stored.
 class TargetFrameLowering {
 public:
   enum StackDirection {
     StackGrowsUp,        // Adding to the stack increases the stack address
     StackGrowsDown       // Adding to the stack decreases the stack address
   };
 
   // Maps a callee saved register to a stack slot with a fixed offset.
   struct SpillSlot {
     unsigned Reg;
     int Offset; // Offset relative to stack pointer on function entry.
   };
 private:
   StackDirection StackDir;
   unsigned StackAlignment;
   unsigned TransientStackAlignment;
   int LocalAreaOffset;
   bool StackRealignable;
 public:
   TargetFrameLowering(StackDirection D, unsigned StackAl, int LAO,
                       unsigned TransAl = 1, bool StackReal = true)
     : StackDir(D), StackAlignment(StackAl), TransientStackAlignment(TransAl),
       LocalAreaOffset(LAO), StackRealignable(StackReal) {}
 
   virtual ~TargetFrameLowering();
 
   // These methods return information that describes the abstract stack layout
   // of the target machine.
 
   /// getStackGrowthDirection - Return the direction the stack grows
   ///
   StackDirection getStackGrowthDirection() const { return StackDir; }
 
   /// getStackAlignment - This method returns the number of bytes to which the
   /// stack pointer must be aligned on entry to a function.  Typically, this
   /// is the largest alignment for any data object in the target.
   ///
   unsigned getStackAlignment() const { return StackAlignment; }
 
   /// getTransientStackAlignment - This method returns the number of bytes to
   /// which the stack pointer must be aligned at all times, even between
   /// calls.
   ///
   unsigned getTransientStackAlignment() const {
     return TransientStackAlignment;
   }
 
   /// isStackRealignable - This method returns whether the stack can be
   /// realigned.
   bool isStackRealignable() const {
     return StackRealignable;
   }
 
   /// getOffsetOfLocalArea - This method returns the offset of the local area
   /// from the stack pointer on entrance to a function.
   ///
   int getOffsetOfLocalArea() const { return LocalAreaOffset; }
 
   /// isFPCloseToIncomingSP - Return true if the frame pointer is close to
   /// the incoming stack pointer, false if it is close to the post-prologue
   /// stack pointer.
   virtual bool isFPCloseToIncomingSP() const { return true; }
 
   /// assignCalleeSavedSpillSlots - Allows target to override spill slot
   /// assignment logic.  If implemented, assignCalleeSavedSpillSlots() should
   /// assign frame slots to all CSI entries and return true.  If this method
   /// returns false, spill slots will be assigned using generic implementation.
   /// assignCalleeSavedSpillSlots() may add, delete or rearrange elements of
   /// CSI.
   virtual bool
   assignCalleeSavedSpillSlots(MachineFunction &MF,
                               const TargetRegisterInfo *TRI,
                               std::vector<CalleeSavedInfo> &CSI) const {
     return false;
   }
 
   /// getCalleeSavedSpillSlots - This method returns a pointer to an array of
   /// pairs, that contains an entry for each callee saved register that must be
   /// spilled to a particular stack location if it is spilled.
   ///
   /// Each entry in this array contains a <register,offset> pair, indicating the
   /// fixed offset from the incoming stack pointer that each register should be
   /// spilled at. If a register is not listed here, the code generator is
   /// allowed to spill it anywhere it chooses.
   ///
   virtual const SpillSlot *
   getCalleeSavedSpillSlots(unsigned &NumEntries) const {
     NumEntries = 0;
     return nullptr;
   }
 
   /// targetHandlesStackFrameRounding - Returns true if the target is
   /// responsible for rounding up the stack frame (probably at emitPrologue
   /// time).
   virtual bool targetHandlesStackFrameRounding() const {
     return false;
   }
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
   virtual void emitPrologue(MachineFunction &MF) const = 0;
   virtual void emitEpilogue(MachineFunction &MF,
                             MachineBasicBlock &MBB) const = 0;
 
   /// Adjust the prologue to have the function use segmented stacks. This works
   /// by adding a check even before the "normal" function prologue.
   virtual void adjustForSegmentedStacks(MachineFunction &MF) const { }
 
   /// Adjust the prologue to add Erlang Run-Time System (ERTS) specific code in
   /// the assembly prologue to explicitly handle the stack.
   virtual void adjustForHiPEPrologue(MachineFunction &MF) const { }
 
   /// Adjust the prologue to add an allocation at a fixed offset from the frame
   /// pointer.
   virtual void adjustForFrameAllocatePrologue(MachineFunction &MF) const { }
 
   /// spillCalleeSavedRegisters - Issues instruction(s) to spill all callee
   /// saved registers and returns true if it isn't possible / profitable to do
   /// so by issuing a series of store instructions via
   /// storeRegToStackSlot(). Returns false otherwise.
   virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator MI,
                                         const std::vector<CalleeSavedInfo> &CSI,
                                          const TargetRegisterInfo *TRI) const {
     return false;
   }
 
   /// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee
   /// saved registers and returns true if it isn't possible / profitable to do
   /// so by issuing a series of load instructions via loadRegToStackSlot().
   /// Returns false otherwise.
   virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
                                            MachineBasicBlock::iterator MI,
                                         const std::vector<CalleeSavedInfo> &CSI,
                                         const TargetRegisterInfo *TRI) const {
     return false;
   }
 
   /// hasFP - Return true if the specified function should have a dedicated
   /// frame pointer register. For most targets this is true only if the function
   /// has variable sized allocas or if frame pointer elimination is disabled.
   virtual bool hasFP(const MachineFunction &MF) const = 0;
 
   /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
   /// not required, we reserve argument space for call sites in the function
   /// immediately on entry to the current function. This eliminates the need for
   /// add/sub sp brackets around call sites. Returns true if the call frame is
   /// included as part of the stack frame.
   virtual bool hasReservedCallFrame(const MachineFunction &MF) const {
     return !hasFP(MF);
   }
 
   /// canSimplifyCallFramePseudos - When possible, it's best to simplify the
   /// call frame pseudo ops before doing frame index elimination. This is
   /// possible only when frame index references between the pseudos won't
   /// need adjusting for the call frame adjustments. Normally, that's true
   /// if the function has a reserved call frame or a frame pointer. Some
   /// targets (Thumb2, for example) may have more complicated criteria,
   /// however, and can override this behavior.
   virtual bool canSimplifyCallFramePseudos(const MachineFunction &MF) const {
     return hasReservedCallFrame(MF) || hasFP(MF);
   }
 
+  /// needsFrameIndexResolution - Return true if frame index (FI) resolution
+  /// must be performed for this function. Normally, this is required only
+  /// when the function has any stack objects; targets may override this.
+  virtual bool needsFrameIndexResolution(const MachineFunction &MF) const;
+
   /// getFrameIndexOffset - Returns the displacement from the frame register to
   /// the stack frame of the specified index.
   virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
 
   /// getFrameIndexReference - This method should return the base register
   /// and offset used to reference a frame index location. The offset is
   /// returned directly, and the base register is returned via FrameReg.
   virtual int getFrameIndexReference(const MachineFunction &MF, int FI,
                                      unsigned &FrameReg) const;
 
   /// Same as above, except that the 'base register' will always be RSP, not
   /// RBP on x86.  This is used exclusively for lowering STATEPOINT nodes.
   /// TODO: This should really be a parameterizable choice.
   virtual int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
                                           unsigned &FrameReg) const {
     // default to calling normal version, we override this on x86 only
     llvm_unreachable("unimplemented for non-x86");
     return 0;
   }
 
   /// processFunctionBeforeCalleeSavedScan - This method is called immediately
   /// before PrologEpilogInserter scans the physical registers used to determine
   /// what callee saved registers should be spilled. This method is optional.
   virtual void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                              RegScavenger *RS = nullptr) const {
 
   }
 
   /// processFunctionBeforeFrameFinalized - This method is called immediately
   /// before the specified function's frame layout (MF.getFrameInfo()) is
   /// finalized.  Once the frame is finalized, MO_FrameIndex operands are
   /// replaced with direct constants.  This method is optional.
   ///
   virtual void processFunctionBeforeFrameFinalized(MachineFunction &MF,
                                              RegScavenger *RS = nullptr) const {
   }
 
   /// eliminateCallFramePseudoInstr - This method is called during prolog/epilog
   /// code insertion to eliminate call frame setup and destroy pseudo
   /// instructions (but only if the Target is using them).  It is responsible
   /// for eliminating these instructions, replacing them with concrete
   /// instructions.  This method need only be implemented if using call frame
   /// setup/destroy pseudo instructions.
   ///
   virtual void
   eliminateCallFramePseudoInstr(MachineFunction &MF,
                                 MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator MI) const {
     llvm_unreachable("Call Frame Pseudo Instructions do not exist on this "
                      "target!");
   }
 };
 
 } // End llvm namespace
 
 #endif
diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 385e5a35afba..61407faaf327 100644
--- a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -1,957 +1,961 @@
 //===-- PrologEpilogInserter.cpp - Insert Prolog/Epilog code in function --===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This pass is responsible for finalizing the functions frame layout, saving
 // callee saved registers, and for emitting prolog & epilog code for the
 // function.
 //
 // This pass must be run after register allocation.  After this pass is
 // executed, it is illegal to construct MO_FrameIndex operands.
 //
 //===----------------------------------------------------------------------===//
 
 #include "PrologEpilogInserter.h"
 #include "llvm/ADT/IndexedMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/StackProtector.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <climits>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "pei"
 
 char PEI::ID = 0;
 char &llvm::PrologEpilogCodeInserterID = PEI::ID;
 
 static cl::opt<unsigned>
 WarnStackSize("warn-stack-size", cl::Hidden, cl::init((unsigned)-1),
               cl::desc("Warn for stack size bigger than the given"
                        " number"));
 
 INITIALIZE_PASS_BEGIN(PEI, "prologepilog",
                 "Prologue/Epilogue Insertion", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(StackProtector)
 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
 INITIALIZE_PASS_END(PEI, "prologepilog",
                     "Prologue/Epilogue Insertion & Frame Finalization",
                     false, false)
 
 STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
 STATISTIC(NumBytesStackSpace,
           "Number of bytes used for stack in all functions");
 
 void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
   AU.addPreserved<MachineLoopInfo>();
   AU.addPreserved<MachineDominatorTree>();
   AU.addRequired<StackProtector>();
   AU.addRequired<TargetPassConfig>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
 bool PEI::isReturnBlock(MachineBasicBlock* MBB) {
   return (MBB && !MBB->empty() && MBB->back().isReturn());
 }
 
 /// Compute the set of return blocks
 void PEI::calculateSets(MachineFunction &Fn) {
   // Sets used to compute spill, restore placement sets.
   const std::vector<CalleeSavedInfo> &CSI =
     Fn.getFrameInfo()->getCalleeSavedInfo();
 
   // If no CSRs used, we are done.
   if (CSI.empty())
     return;
 
   // Save refs to entry and return blocks.
   EntryBlock = Fn.begin();
   for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end();
        MBB != E; ++MBB)
     if (isReturnBlock(MBB))
       ReturnBlocks.push_back(MBB);
 
   return;
 }
 
 /// StackObjSet - A set of stack object indexes
 typedef SmallSetVector<int, 8> StackObjSet;
 
 /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
 /// frame indexes with appropriate references.
 ///
 bool PEI::runOnMachineFunction(MachineFunction &Fn) {
   const Function* F = Fn.getFunction();
   const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
   const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
 
   assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs");
 
   RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : nullptr;
   FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
 
   // Calculate the MaxCallFrameSize and AdjustsStack variables for the
   // function's frame information. Also eliminates call frame pseudo
   // instructions.
   calculateCallsInformation(Fn);
 
   // Allow the target machine to make some adjustments to the function
   // e.g. UsedPhysRegs before calculateCalleeSavedRegisters.
   TFI->processFunctionBeforeCalleeSavedScan(Fn, RS);
 
   // Scan the function for modified callee saved registers and insert spill code
   // for any callee saved registers that are modified.
   calculateCalleeSavedRegisters(Fn);
 
   // Determine placement of CSR spill/restore code:
   // place all spills in the entry block, all restores in return blocks.
   calculateSets(Fn);
 
   // Add the code to save and restore the callee saved registers
   if (!F->hasFnAttribute(Attribute::Naked))
     insertCSRSpillsAndRestores(Fn);
 
   // Allow the target machine to make final modifications to the function
   // before the frame layout is finalized.
   TFI->processFunctionBeforeFrameFinalized(Fn, RS);
 
   // Calculate actual frame offsets for all abstract stack objects...
   calculateFrameObjectOffsets(Fn);
 
   // Add prolog and epilog code to the function.  This function is required
   // to align the stack frame as necessary for any stack variables or
   // called functions.  Because of this, calculateCalleeSavedRegisters()
   // must be called before this function in order to set the AdjustsStack
   // and MaxCallFrameSize variables.
   if (!F->hasFnAttribute(Attribute::Naked))
     insertPrologEpilogCode(Fn);
 
   // Replace all MO_FrameIndex operands with physical register references
   // and actual offsets.
   //
   replaceFrameIndices(Fn);
 
   // If register scavenging is needed, as we've enabled doing it as a
   // post-pass, scavenge the virtual registers that frame index elimination
   // inserted.
   if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging)
     scavengeFrameVirtualRegs(Fn);
 
   // Clear any vregs created by virtual scavenging.
   Fn.getRegInfo().clearVirtRegs();
 
   // Warn on stack size when we exceeds the given limit.
   MachineFrameInfo *MFI = Fn.getFrameInfo();
   uint64_t StackSize = MFI->getStackSize();
   if (WarnStackSize.getNumOccurrences() > 0 && WarnStackSize < StackSize) {
     DiagnosticInfoStackSize DiagStackSize(*F, StackSize);
     F->getContext().diagnose(DiagStackSize);
   }
 
   delete RS;
   ReturnBlocks.clear();
   return true;
 }
 
 /// calculateCallsInformation - Calculate the MaxCallFrameSize and AdjustsStack
 /// variables for the function's frame information and eliminate call frame
 /// pseudo instructions.
 void PEI::calculateCallsInformation(MachineFunction &Fn) {
   const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
   const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
   MachineFrameInfo *MFI = Fn.getFrameInfo();
 
   unsigned MaxCallFrameSize = 0;
   bool AdjustsStack = MFI->adjustsStack();
 
   // Get the function call frame set-up and tear-down instruction opcode
   int FrameSetupOpcode   = TII.getCallFrameSetupOpcode();
   int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
 
   // Early exit for targets which have no call frame setup/destroy pseudo
   // instructions.
   if (FrameSetupOpcode == -1 && FrameDestroyOpcode == -1)
     return;
 
   std::vector<MachineBasicBlock::iterator> FrameSDOps;
   for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
     for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
       if (I->getOpcode() == FrameSetupOpcode ||
           I->getOpcode() == FrameDestroyOpcode) {
         assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
                " instructions should have a single immediate argument!");
         unsigned Size = I->getOperand(0).getImm();
         if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
         AdjustsStack = true;
         FrameSDOps.push_back(I);
       } else if (I->isInlineAsm()) {
         // Some inline asm's need a stack frame, as indicated by operand 1.
         unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
         if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
           AdjustsStack = true;
       }
 
   MFI->setAdjustsStack(AdjustsStack);
   MFI->setMaxCallFrameSize(MaxCallFrameSize);
 
   for (std::vector<MachineBasicBlock::iterator>::iterator
          i = FrameSDOps.begin(), e = FrameSDOps.end(); i != e; ++i) {
     MachineBasicBlock::iterator I = *i;
 
     // If call frames are not being included as part of the stack frame, and
     // the target doesn't indicate otherwise, remove the call frame pseudos
     // here. The sub/add sp instruction pairs are still inserted, but we don't
     // need to track the SP adjustment for frame index elimination.
     if (TFI->canSimplifyCallFramePseudos(Fn))
       TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
   }
 }
 
 
 /// calculateCalleeSavedRegisters - Scan the function for modified callee saved
 /// registers.
 void PEI::calculateCalleeSavedRegisters(MachineFunction &F) {
   const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
   const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
   MachineFrameInfo *MFI = F.getFrameInfo();
 
   // Get the callee saved register list...
   const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
 
   // These are used to keep track the callee-save area. Initialize them.
   MinCSFrameIndex = INT_MAX;
   MaxCSFrameIndex = 0;
 
   // Early exit for targets which have no callee saved registers.
   if (!CSRegs || CSRegs[0] == 0)
     return;
 
   // In Naked functions we aren't going to save any registers.
   if (F.getFunction()->hasFnAttribute(Attribute::Naked))
     return;
 
   std::vector<CalleeSavedInfo> CSI;
   for (unsigned i = 0; CSRegs[i]; ++i) {
     unsigned Reg = CSRegs[i];
     // Functions which call __builtin_unwind_init get all their registers saved.
     if (F.getRegInfo().isPhysRegUsed(Reg) || F.getMMI().callsUnwindInit()) {
       // If the reg is modified, save it!
       CSI.push_back(CalleeSavedInfo(Reg));
     }
   }
 
   if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) {
     // If target doesn't implement this, use generic code.
 
     if (CSI.empty())
       return; // Early exit if no callee saved registers are modified!
 
     unsigned NumFixedSpillSlots;
     const TargetFrameLowering::SpillSlot *FixedSpillSlots =
         TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
 
     // Now that we know which registers need to be saved and restored, allocate
     // stack slots for them.
     for (std::vector<CalleeSavedInfo>::iterator I = CSI.begin(), E = CSI.end();
          I != E; ++I) {
       unsigned Reg = I->getReg();
       const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
 
       int FrameIdx;
       if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
         I->setFrameIdx(FrameIdx);
         continue;
       }
 
       // Check to see if this physreg must be spilled to a particular stack slot
       // on this target.
       const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots;
       while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots &&
              FixedSlot->Reg != Reg)
         ++FixedSlot;
 
       if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
         // Nope, just spill it anywhere convenient.
         unsigned Align = RC->getAlignment();
         unsigned StackAlign = TFI->getStackAlignment();
 
         // We may not be able to satisfy the desired alignment specification of
         // the TargetRegisterClass if the stack alignment is smaller. Use the
         // min.
         Align = std::min(Align, StackAlign);
         FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true);
         if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
         if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
       } else {
         // Spill it to the stack where we must.
         FrameIdx =
             MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset);
       }
 
       I->setFrameIdx(FrameIdx);
     }
   }
 
   MFI->setCalleeSavedInfo(CSI);
 }
 
 /// insertCSRSpillsAndRestores - Insert spill and restore code for
 /// callee saved registers used in the function.
 ///
 void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
   // Get callee saved register information.
   MachineFrameInfo *MFI = Fn.getFrameInfo();
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
 
   MFI->setCalleeSavedInfoValid(true);
 
   // Early exit if no callee saved registers are modified!
   if (CSI.empty())
     return;
 
   const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
   const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
   const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
   MachineBasicBlock::iterator I;
 
   // Spill using target interface.
   I = EntryBlock->begin();
   if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) {
     for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
       // Add the callee-saved register as live-in.
       // It's killed at the spill.
       EntryBlock->addLiveIn(CSI[i].getReg());
 
       // Insert the spill to the stack frame.
       unsigned Reg = CSI[i].getReg();
       const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
       TII.storeRegToStackSlot(*EntryBlock, I, Reg, true, CSI[i].getFrameIdx(),
                               RC, TRI);
     }
   }
 
   // Restore using target interface.
   for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri) {
     MachineBasicBlock *MBB = ReturnBlocks[ri];
     I = MBB->end();
     --I;
 
     // Skip over all terminator instructions, which are part of the return
     // sequence.
     MachineBasicBlock::iterator I2 = I;
     while (I2 != MBB->begin() && (--I2)->isTerminator())
       I = I2;
 
     bool AtStart = I == MBB->begin();
     MachineBasicBlock::iterator BeforeI = I;
     if (!AtStart)
       --BeforeI;
 
     // Restore all registers immediately before the return and any
     // terminators that precede it.
     if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
       for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
         unsigned Reg = CSI[i].getReg();
         const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
         TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
         assert(I != MBB->begin() &&
                "loadRegFromStackSlot didn't insert any code!");
         // Insert in reverse order.  loadRegFromStackSlot can insert
         // multiple instructions.
         if (AtStart)
           I = MBB->begin();
         else {
           I = BeforeI;
           ++I;
         }
       }
     }
   }
 }
 
 /// AdjustStackOffset - Helper function used to adjust the stack frame offset.
 static inline void
 AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
                   bool StackGrowsDown, int64_t &Offset,
                   unsigned &MaxAlign) {
   // If the stack grows down, add the object size to find the lowest address.
   if (StackGrowsDown)
     Offset += MFI->getObjectSize(FrameIdx);
 
   unsigned Align = MFI->getObjectAlignment(FrameIdx);
 
   // If the alignment of this object is greater than that of the stack, then
   // increase the stack alignment to match.
   MaxAlign = std::max(MaxAlign, Align);
 
   // Adjust to alignment boundary.
   Offset = (Offset + Align - 1) / Align * Align;
 
   if (StackGrowsDown) {
     DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
     MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
   } else {
     DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
     MFI->setObjectOffset(FrameIdx, Offset);
     Offset += MFI->getObjectSize(FrameIdx);
   }
 }
 
 /// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
 /// those required to be close to the Stack Protector) to stack offsets.
 static void
 AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
                       SmallSet<int, 16> &ProtectedObjs,
                       MachineFrameInfo *MFI, bool StackGrowsDown,
                       int64_t &Offset, unsigned &MaxAlign) {
 
   for (StackObjSet::const_iterator I = UnassignedObjs.begin(),
         E = UnassignedObjs.end(); I != E; ++I) {
     int i = *I;
     AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
     ProtectedObjs.insert(i);
   }
 }
 
 /// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
 /// abstract stack objects.
 ///
 void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
   const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
   StackProtector *SP = &getAnalysis<StackProtector>();
 
   bool StackGrowsDown =
     TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
 
   // Loop over all of the stack objects, assigning sequential addresses...
   MachineFrameInfo *MFI = Fn.getFrameInfo();
 
   // Start at the beginning of the local area.
   // The Offset is the distance from the stack top in the direction
   // of stack growth -- so it's always nonnegative.
   int LocalAreaOffset = TFI.getOffsetOfLocalArea();
   if (StackGrowsDown)
     LocalAreaOffset = -LocalAreaOffset;
   assert(LocalAreaOffset >= 0
          && "Local area offset should be in direction of stack growth");
   int64_t Offset = LocalAreaOffset;
 
   // If there are fixed sized objects that are preallocated in the local area,
   // non-fixed objects can't be allocated right at the start of local area.
   // We currently don't support filling in holes in between fixed sized
   // objects, so we adjust 'Offset' to point to the end of last fixed sized
   // preallocated object.
   for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
     int64_t FixedOff;
     if (StackGrowsDown) {
       // The maximum distance from the stack pointer is at lower address of
       // the object -- which is given by offset. For down growing stack
       // the offset is negative, so we negate the offset to get the distance.
       FixedOff = -MFI->getObjectOffset(i);
     } else {
       // The maximum distance from the start pointer is at the upper
       // address of the object.
       FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
     }
     if (FixedOff > Offset) Offset = FixedOff;
   }
 
   // First assign frame offsets to stack objects that are used to spill
   // callee saved registers.
   if (StackGrowsDown) {
     for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) {
       // If the stack grows down, we need to add the size to find the lowest
       // address of the object.
       Offset += MFI->getObjectSize(i);
 
       unsigned Align = MFI->getObjectAlignment(i);
       // Adjust to alignment boundary
       Offset = (Offset+Align-1)/Align*Align;
 
       MFI->setObjectOffset(i, -Offset);        // Set the computed offset
     }
   } else {
     int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex;
     for (int i = MaxCSFI; i >= MinCSFI ; --i) {
       unsigned Align = MFI->getObjectAlignment(i);
       // Adjust to alignment boundary
       Offset = (Offset+Align-1)/Align*Align;
 
       MFI->setObjectOffset(i, Offset);
       Offset += MFI->getObjectSize(i);
     }
   }
 
   unsigned MaxAlign = MFI->getMaxAlignment();
 
   // Make sure the special register scavenging spill slot is closest to the
   // incoming stack pointer if a frame pointer is required and is closer
   // to the incoming rather than the final stack pointer.
   const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
   bool EarlyScavengingSlots = (TFI.hasFP(Fn) &&
                                TFI.isFPCloseToIncomingSP() &&
                                RegInfo->useFPForScavengingIndex(Fn) &&
                                !RegInfo->needsStackRealignment(Fn));
   if (RS && EarlyScavengingSlots) {
     SmallVector<int, 2> SFIs;
     RS->getScavengingFrameIndices(SFIs);
     for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
            IE = SFIs.end(); I != IE; ++I)
       AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign);
   }
 
   // FIXME: Once this is working, then enable flag will change to a target
   // check for whether the frame is large enough to want to use virtual
   // frame index registers. Functions which don't want/need this optimization
   // will continue to use the existing code path.
   if (MFI->getUseLocalStackAllocationBlock()) {
     unsigned Align = MFI->getLocalFrameMaxAlign();
 
     // Adjust to alignment boundary.
     Offset = (Offset + Align - 1) / Align * Align;
 
     DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
 
     // Resolve offsets for objects in the local block.
     for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
       std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
       int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
       DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
             FIOffset << "]\n");
       MFI->setObjectOffset(Entry.first, FIOffset);
     }
     // Allocate the local block
     Offset += MFI->getLocalFrameSize();
 
     MaxAlign = std::max(Align, MaxAlign);
   }
 
   // Make sure that the stack protector comes before the local variables on the
   // stack.
   SmallSet<int, 16> ProtectedObjs;
   if (MFI->getStackProtectorIndex() >= 0) {
     StackObjSet LargeArrayObjs;
     StackObjSet SmallArrayObjs;
     StackObjSet AddrOfObjs;
 
     AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown,
                       Offset, MaxAlign);
 
     // Assign large stack objects first.
     for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
       if (MFI->isObjectPreAllocated(i) &&
           MFI->getUseLocalStackAllocationBlock())
         continue;
       if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
         continue;
       if (RS && RS->isScavengingFrameIndex((int)i))
         continue;
       if (MFI->isDeadObjectIndex(i))
         continue;
       if (MFI->getStackProtectorIndex() == (int)i)
         continue;
 
       switch (SP->getSSPLayout(MFI->getObjectAllocation(i))) {
       case StackProtector::SSPLK_None:
         continue;
       case StackProtector::SSPLK_SmallArray:
         SmallArrayObjs.insert(i);
         continue;
       case StackProtector::SSPLK_AddrOf:
         AddrOfObjs.insert(i);
         continue;
       case StackProtector::SSPLK_LargeArray:
         LargeArrayObjs.insert(i);
         continue;
       }
       llvm_unreachable("Unexpected SSPLayoutKind.");
     }
 
     AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
                           Offset, MaxAlign);
     AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
                           Offset, MaxAlign);
     AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown,
                           Offset, MaxAlign);
   }
 
   // Then assign frame offsets to stack objects that are not used to spill
   // callee saved registers.
   for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
     if (MFI->isObjectPreAllocated(i) &&
         MFI->getUseLocalStackAllocationBlock())
       continue;
     if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
       continue;
     if (RS && RS->isScavengingFrameIndex((int)i))
       continue;
     if (MFI->isDeadObjectIndex(i))
       continue;
     if (MFI->getStackProtectorIndex() == (int)i)
       continue;
     if (ProtectedObjs.count(i))
       continue;
 
     AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
   }
 
   // Make sure the special register scavenging spill slot is closest to the
   // stack pointer.
   if (RS && !EarlyScavengingSlots) {
     SmallVector<int, 2> SFIs;
     RS->getScavengingFrameIndices(SFIs);
     for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
            IE = SFIs.end(); I != IE; ++I)
       AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign);
   }
 
   if (!TFI.targetHandlesStackFrameRounding()) {
     // If we have reserved argument space for call sites in the function
     // immediately on entry to the current function, count it as part of the
     // overall stack size.
     if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn))
       Offset += MFI->getMaxCallFrameSize();
 
     // Round up the size to a multiple of the alignment.  If the function has
     // any calls or alloca's, align to the target's StackAlignment value to
     // ensure that the callee's frame or the alloca data is suitably aligned;
     // otherwise, for leaf functions, align to the TransientStackAlignment
     // value.
     unsigned StackAlign;
     if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
         (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
       StackAlign = TFI.getStackAlignment();
     else
       StackAlign = TFI.getTransientStackAlignment();
 
     // If the frame pointer is eliminated, all frame offsets will be relative to
     // SP not FP. Align to MaxAlign so this works.
     StackAlign = std::max(StackAlign, MaxAlign);
     unsigned AlignMask = StackAlign - 1;
     Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
   }
 
   // Update frame info to pretend that this is part of the stack...
   int64_t StackSize = Offset - LocalAreaOffset;
   MFI->setStackSize(StackSize);
   NumBytesStackSpace += StackSize;
 }
 
 /// insertPrologEpilogCode - Scan the function for modified callee saved
 /// registers, insert spill code for these callee saved registers, then add
 /// prolog and epilog code to the function.
 ///
 void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
   const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
 
   // Add prologue to the function...
   TFI.emitPrologue(Fn);
 
   // Add epilogue to restore the callee-save registers in each exiting block
   for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
     // If last instruction is a return instruction, add an epilogue
     if (!I->empty() && I->back().isReturn())
       TFI.emitEpilogue(Fn, *I);
   }
 
   // Emit additional code that is required to support segmented stacks, if
   // we've been asked for it.  This, when linked with a runtime with support
   // for segmented stacks (libgcc is one), will result in allocating stack
   // space in small chunks instead of one large contiguous block.
   if (Fn.shouldSplitStack())
     TFI.adjustForSegmentedStacks(Fn);
 
   // Emit additional code that is required to explicitly handle the stack in
   // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The
   // approach is rather similar to that of Segmented Stacks, but it uses a
   // different conditional check and another BIF for allocating more stack
   // space.
   if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE)
     TFI.adjustForHiPEPrologue(Fn);
 }
 
 /// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
 /// register references and actual offsets.
 ///
 void PEI::replaceFrameIndices(MachineFunction &Fn) {
-  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+  if (!TFI.needsFrameIndexResolution(Fn)) return;
 
   // Store SPAdj at exit of a basic block.
   SmallVector<int, 8> SPState;
   SPState.resize(Fn.getNumBlockIDs());
   SmallPtrSet<MachineBasicBlock*, 8> Reachable;
 
   // Iterate over the reachable blocks in DFS order.
   for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable);
        DFI != DFE; ++DFI) {
     int SPAdj = 0;
     // Check the exit state of the DFS stack predecessor.
     if (DFI.getPathLength() >= 2) {
       MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2);
       assert(Reachable.count(StackPred) &&
              "DFS stack predecessor is already visited.\n");
       SPAdj = SPState[StackPred->getNumber()];
     }
     MachineBasicBlock *BB = *DFI;
     replaceFrameIndices(BB, Fn, SPAdj);
     SPState[BB->getNumber()] = SPAdj;
   }
 
   // Handle the unreachable blocks.
   for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
     if (Reachable.count(BB))
       // Already handled in DFS traversal.
       continue;
     int SPAdj = 0;
     replaceFrameIndices(BB, Fn, SPAdj);
   }
 }
 
 void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
                               int &SPAdj) {
   assert(Fn.getSubtarget().getRegisterInfo() &&
          "getRegisterInfo() must be implemented!");
   const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
   const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();
   const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
   int FrameSetupOpcode   = TII.getCallFrameSetupOpcode();
   int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
 
   if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);
 
   bool InsideCallSequence = false;
 
   for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
 
     if (I->getOpcode() == FrameSetupOpcode ||
         I->getOpcode() == FrameDestroyOpcode) {
       InsideCallSequence = (I->getOpcode() == FrameSetupOpcode);
       SPAdj += TII.getSPAdjust(I);
 
       MachineBasicBlock::iterator PrevI = BB->end();
       if (I != BB->begin()) PrevI = std::prev(I);
       TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
 
       // Visit the instructions created by eliminateCallFramePseudoInstr().
       if (PrevI == BB->end())
         I = BB->begin();     // The replaced instr was the first in the block.
       else
         I = std::next(PrevI);
       continue;
     }
 
-    // If we are looking at a call sequence, we need to keep track of
-    // the SP adjustment made by each instruction in the sequence.
-    // This includes both the frame setup/destroy pseudos (handled above),
-    // as well as other instructions that have side effects w.r.t the SP.
-    if (InsideCallSequence)
-      SPAdj += TII.getSPAdjust(I);
-
     MachineInstr *MI = I;
     bool DoIncr = true;
     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
       if (!MI->getOperand(i).isFI())
         continue;
 
       // Frame indicies in debug values are encoded in a target independent
       // way with simply the frame index and offset rather than any
       // target-specific addressing mode.
       if (MI->isDebugValue()) {
         assert(i == 0 && "Frame indicies can only appear as the first "
                          "operand of a DBG_VALUE machine instruction");
         unsigned Reg;
         MachineOperand &Offset = MI->getOperand(1);
         Offset.setImm(Offset.getImm() +
                       TFI->getFrameIndexReference(
                           Fn, MI->getOperand(0).getIndex(), Reg));
         MI->getOperand(0).ChangeToRegister(Reg, false /*isDef*/);
         continue;
       }
 
       // TODO: This code should be commoned with the code for
       // PATCHPOINT. There's no good reason for the difference in
       // implementation other than historical accident.  The only
       // remaining difference is the unconditional use of the stack
       // pointer as the base register.
       if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
         assert((!MI->isDebugValue() || i == 0) &&
                "Frame indicies can only appear as the first operand of a "
                "DBG_VALUE machine instruction");
         unsigned Reg;
         MachineOperand &Offset = MI->getOperand(i + 1);
         const unsigned refOffset =
           TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(),
                                             Reg);
 
         Offset.setImm(Offset.getImm() + refOffset);
         MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
         continue;
       }
 
       // Frame allocations are target independent. Simply swap the index with
       // the offset.
       if (MI->getOpcode() == TargetOpcode::FRAME_ALLOC) {
         assert(TFI->hasFP(Fn) && "frame alloc requires FP");
         MachineOperand &FI = MI->getOperand(i);
         unsigned Reg;
         int FrameOffset = TFI->getFrameIndexReference(Fn, FI.getIndex(), Reg);
         FI.ChangeToImmediate(FrameOffset);
         continue;
       }
 
       // Some instructions (e.g. inline asm instructions) can have
       // multiple frame indices and/or cause eliminateFrameIndex
       // to insert more than one instruction. We need the register
       // scavenger to go through all of these instructions so that
       // it can update its register information. We keep the
       // iterator at the point before insertion so that we can
       // revisit them in full.
       bool AtBeginning = (I == BB->begin());
       if (!AtBeginning) --I;
 
       // If this instruction has a FrameIndex operand, we need to
       // use that target machine register info object to eliminate
       // it.
       TRI.eliminateFrameIndex(MI, SPAdj, i,
                               FrameIndexVirtualScavenging ?  nullptr : RS);
 
       // Reset the iterator if we were at the beginning of the BB.
       if (AtBeginning) {
         I = BB->begin();
         DoIncr = false;
       }
 
       MI = nullptr;
       break;
     }
 
+    // If we are looking at a call sequence, we need to keep track of
+    // the SP adjustment made by each instruction in the sequence.
+    // This includes both the frame setup/destroy pseudos (handled above),
+    // as well as other instructions that have side effects w.r.t the SP.
+    // Note that this must come after eliminateFrameIndex, because 
+    // if I itself referred to a frame index, we shouldn't count its own
+    // adjustment.
+    if (MI && InsideCallSequence)
+      SPAdj += TII.getSPAdjust(MI);
+
     if (DoIncr && I != BB->end()) ++I;
 
     // Update register states.
     if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
   }
 }
 
 /// scavengeFrameVirtualRegs - Replace all frame index virtual registers
 /// with physical registers. Use the register scavenger to find an
 /// appropriate register to use.
 ///
 /// FIXME: Iterating over the instruction stream is unnecessary. We can simply
 /// iterate over the vreg use list, which at this point only contains machine
 /// operands for which eliminateFrameIndex need a new scratch reg.
 void
 PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) {
   // Run through the instructions and find any virtual registers.
   for (MachineFunction::iterator BB = Fn.begin(),
        E = Fn.end(); BB != E; ++BB) {
     RS->enterBasicBlock(BB);
 
     int SPAdj = 0;
 
     // The instruction stream may change in the loop, so check BB->end()
     // directly.
     for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
       // We might end up here again with a NULL iterator if we scavenged a
       // register for which we inserted spill code for definition by what was
       // originally the first instruction in BB.
       if (I == MachineBasicBlock::iterator(nullptr))
         I = BB->begin();
 
       MachineInstr *MI = I;
       MachineBasicBlock::iterator J = std::next(I);
       MachineBasicBlock::iterator P =
                          I == BB->begin() ? MachineBasicBlock::iterator(nullptr)
                                           : std::prev(I);
 
       // RS should process this instruction before we might scavenge at this
       // location. This is because we might be replacing a virtual register
       // defined by this instruction, and if so, registers killed by this
       // instruction are available, and defined registers are not.
       RS->forward(I);
 
       for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
         if (MI->getOperand(i).isReg()) {
           MachineOperand &MO = MI->getOperand(i);
           unsigned Reg = MO.getReg();
           if (Reg == 0)
             continue;
           if (!TargetRegisterInfo::isVirtualRegister(Reg))
             continue;
 
           // When we first encounter a new virtual register, it
           // must be a definition.
           assert(MI->getOperand(i).isDef() &&
                  "frame index virtual missing def!");
           // Scavenge a new scratch register
           const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg);
           unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
 
           ++NumScavengedRegs;
 
           // Replace this reference to the virtual register with the
           // scratch register.
           assert (ScratchReg && "Missing scratch register!");
           MachineRegisterInfo &MRI = Fn.getRegInfo();
           Fn.getRegInfo().replaceRegWith(Reg, ScratchReg);
           
           // Make sure MRI now accounts this register as used.
           MRI.setPhysRegUsed(ScratchReg);
 
           // Because this instruction was processed by the RS before this
           // register was allocated, make sure that the RS now records the
           // register as being used.
           RS->setRegUsed(ScratchReg);
         }
       }
 
       // If the scavenger needed to use one of its spill slots, the
       // spill code will have been inserted in between I and J. This is a
       // problem because we need the spill code before I: Move I to just
       // prior to J.
       if (I != std::prev(J)) {
         BB->splice(J, BB, I);
 
         // Before we move I, we need to prepare the RS to visit I again.
         // Specifically, RS will assert if it sees uses of registers that
         // it believes are undefined. Because we have already processed
         // register kills in I, when it visits I again, it will believe that
         // those registers are undefined. To avoid this situation, unprocess
         // the instruction I.
         assert(RS->getCurrentPosition() == I &&
           "The register scavenger has an unexpected position");
         I = P;
         RS->unprocess(P);
       } else
         ++I;
     }
   }
 }
diff --git a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 1557d10238e9..e3f01912b872 100644
--- a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -1,44 +1,49 @@
 //===----- TargetFrameLoweringImpl.cpp - Implement target frame interface --==//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // Implements the layout of a stack frame on the target machine.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <cstdlib>
 using namespace llvm;
 
 TargetFrameLowering::~TargetFrameLowering() {
 }
 
 /// getFrameIndexOffset - Returns the displacement from the frame register to
 /// the stack frame of the specified index. This is the default implementation
 /// which is overridden for some targets.
 int TargetFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
                                              int FI) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   return MFI->getObjectOffset(FI) + MFI->getStackSize() -
     getOffsetOfLocalArea() + MFI->getOffsetAdjustment();
 }
 
 int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                              int FI, unsigned &FrameReg) const {
   const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
 
   // By default, assume all frame indices are referenced via whatever
   // getFrameRegister() says. The target can override this if it's doing
   // something different.
   FrameReg = RI->getFrameRegister(MF);
   return getFrameIndexOffset(MF, FI);
 }
+
+bool TargetFrameLowering::needsFrameIndexResolution(
+    const MachineFunction &MF) const {
+  return MF.getFrameInfo()->hasStackObjects();
+}
diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h
index 8bd5817e528f..219b64d18d1d 100644
--- a/contrib/llvm/lib/Target/X86/X86.h
+++ b/contrib/llvm/lib/Target/X86/X86.h
@@ -1,72 +1,77 @@
 //===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file contains the entry points for global functions defined in the x86
 // target library, as used by the LLVM JIT.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_LIB_TARGET_X86_X86_H
 #define LLVM_LIB_TARGET_X86_X86_H
 
 #include "llvm/Support/CodeGen.h"
 
 namespace llvm {
 
 class FunctionPass;
 class ImmutablePass;
 class X86TargetMachine;
 
 /// createX86ISelDag - This pass converts a legalized DAG into a
 /// X86-specific DAG, ready for instruction scheduling.
 ///
 FunctionPass *createX86ISelDag(X86TargetMachine &TM,
                                CodeGenOpt::Level OptLevel);
 
 /// createX86GlobalBaseRegPass - This pass initializes a global base
 /// register for PIC on x86-32.
 FunctionPass* createX86GlobalBaseRegPass();
 
 /// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses
 /// to local-dynamic TLS variables so that the TLS base address for the module
 /// is only fetched once per execution path through the function.
 FunctionPass *createCleanupLocalDynamicTLSPass();
 
 /// createX86FloatingPointStackifierPass - This function returns a pass which
 /// converts floating point register references and pseudo instructions into
 /// floating point stack references and physical instructions.
 ///
 FunctionPass *createX86FloatingPointStackifierPass();
 
 /// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions
 /// before each call to avoid transition penalty between functions encoded with
 /// AVX and SSE.
 FunctionPass *createX86IssueVZeroUpperPass();
 
 /// createX86EmitCodeToMemory - Returns a pass that converts a register
 /// allocated function into raw machine code in a dynamically
 /// allocated chunk of memory.
 ///
 FunctionPass *createEmitX86CodeToMemory();
 
 /// \brief Creates an X86-specific Target Transformation Info pass.
 ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM);
 
 /// createX86PadShortFunctions - Return a pass that pads short functions
 /// with NOOPs. This will prevent a stall when returning on the Atom.
 FunctionPass *createX86PadShortFunctions();
 /// createX86FixupLEAs - Return a a pass that selectively replaces
 /// certain instructions (like add, sub, inc, dec, some shifts,
 /// and some multiplies) by equivalent LEA instructions, in order
 /// to eliminate execution delays in some Atom processors.
 FunctionPass *createX86FixupLEAs();
 
+/// createX86CallFrameOptimization - Return a pass that optimizes
+/// the code-size of x86 call sequences. This is done by replacing
+/// esp-relative movs with pushes.
+FunctionPass *createX86CallFrameOptimization();
+
 } // End llvm namespace
 
 #endif
diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
new file mode 100644
index 000000000000..fae489e77cc0
--- /dev/null
+++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -0,0 +1,400 @@
+//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that optimizes call sequences on x86.
+// Currently, it converts movs of function parameters onto the stack into 
+// pushes. This is beneficial for two main reasons:
+// 1) The push instruction encoding is much smaller than an esp-relative mov
+// 2) It is possible to push memory arguments directly. So, if the
+//    transformation is performed pre-reg-alloc, it can help relieve
+//    register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-cf-opt"
+
+cl::opt<bool> NoX86CFOpt("no-x86-call-frame-opt",
+              cl::desc("Avoid optimizing x86 call frames for size"),
+              cl::init(false), cl::Hidden);
+
+namespace {
+class X86CallFrameOptimization : public MachineFunctionPass {
+public:
+  X86CallFrameOptimization() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  bool shouldPerformTransformation(MachineFunction &MF);
+
+  bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator I);
+
+  MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup,
+                                   unsigned Reg);
+
+  const char *getPassName() const override {
+    return "X86 Optimize Call Frame";
+  }
+
+  const TargetInstrInfo *TII;
+  const TargetFrameLowering *TFL;
+  const MachineRegisterInfo *MRI;
+  static char ID;
+};
+
+char X86CallFrameOptimization::ID = 0;
+}
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+  return new X86CallFrameOptimization();
+}
+
+// This checks whether the transformation is legal and profitable
+bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) {
+  if (NoX86CFOpt.getValue())
+    return false;
+
+  // We currently only support call sequences where *all* parameters
+  // are passed on the stack.
+  // No point in running this in 64-bit mode, since some arguments are
+  // passed in-register in all common calling conventions, so the pattern
+  // we're looking for will never match.
+  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
+  if (STI.is64Bit())
+    return false;
+
+  // You would expect straight-line code between call-frame setup and
+  // call-frame destroy. You would be wrong. There are circumstances (e.g.
+  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
+  // end up with the setup and the destroy in different basic blocks.
+  // This is bad, and breaks SP adjustment.
+  // So, check that all of the frames in the function are closed inside
+  // the same block, and, for good measure, that there are no nested frames.
+  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+  for (MachineBasicBlock &BB : MF) {
+    bool InsideFrameSequence = false;
+    for (MachineInstr &MI : BB) {
+      if (MI.getOpcode() == FrameSetupOpcode) {
+        if (InsideFrameSequence)
+          return false;
+        InsideFrameSequence = true;
+      }
+      else if (MI.getOpcode() == FrameDestroyOpcode) {
+        if (!InsideFrameSequence)
+          return false;
+        InsideFrameSequence = false;
+      }
+    }
+
+    if (InsideFrameSequence)
+      return false;
+  }
+
+  // Now that we know the transformation is legal, check if it is
+  // profitable.
+  // TODO: Add a heuristic that actually looks at the function,
+  //       and enable this for more cases.
+
+  // This transformation is always a win when we do not expect to have
+  // a reserved call frame. Under other circumstances, it may be either
+  // a win or a loss, and requires a heuristic.
+  // For now, enable it only for the relatively clear win cases.
+  bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects();
+  if (CannotReserveFrame)
+    return true;
+
+  // For now, don't even try to evaluate the profitability when
+  // not optimizing for size.
+  AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+  bool OptForSize =
+    FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+    Attribute::OptimizeForSize) ||
+    FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+
+  if (!OptForSize)
+    return false;
+
+  // Stack re-alignment can make this unprofitable even in terms of size.
+  // As mentioned above, a better heuristic is needed. For now, don't do this
+  // when the required alignment is above 8. (4 would be the safe choice, but
+  // some experimentation showed 8 is generally good).
+  if (TFL->getStackAlignment() > 8)
+    return false;
+
+  return true;
+}
+
+bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getSubtarget().getInstrInfo();
+  TFL = MF.getSubtarget().getFrameLowering();
+  MRI = &MF.getRegInfo();
+
+  if (!shouldPerformTransformation(MF))
+    return false;
+
+  int FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+
+  bool Changed = false;
+
+  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+      if (I->getOpcode() == FrameSetupOpcode)
+        Changed |= adjustCallSequence(MF, *BB, I);
+
+  return Changed;
+}
+
+/// Try to rewrite the argument stores of one call sequence as pushes.
+///
+/// \p I must point at the call-frame-setup instruction that opens the call
+/// sequence inside \p MBB. The sequence is rewritten only when it has exactly
+/// this shape: the frame setup, optional LEA32r instructions, a COPY whose
+/// result is then treated as the stack pointer, a run of MOV32mi/MOV32mr
+/// stores that fill consecutive 4-byte slots starting at displacement 0, an
+/// optional COPY whose source is the global base register, the call, and the
+/// call-frame-destroy instruction. Anything else, including a block that ends
+/// in the middle of the pattern, leaves the code untouched.
+///
+/// \returns true if the stores were replaced by pushes.
+bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
+                                                MachineBasicBlock &MBB,
+                                                MachineBasicBlock::iterator I) {
+
+  // Check that this particular call sequence is amenable to the
+  // transformation.
+  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+
+  // We expect to enter this at the beginning of a call sequence
+  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
+  MachineBasicBlock::iterator FrameSetup = I++;
+
+  // For globals in PIC mode, we can have some LEAs here.
+  // Ignore them, they don't bother us. Stop at the end of the block so that
+  // the end iterator is never dereferenced.
+  // TODO: Extend this to something that covers more cases.
+  while (I != MBB.end() && I->getOpcode() == X86::LEA32r)
+    ++I;
+
+  // We expect a copy instruction here.
+  // TODO: The copy instruction is a lowering artifact.
+  //       We should also support a copy-less version, where the stack
+  //       pointer is used directly.
+  if (I == MBB.end() || !I->isCopy() || !I->getOperand(0).isReg())
+    return false;
+  MachineBasicBlock::iterator SPCopy = I++;
+  // From here on, the register written by the copy stands in for the stack
+  // pointer when matching the stores.
+  unsigned StackPtr = SPCopy->getOperand(0).getReg();
+
+  // Scan the call setup sequence for the pattern we're looking for.
+  // We only handle a simple case - a sequence of MOV32mi or MOV32mr
+  // instructions, that push a sequence of 32-bit values onto the stack, with
+  // no gaps between them.
+  // MovVector[k] records the store that fills the slot at displacement 4*k.
+  SmallVector<MachineInstr*, 4> MovVector(4, nullptr);
+  unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4;
+  if (MaxAdjust > 4)
+    MovVector.resize(MaxAdjust, nullptr);
+
+  while (I != MBB.end()) {
+    int Opcode = I->getOpcode();
+    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
+      break;
+
+    // We only want movs of the form:
+    // movl imm/r32, k(%esp)
+    // If we run into something else, bail.
+    // Note that AddrBaseReg may, counter to its name, not be a register,
+    // but rather a frame index.
+    // TODO: Support the fi case. This should probably work now that we
+    // have the infrastructure to track the stack pointer within a call
+    // sequence.
+    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
+        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
+        !I->getOperand(X86::AddrScaleAmt).isImm() ||
+        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
+        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
+        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
+        !I->getOperand(X86::AddrDisp).isImm())
+      return false;
+
+    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
+    assert(StackDisp >= 0 && "Negative stack displacement when passing parameters");
+
+    // We really don't want to consider the unaligned case.
+    if (StackDisp % 4)
+      return false;
+    StackDisp /= 4;
+
+    assert((size_t)StackDisp < MovVector.size() &&
+      "Function call has more parameters than the stack is adjusted for.");
+
+    // If the same stack slot is being filled twice, something's fishy.
+    if (MovVector[StackDisp] != nullptr)
+      return false;
+    MovVector[StackDisp] = I;
+
+    ++I;
+  }
+
+  // We now expect the end of the sequence - a call and a stack adjust.
+  if (I == MBB.end())
+    return false;
+
+  // For PCrel calls, we expect an additional COPY of the basereg.
+  // If we find one, skip it.
+  if (I->isCopy()) {
+    if (I->getOperand(1).getReg() ==
+      MF.getInfo<X86MachineFunctionInfo>()->getGlobalBaseReg())
+      ++I;
+    else
+      return false;
+  }
+
+  // Skipping the COPY may have taken us to the end of the block.
+  if (I == MBB.end() || !I->isCall())
+    return false;
+  MachineBasicBlock::iterator Call = I;
+  ++I;
+  if (I == MBB.end() || I->getOpcode() != FrameDestroyOpcode)
+    return false;
+
+  // Now, go through the vector, and see that we don't have any gaps,
+  // but only a series of 32-bit MOVs.
+
+  int64_t ExpectedDist = 0;
+  auto MMI = MovVector.begin(), MME = MovVector.end();
+  for (; MMI != MME; ++MMI, ExpectedDist += 4)
+    if (*MMI == nullptr)
+      break;
+
+  // If the call had no parameters, do nothing
+  if (!ExpectedDist)
+    return false;
+
+  // We are either at the last parameter, or a gap.
+  // Make sure it's not a gap
+  for (; MMI != MME; ++MMI)
+    if (*MMI != nullptr)
+      return false;
+
+  // Ok, we can in fact do the transformation for this call.
+  // Do not remove the FrameSetup instruction, but adjust the parameters.
+  // PEI will end up finalizing the handling of this.
+  FrameSetup->getOperand(1).setImm(ExpectedDist);
+
+  // Whether a load may be folded into a push depends only on the subtarget,
+  // so the question is asked once rather than per store.
+  const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
+  const bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
+
+  DebugLoc DL = I->getDebugLoc();
+  // Now, iterate through the vector in reverse order, and replace the movs
+  // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
+  // replace uses.
+  for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
+    MachineBasicBlock::iterator MOV = *MovVector[Idx];
+    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+    if (MOV->getOpcode() == X86::MOV32mi) {
+      unsigned PushOpcode = X86::PUSHi32;
+      // If the operand is a small (8-bit) immediate, we can use a
+      // PUSH instruction with a shorter encoding.
+      // Note that isImm() may fail even though this is a MOVmi, because
+      // the operand can also be a symbol.
+      if (PushOp.isImm()) {
+        int64_t Val = PushOp.getImm();
+        if (isInt<8>(Val))
+          PushOpcode = X86::PUSH32i8;
+      }
+      BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
+    } else {
+      unsigned int Reg = PushOp.getReg();
+
+      // If PUSHrmm is not slow on this target, try to fold the source of the
+      // push into the instruction.
+      // Check that this is legal to fold. Right now, we're extremely
+      // conservative about that.
+      MachineInstr *DefMov = nullptr;
+      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
+        MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm));
+
+        // Move the load's trailing address operands over to the push.
+        unsigned NumOps = DefMov->getDesc().getNumOperands();
+        for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
+          Push->addOperand(DefMov->getOperand(i));
+
+        DefMov->eraseFromParent();
+      } else {
+        BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg);
+      }
+    }
+
+    MBB.erase(MOV);
+  }
+
+  // The stack-pointer copy is no longer used in the call sequences.
+  // There should not be any other users, but we can't commit to that, so:
+  if (MRI->use_empty(SPCopy->getOperand(0).getReg()))
+    SPCopy->eraseFromParent();
+
+  // Once we've done this, we need to make sure PEI doesn't assume a reserved
+  // frame.
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+  FuncInfo->setHasPushSequences(true);
+
+  return true;
+}
+
+/// Decide whether the value pushed from \p Reg can be pushed straight from
+/// memory instead.
+///
+/// This is an extremely restricted form of load folding. ISel will often
+/// create patterns like:
+///   movl    4(%edi), %eax
+///   movl    8(%edi), %ecx
+///   movl    12(%edi), %edx
+///   movl    %edx, 8(%esp)
+///   movl    %ecx, 4(%esp)
+///   movl    %eax, (%esp)
+///   call
+/// and the loads at the top of such a pattern are what this looks for.
+///
+/// \p FrameSetup is the call-frame-setup instruction of the call sequence.
+/// \returns the MOV32rm defining \p Reg when folding is allowed, otherwise
+/// nullptr.
+MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
+    MachineBasicBlock::iterator FrameSetup, unsigned Reg) {
+  // Only a virtual register whose single non-debug use is the push qualifies.
+  if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+      !MRI->hasOneNonDBGUse(Reg))
+    return nullptr;
+
+  MachineBasicBlock::iterator Load = MRI->getVRegDef(Reg);
+
+  // The def has to be a MOV from memory that lives in the same block as the
+  // call sequence; if the def is in another block, give up.
+  if (!(Load->getOpcode() == X86::MOV32rm &&
+        Load->getParent() == FrameSetup->getParent()))
+    return nullptr;
+
+  // Be careful with movs that load from a stack slot, since it may get
+  // resolved incorrectly.
+  // TODO: Again, we already have the infrastructure, so this should work.
+  if (!Load->getOperand(1).isReg())
+    return nullptr;
+
+  // Everything from the load up to the ADJCALLSTACK must itself be a MOV32rm.
+  // To be less conservative would require duplicating a lot of the logic from
+  // PeepholeOptimizer.
+  // FIXME: A possibly better approach would be to teach the PeepholeOptimizer
+  // to be smarter about folding into pushes.
+  MachineBasicBlock::iterator Scan = Load;
+  while (Scan != FrameSetup) {
+    if (Scan->getOpcode() != X86::MOV32rm)
+      return nullptr;
+    ++Scan;
+  }
+
+  return Load;
+}
diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
index 5d71eac7c05a..688a5447b8e6 100644
--- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp
@@ -1,3352 +1,3352 @@
 //===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file defines the X86-specific support for the FastISel class. Much
 // of the target-specific code is generated by tablegen in the file
 // X86GenFastISel.inc, which is #included here.
 //
 //===----------------------------------------------------------------------===//
 
 #include "X86.h"
 #include "X86CallingConv.h"
 #include "X86InstrBuilder.h"
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86RegisterInfo.h"
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
 namespace {
 
 /// FastISel implementation for X86. This declaration wires up the FastISel
 /// overrides, pulls in the tablegen-generated selectors, and declares the
 /// private helpers whose definitions follow later in the file.
 class X86FastISel final : public FastISel {
   /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
   /// make the right decision when generating code for different targets.
   const X86Subtarget *Subtarget;
 
   /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
   /// floating point ops.
   /// When SSE is available, use it for f32 operations.
   /// When SSE2 is available, use it for f64 operations.
   bool X86ScalarSSEf64;
   bool X86ScalarSSEf32;
 
 public:
   /// Caches the X86 subtarget and records whether f64 (SSE2) and f32 (SSE1)
   /// scalars are handled with SSE.
   explicit X86FastISel(FunctionLoweringInfo &funcInfo,
                        const TargetLibraryInfo *libInfo)
     : FastISel(funcInfo, libInfo) {
     Subtarget = &TM.getSubtarget<X86Subtarget>();
     X86ScalarSSEf64 = Subtarget->hasSSE2();
     X86ScalarSSEf32 = Subtarget->hasSSE1();
   }
 
   bool fastSelectInstruction(const Instruction *I) override;
 
   /// \brief The specified machine instr operand is a vreg, and that
   /// vreg is being provided by the specified load instruction.  If possible,
   /// try to fold the load as an operand to the instruction, returning true if
   /// possible.
   bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                            const LoadInst *LI) override;
 
   bool fastLowerArguments() override;
   bool fastLowerCall(CallLoweringInfo &CLI) override;
   bool fastLowerIntrinsicCall(const IntrinsicInst *II) override;
 
   // Tablegen-generated selection code (see the file header comment).
 #include "X86GenFastISel.inc"
 
 private:
   // Emission helpers for compares, loads, stores and extensions. The two
   // X86FastEmitStore overloads differ in how the stored value is supplied:
   // as an IR value or as an already-assigned register.
   bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT);
 
   bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO,
                        unsigned &ResultReg);
 
   bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM,
                         MachineMemOperand *MMO = nullptr, bool Aligned = false);
   bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
                         const X86AddressMode &AM,
                         MachineMemOperand *MMO = nullptr, bool Aligned = false);
 
   bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
                          unsigned &ResultReg);
 
   // Address matching: fill in an X86AddressMode from an IR value.
   bool X86SelectAddress(const Value *V, X86AddressMode &AM);
   bool X86SelectCallAddress(const Value *V, X86AddressMode &AM);
 
   // Selection routines named after the IR instruction kind they handle.
   bool X86SelectLoad(const Instruction *I);
 
   bool X86SelectStore(const Instruction *I);
 
   bool X86SelectRet(const Instruction *I);
 
   bool X86SelectCmp(const Instruction *I);
 
   bool X86SelectZExt(const Instruction *I);
 
   bool X86SelectBranch(const Instruction *I);
 
   bool X86SelectShift(const Instruction *I);
 
   bool X86SelectDivRem(const Instruction *I);
 
   bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I);
 
   bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I);
 
   bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I);
 
   bool X86SelectSelect(const Instruction *I);
 
   bool X86SelectTrunc(const Instruction *I);
 
   bool X86SelectFPExt(const Instruction *I);
   bool X86SelectFPTrunc(const Instruction *I);
 
   /// Returns the X86 instruction info, reached through the target machine's
   /// subtarget.
   const X86InstrInfo *getInstrInfo() const {
     return getTargetMachine()->getSubtargetImpl()->getInstrInfo();
   }
   /// Returns TM downcast to the X86 target machine.
   const X86TargetMachine *getTargetMachine() const {
     return static_cast<const X86TargetMachine *>(&TM);
   }
 
   bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
 
   // Materialization of constants, allocas and FP zero into registers; each
   // returns an unsigned register number.
   unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT);
   unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT);
   unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT);
   unsigned fastMaterializeConstant(const Constant *C) override;
 
   unsigned fastMaterializeAlloca(const AllocaInst *C) override;
 
   unsigned fastMaterializeFloatZero(const ConstantFP *CF) override;
 
   /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
   /// computed in an SSE register, not on the X87 floating point stack.
   bool isScalarFPTypeInSSEReg(EVT VT) const {
     return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
       (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
   }
 
   /// Returns true if Ty maps to a simple value type this selector handles and
   /// stores that type in VT; i1 is additionally accepted when AllowI1 is set.
   bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false);
 
   bool IsMemcpySmall(uint64_t Len);
 
   bool TryEmitSmallMemcpy(X86AddressMode DestAM,
                           X86AddressMode SrcAM, uint64_t Len);
 
   /// Checks whether the overflow condition of an XALU intrinsic can be folded
   /// into its user; CC is only updated on success.
   bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
                             const Value *Cond);
 };
 
 } // end anonymous namespace.
 
 /// Map an IR compare predicate to an X86 condition code.
 ///
 /// Returns the condition code paired with a NeedSwap flag. The code is
 /// X86::COND_INVALID for predicates without a case below and for FCMP_OEQ and
 /// FCMP_UNE. NeedSwap is set for FCMP_OLT, FCMP_OLE, FCMP_UGT and FCMP_UGE;
 /// presumably the caller must then swap the compare operands before using the
 /// returned code (confirm against the callers).
 static std::pair<X86::CondCode, bool>
 getX86ConditionCode(CmpInst::Predicate Predicate) {
   X86::CondCode CC = X86::COND_INVALID;
   bool NeedSwap = false;
   switch (Predicate) {
   default: break;
   // Floating-point Predicates
   // Each "NeedSwap = true" case deliberately falls through to the case
   // below it and shares that case's condition code.
   case CmpInst::FCMP_UEQ: CC = X86::COND_E;       break;
   case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through
   case CmpInst::FCMP_OGT: CC = X86::COND_A;       break;
   case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through
   case CmpInst::FCMP_OGE: CC = X86::COND_AE;      break;
   case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through
   case CmpInst::FCMP_ULT: CC = X86::COND_B;       break;
   case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through
   case CmpInst::FCMP_ULE: CC = X86::COND_BE;      break;
   case CmpInst::FCMP_ONE: CC = X86::COND_NE;      break;
   case CmpInst::FCMP_UNO: CC = X86::COND_P;       break;
   case CmpInst::FCMP_ORD: CC = X86::COND_NP;      break;
   case CmpInst::FCMP_OEQ: // fall-through
   case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
 
   // Integer Predicates
   case CmpInst::ICMP_EQ:  CC = X86::COND_E;       break;
   case CmpInst::ICMP_NE:  CC = X86::COND_NE;      break;
   case CmpInst::ICMP_UGT: CC = X86::COND_A;       break;
   case CmpInst::ICMP_UGE: CC = X86::COND_AE;      break;
   case CmpInst::ICMP_ULT: CC = X86::COND_B;       break;
   case CmpInst::ICMP_ULE: CC = X86::COND_BE;      break;
   case CmpInst::ICMP_SGT: CC = X86::COND_G;       break;
   case CmpInst::ICMP_SGE: CC = X86::COND_GE;      break;
   case CmpInst::ICMP_SLT: CC = X86::COND_L;       break;
   case CmpInst::ICMP_SLE: CC = X86::COND_LE;      break;
   }
 
   return std::make_pair(CC, NeedSwap);
 }
 
 /// Map a floating-point compare predicate to an SSE compare condition code.
 ///
 /// Returns the code paired with a NeedSwap flag, which is set for FCMP_OGT,
 /// FCMP_OGE, FCMP_ULE and FCMP_ULT; presumably the caller must then swap the
 /// compare operands (confirm against the callers). A predicate without a case
 /// below reaches llvm_unreachable.
 static std::pair<unsigned, bool>
 getX86SSEConditionCode(CmpInst::Predicate Predicate) {
   unsigned CC;
   bool NeedSwap = false;
 
   // SSE Condition code mapping:
   //  0 - EQ
   //  1 - LT
   //  2 - LE
   //  3 - UNORD
   //  4 - NEQ
   //  5 - NLT
   //  6 - NLE
   //  7 - ORD
   switch (Predicate) {
   default: llvm_unreachable("Unexpected predicate");
   // Each "NeedSwap = true" case deliberately falls through to the case
   // below it and shares that case's code.
   case CmpInst::FCMP_OEQ: CC = 0;          break;
   case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through
   case CmpInst::FCMP_OLT: CC = 1;          break;
   case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through
   case CmpInst::FCMP_OLE: CC = 2;          break;
   case CmpInst::FCMP_UNO: CC = 3;          break;
   case CmpInst::FCMP_UNE: CC = 4;          break;
   case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through
   case CmpInst::FCMP_UGE: CC = 5;          break;
   case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through
   case CmpInst::FCMP_UGT: CC = 6;          break;
   case CmpInst::FCMP_ORD: CC = 7;          break;
   // NOTE(review): 8 is outside the 0-7 table above; presumably callers treat
   // it as "no single SSE compare available" - confirm against the callers.
   case CmpInst::FCMP_UEQ:
   case CmpInst::FCMP_ONE: CC = 8;          break;
   }
 
   return std::make_pair(CC, NeedSwap);
 }
 
 /// \brief Decide whether the overflow condition produced by an XALU intrinsic
 /// can be folded into its user \p I.
 ///
 /// \p Cond has to be an extractvalue of a call to one of the *_with_overflow
 /// intrinsics handled below whose first result type is a legal i32 or i64. The
 /// call has to sit in the same basic block as \p I, and only extractvalues of
 /// that same call may appear between the call and \p I.
 ///
 /// \returns true if folding is possible; \p CC is written only in that case.
 bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I,
                                        const Value *Cond) {
   const auto *Extract = dyn_cast<ExtractValueInst>(Cond);
   if (!Extract)
     return false;
 
   const auto *Intrin = dyn_cast<IntrinsicInst>(Extract->getAggregateOperand());
   if (!Intrin)
     return false;
 
   // The arithmetic result is element 0 of the intrinsic's struct return type.
   Type *ResultTy =
     cast<StructType>(Intrin->getCalledFunction()->getReturnType())
       ->getTypeAtIndex(0U);
   MVT ResultVT;
   if (!isTypeLegal(ResultTy, ResultVT) ||
       (ResultVT != MVT::i32 && ResultVT != MVT::i64))
     return false;
 
   X86::CondCode FoldedCC;
   switch (Intrin->getIntrinsicID()) {
   case Intrinsic::sadd_with_overflow:
   case Intrinsic::ssub_with_overflow:
   case Intrinsic::smul_with_overflow:
   case Intrinsic::umul_with_overflow:
     FoldedCC = X86::COND_O;
     break;
   case Intrinsic::uadd_with_overflow:
   case Intrinsic::usub_with_overflow:
     FoldedCC = X86::COND_B;
     break;
   default:
     return false;
   }
 
   // The intrinsic and its user have to share a basic block.
   if (Intrin->getParent() != I->getParent())
     return false;
 
   // Walk backwards from the instruction just before I to the intrinsic; the
   // only thing allowed in between is an extractvalue of that intrinsic.
   BasicBlock::const_iterator Stop = Intrin;
   BasicBlock::const_iterator Pos = I;
   for (--Pos; Pos != Stop; --Pos) {
     const auto *Between = dyn_cast<ExtractValueInst>(&*Pos);
     if (!Between || Between->getAggregateOperand() != Intrin)
       return false;
   }
 
   CC = FoldedCC;
   return true;
 }
 
 /// Return true if \p Ty maps to a simple value type this selector handles,
 /// storing that type in \p VT.
 ///
 /// \p VT is also written when the type is simple but rejected afterwards: f32
 /// or f64 without the matching SSE level, f80, or a type the target lowering
 /// does not report as legal. i1 is additionally accepted when \p AllowI1 is
 /// set.
 bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) {
   const EVT ValueTy = TLI.getValueType(Ty, /*HandleUnknown=*/true);
 
   // Unhandled type. Halt "fast" selection and bail.
   if (ValueTy == MVT::Other)
     return false;
   if (!ValueTy.isSimple())
     return false;
 
   VT = ValueTy.getSimpleVT();
 
   // For now, floating-point operations require SSE/SSE2, since x87 requires
   // additional work. Similarly, there is no f80 support yet.
   const bool UnsupportedFP = (VT == MVT::f64 && !X86ScalarSSEf64) ||
                              (VT == MVT::f32 && !X86ScalarSSEf32) ||
                              VT == MVT::f80;
   if (UnsupportedFP)
     return false;
 
   if (AllowI1 && VT == MVT::i1)
     return true;
 
   // We only handle legal types. For example, on x86-32 the instruction
   // selector contains all of the 64-bit instructions from x86-64,
   // under the assumption that i64 won't be used if the target doesn't
   // support it.
   return TLI.isTypeLegal(VT);
 }
 
 #include "X86GenCallingConv.inc"
 
 /// X86FastEmitLoad - Emit a machine instruction that loads a value of type VT
 /// from the address described by AM into a newly created result register.
 /// MMO, when non-null, is attached to the emitted load.
 /// Returns true and sets ResultReg on success. Returns false, leaving
 /// ResultReg untouched, for f80 and any other type without a case below.
 bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
                                   MachineMemOperand *MMO, unsigned &ResultReg) {
   // Pick the load opcode and the register class of its result.
   unsigned LoadOpc = 0;
   const TargetRegisterClass *DestRC = nullptr;
 
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::i1:
   case MVT::i8:
     LoadOpc = X86::MOV8rm;
     DestRC = &X86::GR8RegClass;
     break;
   case MVT::i16:
     LoadOpc = X86::MOV16rm;
     DestRC = &X86::GR16RegClass;
     break;
   case MVT::i32:
     LoadOpc = X86::MOV32rm;
     DestRC = &X86::GR32RegClass;
     break;
   case MVT::i64:
     // Must be in x86-64 mode.
     LoadOpc = X86::MOV64rm;
     DestRC = &X86::GR64RegClass;
     break;
   case MVT::f32:
     if (!X86ScalarSSEf32) {
       LoadOpc = X86::LD_Fp32m;
       DestRC = &X86::RFP32RegClass;
     } else if (Subtarget->hasAVX()) {
       LoadOpc = X86::VMOVSSrm;
       DestRC = &X86::FR32RegClass;
     } else {
       LoadOpc = X86::MOVSSrm;
       DestRC = &X86::FR32RegClass;
     }
     break;
   case MVT::f64:
     if (!X86ScalarSSEf64) {
       LoadOpc = X86::LD_Fp64m;
       DestRC = &X86::RFP64RegClass;
     } else if (Subtarget->hasAVX()) {
       LoadOpc = X86::VMOVSDrm;
       DestRC = &X86::FR64RegClass;
     } else {
       LoadOpc = X86::MOVSDrm;
       DestRC = &X86::FR64RegClass;
     }
     break;
   case MVT::f80: // No f80 support yet.
   default:
     return false;
   }
 
   const unsigned DestReg = createResultReg(DestRC);
   ResultReg = DestReg;
 
   MachineInstrBuilder Load =
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(LoadOpc),
             DestReg);
   addFullAddress(Load, AM);
   if (MMO)
     Load->addMemOperand(*FuncInfo.MF, MMO);
 
   return true;
 }
 
 /// X86FastEmitStore - Emit a machine instruction that stores register ValReg,
 /// holding a value of type VT, to the address described by AM.
 /// An i1 value is first masked down to its lowest bit and then stored as an
 /// i8. For the vector types below, Aligned selects the aligned store opcode
 /// instead of the unaligned one. MMO, when non-null, is attached to the store.
 /// Returns false for f80 and any other type without a case below.
 bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
                                    const X86AddressMode &AM,
                                    MachineMemOperand *MMO, bool Aligned) {
   // Get the opcode for the given store instruction.
   unsigned Opc = 0;
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::f80: // No f80 support yet.
   default: return false;
   case MVT::i1: {
     // Mask out all but lowest bit.
     unsigned AndResult = createResultReg(&X86::GR8RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(X86::AND8ri), AndResult)
       .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
     // From here on the masked copy is the value being stored.
     ValReg = AndResult;
   }
   // FALLTHROUGH, handling i1 as i8.
   case MVT::i8:  Opc = X86::MOV8mr;  break;
   case MVT::i16: Opc = X86::MOV16mr; break;
   case MVT::i32: Opc = X86::MOV32mr; break;
   case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
   case MVT::f32:
     // SSE store (AVX-encoded when available), otherwise the x87 store.
     Opc = X86ScalarSSEf32 ?
           (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m;
     break;
   case MVT::f64:
     Opc = X86ScalarSSEf64 ?
           (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
     break;
   case MVT::v4f32:
     if (Aligned)
       Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
     else
       Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
     break;
   case MVT::v2f64:
     if (Aligned)
       Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr;
     else
       Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr;
     break;
   case MVT::v4i32:
   case MVT::v2i64:
   case MVT::v8i16:
   case MVT::v16i8:
     if (Aligned)
       Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr;
     else
       Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr;
     break;
   }
 
   // Emit the store: the address operands come first, then the stored register.
   MachineInstrBuilder MIB =
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc));
   addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill));
   if (MMO)
     MIB->addMemOperand(*FuncInfo.MF, MMO);
 
   return true;
 }
 
 /// Store the IR value Val, of type VT, to the address described by AM.
 ///
 /// A null pointer constant is treated as an integer zero of pointer width. A
 /// ConstantInt of type i1/i8/i16/i32, or an i64 constant that fits a
 /// sign-extended 32-bit immediate, is stored with a store-immediate
 /// instruction. Everything else is placed in a register and handed to the
 /// register form of X86FastEmitStore together with MMO and Aligned.
 ///
 /// Returns true if a store was emitted.
 bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
                                    const X86AddressMode &AM,
                                    MachineMemOperand *MMO, bool Aligned) {
   // Handle 'null' like i32/i64 0.
   if (isa<ConstantPointerNull>(Val))
     Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext()));
 
   // If this is a store of a simple constant, pick the store-immediate opcode
   // that folds the constant into the store.
   const ConstantInt *ConstVal = dyn_cast<ConstantInt>(Val);
   unsigned ImmStoreOpc = 0;
   bool SignExtendImm = true;
   if (ConstVal) {
     switch (VT.getSimpleVT().SimpleTy) {
     case MVT::i1:
       // i1 is stored as an i8, but its immediate is zero-extended.
       SignExtendImm = false;
       ImmStoreOpc = X86::MOV8mi;
       break;
     case MVT::i8:
       ImmStoreOpc = X86::MOV8mi;
       break;
     case MVT::i16:
       ImmStoreOpc = X86::MOV16mi;
       break;
     case MVT::i32:
       ImmStoreOpc = X86::MOV32mi;
       break;
     case MVT::i64:
       // Must be a 32-bit sign extended value.
       if (isInt<32>(ConstVal->getSExtValue()))
         ImmStoreOpc = X86::MOV64mi32;
       break;
     default:
       break;
     }
   }
 
   if (ImmStoreOpc) {
     const uint64_t Imm = SignExtendImm ? (uint64_t) ConstVal->getSExtValue()
                                        : ConstVal->getZExtValue();
     MachineInstrBuilder Store =
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ImmStoreOpc));
     addFullAddress(Store, AM).addImm(Imm);
     if (MMO)
       Store->addMemOperand(*FuncInfo.MF, MMO);
     return true;
   }
 
   // No immediate form applies: the value has to live in a register.
   const unsigned SrcReg = getRegForValue(Val);
   if (SrcReg == 0)
     return false;
 
   const bool SrcIsKill = hasTrivialKill(Val);
   return X86FastEmitStore(VT, SrcReg, SrcIsKill, AM, MMO, Aligned);
 }
 
 /// X86FastEmitExtend - Extend the value in register Src from type SrcVT to
 /// type DstVT with the extension opcode Opc (e.g. ISD::SIGN_EXTEND).
 /// Returns true and stores the new register in ResultReg on success; returns
 /// false and leaves ResultReg untouched when no register was produced.
 bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
                                     unsigned Src, EVT SrcVT,
                                     unsigned &ResultReg) {
   const unsigned Extended =
     fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
                Src, /*TODO: Kill=*/false);
 
   // A zero register number means nothing was emitted.
   if (Extended != 0) {
     ResultReg = Extended;
     return true;
   }
   return false;
 }
 
 /// Try to describe the address given by V in AM.
 ///
 /// A GlobalValue is rejected under a non-small code model or when it is
 /// thread-local. Otherwise it is either referenced directly through AM.GV or,
 /// when the subtarget classifies it as a stub reference, loaded from its stub
 /// into a register that becomes AM's base register. When the global path does
 /// not apply, V is materialized into a register and placed in AM's base or
 /// index register if one of them is still free, unless AM already names a
 /// global under RIP-relative PIC.
 ///
 /// Returns true if AM was updated to describe the address.
 bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
   // Handle constant address.
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
     // Can't handle alternate code models yet.
     if (TM.getCodeModel() != CodeModel::Small)
       return false;
 
     // Can't handle TLS yet.
     if (GV->isThreadLocal())
       return false;
 
     // RIP-relative addresses can't have additional register operands, so if
     // we've already folded stuff into the addressing mode, just force the
     // global value into its own register, which we can use as the basereg.
     if (!Subtarget->isPICStyleRIPRel() ||
         (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
       // Okay, we've committed to selecting this global. Set up the address.
       AM.GV = GV;
 
       // Allow the subtarget to classify the global.
       unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
 
       // If this reference is relative to the pic base, set it now.
       if (isGlobalRelativeToPICBase(GVFlags)) {
         // FIXME: How do we know Base.Reg is free??
         AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
       }
 
       // Unless the ABI requires an extra load, return a direct reference to
       // the global.
       if (!isGlobalStubReference(GVFlags)) {
         if (Subtarget->isPICStyleRIPRel()) {
           // Use rip-relative addressing if we can.  Above we verified that the
           // base and index registers are unused.
           assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
           AM.Base.Reg = X86::RIP;
         }
         AM.GVOpFlags = GVFlags;
         return true;
       }
 
       // Ok, we need to do a load from a stub.  If we've already loaded from
       // this stub, reuse the loaded pointer, otherwise emit the load now.
       DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
       unsigned LoadReg;
       if (I != LocalValueMap.end() && I->second != 0) {
         LoadReg = I->second;
       } else {
         // Issue load from stub.
         unsigned Opc = 0;
         const TargetRegisterClass *RC = nullptr;
         // The stub address reuses AM's current base register together with
         // the global and its classification flags.
         X86AddressMode StubAM;
         StubAM.Base.Reg = AM.Base.Reg;
         StubAM.GV = GV;
         StubAM.GVOpFlags = GVFlags;
 
         // Prepare for inserting code in the local-value area.
         SavePoint SaveInsertPt = enterLocalValueArea();
 
         // Choose a pointer-sized load: 64-bit when pointers are i64,
         // otherwise 32-bit.
         if (TLI.getPointerTy() == MVT::i64) {
           Opc = X86::MOV64rm;
           RC  = &X86::GR64RegClass;
 
           if (Subtarget->isPICStyleRIPRel())
             StubAM.Base.Reg = X86::RIP;
         } else {
           Opc = X86::MOV32rm;
           RC  = &X86::GR32RegClass;
         }
 
         LoadReg = createResultReg(RC);
         MachineInstrBuilder LoadMI =
           BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg);
         addFullAddress(LoadMI, StubAM);
 
         // Ok, back to normal mode.
         leaveLocalValueArea(SaveInsertPt);
 
         // Prevent loading GV stub multiple times in same MBB.
         LocalValueMap[V] = LoadReg;
       }
 
       // Now construct the final address. Note that the Disp, Scale,
       // and Index values may already be set here.
       AM.Base.Reg = LoadReg;
       AM.GV = nullptr;
       return true;
     }
   }
 
   // If all else fails, try to materialize the value in a register.
   if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
     // Prefer the base register slot; fall back to the index register slot.
     if (AM.Base.Reg == 0) {
       AM.Base.Reg = getRegForValue(V);
       return AM.Base.Reg != 0;
     }
     if (AM.IndexReg == 0) {
       assert(AM.Scale == 1 && "Scale with no index!");
       AM.IndexReg = getRegForValue(V);
       return AM.IndexReg != 0;
     }
   }
 
   return false;
 }
 
 /// X86SelectAddress - Attempt to fill in an address from the given value.
 ///
 bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
   SmallVector<const Value *, 32> GEPs;
 redo_gep:
   const User *U = nullptr;
   unsigned Opcode = Instruction::UserOp1;
   if (const Instruction *I = dyn_cast<Instruction>(V)) {
     // Don't walk into other basic blocks; it's possible we haven't
     // visited them yet, so the instructions may not yet be assigned
     // virtual registers.
     if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(V)) ||
         FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
       Opcode = I->getOpcode();
       U = I;
     }
   } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
     Opcode = C->getOpcode();
     U = C;
   }
 
   if (PointerType *Ty = dyn_cast<PointerType>(V->getType()))
     if (Ty->getAddressSpace() > 255)
       // Fast instruction selection doesn't support the special
       // address spaces.
       return false;
 
   switch (Opcode) {
   default: break;
   case Instruction::BitCast:
     // Look past bitcasts.
     return X86SelectAddress(U->getOperand(0), AM);
 
   case Instruction::IntToPtr:
     // Look past no-op inttoptrs.
     if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
       return X86SelectAddress(U->getOperand(0), AM);
     break;
 
   case Instruction::PtrToInt:
     // Look past no-op ptrtoints.
     if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
       return X86SelectAddress(U->getOperand(0), AM);
     break;
 
   case Instruction::Alloca: {
     // Do static allocas.
     const AllocaInst *A = cast<AllocaInst>(V);
     DenseMap<const AllocaInst *, int>::iterator SI =
       FuncInfo.StaticAllocaMap.find(A);
     if (SI != FuncInfo.StaticAllocaMap.end()) {
       AM.BaseType = X86AddressMode::FrameIndexBase;
       AM.Base.FrameIndex = SI->second;
       return true;
     }
     break;
   }
 
   case Instruction::Add: {
     // Adds of constants are common and easy enough.
     if (const ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
       uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
       // They have to fit in the 32-bit signed displacement field though.
       if (isInt<32>(Disp)) {
         AM.Disp = (uint32_t)Disp;
         return X86SelectAddress(U->getOperand(0), AM);
       }
     }
     break;
   }
 
   case Instruction::GetElementPtr: {
     X86AddressMode SavedAM = AM;
 
     // Pattern-match simple GEPs.
     uint64_t Disp = (int32_t)AM.Disp;
     unsigned IndexReg = AM.IndexReg;
     unsigned Scale = AM.Scale;
     gep_type_iterator GTI = gep_type_begin(U);
     // Iterate through the indices, folding what we can. Constants can be
     // folded, and one dynamic index can be handled, if the scale is supported.
     for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end();
          i != e; ++i, ++GTI) {
       const Value *Op = *i;
       if (StructType *STy = dyn_cast<StructType>(*GTI)) {
         const StructLayout *SL = DL.getStructLayout(STy);
         Disp += SL->getElementOffset(cast<ConstantInt>(Op)->getZExtValue());
         continue;
       }
 
       // A array/variable index is always of the form i*S where S is the
       // constant scale size.  See if we can push the scale into immediates.
       uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType());
       for (;;) {
         if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
           // Constant-offset addressing.
           Disp += CI->getSExtValue() * S;
           break;
         }
         if (canFoldAddIntoGEP(U, Op)) {
           // A compatible add with a constant operand. Fold the constant.
           ConstantInt *CI =
             cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
           Disp += CI->getSExtValue() * S;
           // Iterate on the other operand.
           Op = cast<AddOperator>(Op)->getOperand(0);
           continue;
         }
         if (IndexReg == 0 &&
             (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
             (S == 1 || S == 2 || S == 4 || S == 8)) {
           // Scaled-index addressing.
           Scale = S;
           IndexReg = getRegForGEPIndex(Op).first;
           if (IndexReg == 0)
             return false;
           break;
         }
         // Unsupported.
         goto unsupported_gep;
       }
     }
 
     // Check for displacement overflow.
     if (!isInt<32>(Disp))
       break;
 
     AM.IndexReg = IndexReg;
     AM.Scale = Scale;
     AM.Disp = (uint32_t)Disp;
     GEPs.push_back(V);
 
     if (const GetElementPtrInst *GEP =
           dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
       // Ok, the GEP indices were covered by constant-offset and scaled-index
       // addressing. Update the address state and move on to examining the base.
       V = GEP;
       goto redo_gep;
     } else if (X86SelectAddress(U->getOperand(0), AM)) {
       return true;
     }
 
     // If we couldn't merge the gep value into this addr mode, revert back to
     // our address and just match the value instead of completely failing.
     AM = SavedAM;
 
     for (SmallVectorImpl<const Value *>::reverse_iterator
            I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I)
       if (handleConstantAddresses(*I, AM))
         return true;
 
     return false;
   unsupported_gep:
     // Ok, the GEP indices weren't all covered.
     break;
   }
   }
 
   return handleConstantAddresses(V, AM);
 }
 
 /// X86SelectCallAddress - Attempt to fill in an address from the given value.
 ///
 bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
   const User *U = nullptr;
   unsigned Opcode = Instruction::UserOp1;
   const Instruction *I = dyn_cast<Instruction>(V);
   // Record if the value is defined in the same basic block.
   //
   // This information is crucial to know whether or not folding an
   // operand is valid.
   // Indeed, FastISel generates or reuses a virtual register for all
   // operands of all instructions it selects. Obviously, the definition and
   // its uses must use the same virtual register otherwise the produced
   // code is incorrect.
   // Before instruction selection, FunctionLoweringInfo::set sets the virtual
   // registers for values that are alive across basic blocks. This ensures
   // that the values are consistently set between across basic block, even
   // if different instruction selection mechanisms are used (e.g., a mix of
   // SDISel and FastISel).
   // For values local to a basic block, the instruction selection process
   // generates these virtual registers with whatever method is appropriate
   // for its needs. In particular, FastISel and SDISel do not share the way
   // local virtual registers are set.
   // Therefore, this is impossible (or at least unsafe) to share values
   // between basic blocks unless they use the same instruction selection
   // method, which is not guarantee for X86.
   // Moreover, things like hasOneUse could not be used accurately, if we
   // allow to reference values across basic blocks whereas they are not
   // alive across basic blocks initially.
   bool InMBB = true;
   if (I) {
     Opcode = I->getOpcode();
     U = I;
     InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
   } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
     Opcode = C->getOpcode();
     U = C;
   }
 
   switch (Opcode) {
   default: break;
   case Instruction::BitCast:
     // Look past bitcasts if its operand is in the same BB.
     if (InMBB)
       return X86SelectCallAddress(U->getOperand(0), AM);
     break;
 
   case Instruction::IntToPtr:
     // Look past no-op inttoptrs if its operand is in the same BB.
     if (InMBB &&
         TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
       return X86SelectCallAddress(U->getOperand(0), AM);
     break;
 
   case Instruction::PtrToInt:
     // Look past no-op ptrtoints if its operand is in the same BB.
     if (InMBB &&
         TLI.getValueType(U->getType()) == TLI.getPointerTy())
       return X86SelectCallAddress(U->getOperand(0), AM);
     break;
   }
 
   // Handle constant address.
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
     // Can't handle alternate code models yet.
     if (TM.getCodeModel() != CodeModel::Small)
       return false;
 
     // RIP-relative addresses can't have additional register operands.
     if (Subtarget->isPICStyleRIPRel() &&
         (AM.Base.Reg != 0 || AM.IndexReg != 0))
       return false;
 
     // Can't handle DLL Import.
     if (GV->hasDLLImportStorageClass())
       return false;
 
     // Can't handle TLS.
     if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
       if (GVar->isThreadLocal())
         return false;
 
     // Okay, we've committed to selecting this global. Set up the basic address.
     AM.GV = GV;
 
     // No ABI requires an extra load for anything other than DLLImport, which
     // we rejected above. Return a direct reference to the global.
     if (Subtarget->isPICStyleRIPRel()) {
       // Use rip-relative addressing if we can.  Above we verified that the
       // base and index registers are unused.
       assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
       AM.Base.Reg = X86::RIP;
     } else if (Subtarget->isPICStyleStubPIC()) {
       AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
     } else if (Subtarget->isPICStyleGOT()) {
       AM.GVOpFlags = X86II::MO_GOTOFF;
     }
 
     return true;
   }
 
   // If all else fails, try to materialize the value in a register.
   if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
     if (AM.Base.Reg == 0) {
       AM.Base.Reg = getRegForValue(V);
       return AM.Base.Reg != 0;
     }
     if (AM.IndexReg == 0) {
       assert(AM.Scale == 1 && "Scale with no index!");
       AM.IndexReg = getRegForValue(V);
       return AM.IndexReg != 0;
     }
   }
 
   return false;
 }
 
 
 /// X86SelectStore - Select and emit code to implement store instructions.
 bool X86FastISel::X86SelectStore(const Instruction *I) {
   // Atomic stores need special handling.
   const StoreInst *S = cast<StoreInst>(I);
 
   if (S->isAtomic())
     return false;
 
   const Value *Val = S->getValueOperand();
   const Value *Ptr = S->getPointerOperand();
 
   MVT VT;
   if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
     return false;
 
   unsigned Alignment = S->getAlignment();
   unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
   if (Alignment == 0) // Ensure that codegen never sees alignment 0
     Alignment = ABIAlignment;
   bool Aligned = Alignment >= ABIAlignment;
 
   X86AddressMode AM;
   if (!X86SelectAddress(Ptr, AM))
     return false;
 
   return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned);
 }
 
 /// X86SelectRet - Select and emit code to implement ret instructions.
 ///
 /// Handles only the simple cases: a supported calling convention, no bytes
 /// popped on return, no varargs, and at most one return value that is
 /// assigned in full to a non-x87 register. Returns false for everything else
 /// so that the instruction is not selected here, and true once the RET has
 /// been emitted.
 bool X86FastISel::X86SelectRet(const Instruction *I) {
   const ReturnInst *Ret = cast<ReturnInst>(I);
   const Function &F = *I->getParent()->getParent();
   const X86MachineFunctionInfo *X86MFInfo =
       FuncInfo.MF->getInfo<X86MachineFunctionInfo>();

   if (!FuncInfo.CanLowerReturn)
     return false;

   // Only these calling conventions are handled.
   CallingConv::ID CC = F.getCallingConv();
   if (CC != CallingConv::C &&
       CC != CallingConv::Fast &&
       CC != CallingConv::X86_FastCall &&
       CC != CallingConv::X86_64_SysV)
     return false;

   if (Subtarget->isCallingConvWin64(CC))
     return false;

   // Don't handle popping bytes on return for now.
   if (X86MFInfo->getBytesToPopOnReturn() != 0)
     return false;

   // fastcc with -tailcallopt is intended to provide a guaranteed
   // tail call optimization. Fastisel doesn't know how to do that.
   if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
     return false;

   // Let SDISel handle vararg functions.
   if (F.isVarArg())
     return false;

   // Build a list of return value registers.
   SmallVector<unsigned, 4> RetRegs;

   if (Ret->getNumOperands() > 0) {
     SmallVector<ISD::OutputArg, 4> Outs;
     GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);

     // Analyze operands of the call, assigning locations to each operand.
     SmallVector<CCValAssign, 16> ValLocs;
     CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext());
     CCInfo.AnalyzeReturn(Outs, RetCC_X86);

     const Value *RV = Ret->getOperand(0);
     unsigned Reg = getRegForValue(RV);
     if (Reg == 0)
       return false;

     // Only handle a single return value for now.
     if (ValLocs.size() != 1)
       return false;

     CCValAssign &VA = ValLocs[0];

     // Don't bother handling odd stuff for now.
     if (VA.getLocInfo() != CCValAssign::Full)
       return false;
     // Only handle register returns for now.
     if (!VA.isRegLoc())
       return false;

     // The calling-convention tables for x87 returns don't tell
     // the whole story.
     if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
       return false;

     unsigned SrcReg = Reg + VA.getValNo();
     EVT SrcVT = TLI.getValueType(RV->getType());
     EVT DstVT = VA.getValVT();
     // Special handling for extended integers.
     if (SrcVT != DstVT) {
       // Only i1, i8 and i16 sources are extended here.
       if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16)
         return false;

       // The return value must carry an explicit zext or sext flag.
       if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt())
         return false;

       assert(DstVT == MVT::i32 && "X86 should always ext to i32");

       // i1 is first zero-extended to i8; a sign-extended i1 is not handled.
       if (SrcVT == MVT::i1) {
         if (Outs[0].Flags.isSExt())
           return false;
         SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
         SrcVT = MVT::i8;
       }
       unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND :
                                              ISD::SIGN_EXTEND;
       SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op,
                           SrcReg, /*TODO: Kill=*/false);
     }

     // Make the copy.
     unsigned DstReg = VA.getLocReg();
     const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
     // Avoid a cross-class copy. This is very unlikely.
     if (!SrcRC->contains(DstReg))
       return false;
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg);

     // Add register to return instruction.
     RetRegs.push_back(VA.getLocReg());
   }

   // The x86-64 ABI for returning structs by value requires that we copy
   // the sret argument into %rax for the return. We saved the argument into
   // a virtual register in the entry block, so now we copy the value out
   // and into %rax. We also do the same with %eax for Win32.
   if (F.hasStructRetAttr() &&
       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
     unsigned Reg = X86MFInfo->getSRetReturnReg();
     assert(Reg &&
            "SRetReturnReg should have been set in LowerFormalArguments()!");
     unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), RetReg).addReg(Reg);
     RetRegs.push_back(RetReg);
   }

   // Now emit the RET.
   MachineInstrBuilder MIB =
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL));
   // Attach every return register to the RET as an implicit operand.
   for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
     MIB.addReg(RetRegs[i], RegState::Implicit);
   return true;
 }
 
 /// X86SelectLoad - Select and emit code to implement load instructions.
 ///
 bool X86FastISel::X86SelectLoad(const Instruction *I) {
   const LoadInst *LI = cast<LoadInst>(I);
 
   // Atomic loads need special handling.
   if (LI->isAtomic())
     return false;
 
   MVT VT;
   if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true))
     return false;
 
   const Value *Ptr = LI->getPointerOperand();
 
   X86AddressMode AM;
   if (!X86SelectAddress(Ptr, AM))
     return false;
 
   unsigned ResultReg = 0;
   if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg))
     return false;
 
   updateValueMap(I, ResultReg);
   return true;
 }
 
 /// X86ChooseCmpOpcode - Return the register-register compare opcode for
 /// \p VT, or 0 when there is none. The floating-point opcodes depend on
 /// which SSE/AVX features \p Subtarget reports.
 static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::i8:
     return X86::CMP8rr;
   case MVT::i16:
     return X86::CMP16rr;
   case MVT::i32:
     return X86::CMP32rr;
   case MVT::i64:
     return X86::CMP64rr;
   case MVT::f32:
     // f32 compares require SSE1.
     if (!Subtarget->hasSSE1())
       return 0;
     return Subtarget->hasAVX() ? X86::VUCOMISSrr : X86::UCOMISSrr;
   case MVT::f64:
     // f64 compares require SSE2.
     if (!Subtarget->hasSSE2())
       return 0;
     return Subtarget->hasAVX() ? X86::VUCOMISDrr : X86::UCOMISDrr;
   default:
     return 0;
   }
 }
 
 /// X86ChooseCmpImmediateOpcode - If the constant \p RHSC can be folded as the
 /// immediate right-hand side of a comparison of type \p VT, return an opcode
 /// that works for the compare (e.g. CMP32ri); otherwise return 0.
 static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
   switch (VT.getSimpleVT().SimpleTy) {
   // Otherwise, we can't fold the immediate into this comparison.
   default: return 0;
   case MVT::i8: return X86::CMP8ri;
   case MVT::i16: return X86::CMP16ri;
   case MVT::i32: return X86::CMP32ri;
   case MVT::i64: {
     // 64-bit comparisons are only valid if the immediate fits in a 32-bit
     // sign-extended field. Round-trip through a fixed-width int32_t so the
     // check does not depend on the width of the host's 'int'.
     const int64_t Imm = RHSC->getSExtValue();
     if (static_cast<int32_t>(Imm) == Imm)
       return X86::CMP64ri32;
     return 0;
   }
   }
 }
 
 bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1,
                                      EVT VT) {
   unsigned Op0Reg = getRegForValue(Op0);
   if (Op0Reg == 0) return false;
 
   // Handle 'null' like i32/i64 0.
   if (isa<ConstantPointerNull>(Op1))
     Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext()));
 
   // We have two options: compare with register or immediate.  If the RHS of
   // the compare is an immediate that we can fold into this compare, use
   // CMPri, otherwise use CMPrr.
   if (const ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
     if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareImmOpc))
         .addReg(Op0Reg)
         .addImm(Op1C->getSExtValue());
       return true;
     }
   }
 
   unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
   if (CompareOpc == 0) return false;
 
   unsigned Op1Reg = getRegForValue(Op1);
   if (Op1Reg == 0) return false;
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareOpc))
     .addReg(Op0Reg)
     .addReg(Op1Reg);
 
   return true;
 }
 
 /// X86SelectCmp - Select and emit code for a compare instruction, leaving the
 /// result in an 8-bit register that is recorded as the value of \p I.
 ///
 /// Always-false and always-true predicates are materialized as constants.
 /// FCMP_OEQ and FCMP_UNE are built from two flag reads combined with AND/OR.
 /// Every other predicate becomes a compare followed by a single SETcc.
 /// Returns false when the operand type is not legal or emission fails.
 bool X86FastISel::X86SelectCmp(const Instruction *I) {
   const CmpInst *CI = cast<CmpInst>(I);

   MVT VT;
   if (!isTypeLegal(I->getOperand(0)->getType(), VT))
     return false;

   // Try to optimize or fold the cmp.
   CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
   unsigned ResultReg = 0;
   switch (Predicate) {
   default: break;
   case CmpInst::FCMP_FALSE: {
     // Constant false: zero a 32-bit register and take its low 8 bits.
     ResultReg = createResultReg(&X86::GR32RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0),
             ResultReg);
     ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
                                            X86::sub_8bit);
     if (!ResultReg)
       return false;
     break;
   }
   case CmpInst::FCMP_TRUE: {
     // Constant true: load the immediate 1 into an 8-bit register.
     ResultReg = createResultReg(&X86::GR8RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
             ResultReg).addImm(1);
     break;
   }
   }

   // A constant result was materialized above; nothing left to compare.
   if (ResultReg) {
     updateValueMap(I, ResultReg);
     return true;
   }

   const Value *LHS = CI->getOperand(0);
   const Value *RHS = CI->getOperand(1);

   // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
   // We don't have to materialize a zero constant for this case and can just use
   // %x again on the RHS.
   if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
     const auto *RHSC = dyn_cast<ConstantFP>(RHS);
     if (RHSC && RHSC->isNullValue())
       RHS = LHS;
   }

   // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction.
   // Each row holds: first SETcc, second SETcc, opcode combining the two.
   static unsigned SETFOpcTable[2][3] = {
     { X86::SETEr,  X86::SETNPr, X86::AND8rr },
     { X86::SETNEr, X86::SETPr,  X86::OR8rr  }
   };
   unsigned *SETFOpc = nullptr;
   switch (Predicate) {
   default: break;
   case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break;
   case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break;
   }

   ResultReg = createResultReg(&X86::GR8RegClass);
   if (SETFOpc) {
     if (!X86FastEmitCompare(LHS, RHS, VT))
       return false;

     // Read the two flags into separate registers, then combine them.
     unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
     unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
             FlagReg1);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
             FlagReg2);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]),
             ResultReg).addReg(FlagReg1).addReg(FlagReg2);
     updateValueMap(I, ResultReg);
     return true;
   }

   // General case: map the predicate to a condition code and use one SETcc.
   X86::CondCode CC;
   bool SwapArgs;
   std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
   assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
   unsigned Opc = X86::getSETFromCond(CC);

   // The chosen condition code may require the operands in reverse order.
   if (SwapArgs)
     std::swap(LHS, RHS);

   // Emit a compare of LHS/RHS.
   if (!X86FastEmitCompare(LHS, RHS, VT))
     return false;

   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
   updateValueMap(I, ResultReg);
   return true;
 }
 
 /// X86SelectZExt - Select and emit code for a zext instruction. An i1 source
 /// is first widened to i8; an i64 destination is produced through a 32-bit
 /// move followed by SUBREG_TO_REG. Returns false when the destination type is
 /// not legal or an emission step fails.
 bool X86FastISel::X86SelectZExt(const Instruction *I) {
   EVT DstVT = TLI.getValueType(I->getType());
   if (!TLI.isTypeLegal(DstVT))
     return false;

   unsigned Reg = getRegForValue(I->getOperand(0));
   if (!Reg)
     return false;

   // Zero-extension from i1 is common: clear the high bits to get an i8.
   MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType());
   if (SrcVT.SimpleTy == MVT::i1) {
     Reg = fastEmitZExtFromI1(MVT::i8, Reg, /*TODO: Kill=*/false);
     SrcVT = MVT::i8;
     if (!Reg)
       return false;
   }

   if (DstVT == MVT::i64) {
     // Extend to 64 bits via sub-registers: produce a 32-bit value first, then
     // wrap it with SUBREG_TO_REG.
     unsigned Ext32Opc;
     switch (SrcVT.SimpleTy) {
     default:
       llvm_unreachable("Unexpected zext to i64 source type");
     case MVT::i8:
       Ext32Opc = X86::MOVZX32rr8;
       break;
     case MVT::i16:
       Ext32Opc = X86::MOVZX32rr16;
       break;
     case MVT::i32:
       Ext32Opc = X86::MOV32rr;
       break;
     }

     unsigned Low32 = createResultReg(&X86::GR32RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Ext32Opc), Low32)
       .addReg(Reg);

     Reg = createResultReg(&X86::GR64RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::SUBREG_TO_REG), Reg)
       .addImm(0)
       .addReg(Low32)
       .addImm(X86::sub_32bit);
   } else if (DstVT != MVT::i8) {
     // Any other destination wider than i8 is extended from i8.
     Reg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, Reg,
                      /*Kill=*/true);
     if (!Reg)
       return false;
   }

   updateValueMap(I, Reg);
   return true;
 }
 
 /// X86SelectBranch - Select and emit code for a conditional branch.
 ///
 /// Three foldable condition shapes are tried in turn: a single-use compare in
 /// the same block, a single-use truncate in the same block, and a condition
 /// accepted by foldX86XALUIntrinsic. If none applies, the condition value is
 /// obtained in a register, tested against 1 and branched on. Returns false
 /// when a needed register or compare cannot be produced.
 bool X86FastISel::X86SelectBranch(const Instruction *I) {
   // Unconditional branches are selected by tablegen-generated code.
   // Handle a conditional branch.
   const BranchInst *BI = cast<BranchInst>(I);
   MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
   MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];

   // Fold the common case of a conditional branch with a comparison
   // in the same block (values defined on other blocks may not have
   // initialized registers).
   X86::CondCode CC;
   if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
     if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
       EVT VT = TLI.getValueType(CI->getOperand(0)->getType());

       // Try to optimize or fold the cmp.
       CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
       switch (Predicate) {
       default: break;
       // A constant condition becomes an unconditional branch.
       case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
       case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, DbgLoc); return true;
       }

       const Value *CmpLHS = CI->getOperand(0);
       const Value *CmpRHS = CI->getOperand(1);

       // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
       // 0.0.
       // We don't have to materialize a zero constant for this case and can just
       // use %x again on the RHS.
       if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
         const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
         if (CmpRHSC && CmpRHSC->isNullValue())
           CmpRHS = CmpLHS;
       }

       // Try to take advantage of fallthrough opportunities.
       // If the true block is the layout successor, invert the predicate and
       // swap the targets so that the conditional jump goes to the other block.
       if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
         std::swap(TrueMBB, FalseMBB);
         Predicate = CmpInst::getInversePredicate(Predicate);
       }

       // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
       // code check. Instead two branch instructions are required to check all
       // the flags. First we change the predicate to a supported condition code,
       // which will be the first branch. Later on we will emit the second
       // branch.
       bool NeedExtraBranch = false;
       switch (Predicate) {
       default: break;
       case CmpInst::FCMP_OEQ:
         std::swap(TrueMBB, FalseMBB); // fall-through
       case CmpInst::FCMP_UNE:
         NeedExtraBranch = true;
         Predicate = CmpInst::FCMP_ONE;
         break;
       }

       bool SwapArgs;
       unsigned BranchOpc;
       std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
       assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");

       BranchOpc = X86::GetCondBranchFromCond(CC);
       // The chosen condition code may require the operands in reverse order.
       if (SwapArgs)
         std::swap(CmpLHS, CmpRHS);

       // Emit a compare of the LHS and RHS, setting the flags.
       if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT))
         return false;

       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
         .addMBB(TrueMBB);

       // X86 requires a second branch to handle UNE (and OEQ, which is mapped
       // to UNE above).
       if (NeedExtraBranch) {
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
           .addMBB(TrueMBB);
       }

       // Obtain the branch weight and add the TrueBB to the successor list.
       uint32_t BranchWeight = 0;
       if (FuncInfo.BPI)
         BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
                                                    TrueMBB->getBasicBlock());
       FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);

       // Emits an unconditional branch to the FalseBB, obtains the branch
       // weight, and adds it to the successor list.
       fastEmitBranch(FalseMBB, DbgLoc);

       return true;
     }
   } else if (TruncInst *TI = dyn_cast<TruncInst>(BI->getCondition())) {
     // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
     // typically happen for _Bool and C++ bools.
     MVT SourceVT;
     if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
         isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
       // Pick the TEST-with-immediate opcode matching the source width.
       unsigned TestOpc = 0;
       switch (SourceVT.SimpleTy) {
       default: break;
       case MVT::i8:  TestOpc = X86::TEST8ri; break;
       case MVT::i16: TestOpc = X86::TEST16ri; break;
       case MVT::i32: TestOpc = X86::TEST32ri; break;
       case MVT::i64: TestOpc = X86::TEST64ri32; break;
       }
       if (TestOpc) {
         unsigned OpReg = getRegForValue(TI->getOperand(0));
         if (OpReg == 0) return false;
         // Test only the low bit of the untruncated value.
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
           .addReg(OpReg).addImm(1);

         // If the true block is the layout successor, jump on the inverted
         // condition to the other block instead.
         unsigned JmpOpc = X86::JNE_1;
         if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
           std::swap(TrueMBB, FalseMBB);
           JmpOpc = X86::JE_1;
         }

         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
           .addMBB(TrueMBB);
         fastEmitBranch(FalseMBB, DbgLoc);
         uint32_t BranchWeight = 0;
         if (FuncInfo.BPI)
           BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
                                                      TrueMBB->getBasicBlock());
         FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
         return true;
       }
     }
   } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
     // Fake request the condition, otherwise the intrinsic might be completely
     // optimized away.
     unsigned TmpReg = getRegForValue(BI->getCondition());
     if (TmpReg == 0)
       return false;

     // CC was filled in by foldX86XALUIntrinsic above.
     unsigned BranchOpc = X86::GetCondBranchFromCond(CC);

     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
       .addMBB(TrueMBB);
     fastEmitBranch(FalseMBB, DbgLoc);
     uint32_t BranchWeight = 0;
     if (FuncInfo.BPI)
       BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
                                                  TrueMBB->getBasicBlock());
     FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
     return true;
   }

   // Otherwise do a clumsy setcc and re-test it.
   // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
   // in an explicit cast, so make sure to handle that correctly.
   unsigned OpReg = getRegForValue(BI->getCondition());
   if (OpReg == 0) return false;

   // Test only the low bit, since the upper bits of the i8 are unspecified.
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
     .addReg(OpReg).addImm(1);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
     .addMBB(TrueMBB);
   fastEmitBranch(FalseMBB, DbgLoc);
   uint32_t BranchWeight = 0;
   if (FuncInfo.BPI)
     BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
                                                TrueMBB->getBasicBlock());
   FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
   return true;
 }
 
 bool X86FastISel::X86SelectShift(const Instruction *I) {
   unsigned CReg = 0, OpReg = 0;
   const TargetRegisterClass *RC = nullptr;
   if (I->getType()->isIntegerTy(8)) {
     CReg = X86::CL;
     RC = &X86::GR8RegClass;
     switch (I->getOpcode()) {
     case Instruction::LShr: OpReg = X86::SHR8rCL; break;
     case Instruction::AShr: OpReg = X86::SAR8rCL; break;
     case Instruction::Shl:  OpReg = X86::SHL8rCL; break;
     default: return false;
     }
   } else if (I->getType()->isIntegerTy(16)) {
     CReg = X86::CX;
     RC = &X86::GR16RegClass;
     switch (I->getOpcode()) {
     case Instruction::LShr: OpReg = X86::SHR16rCL; break;
     case Instruction::AShr: OpReg = X86::SAR16rCL; break;
     case Instruction::Shl:  OpReg = X86::SHL16rCL; break;
     default: return false;
     }
   } else if (I->getType()->isIntegerTy(32)) {
     CReg = X86::ECX;
     RC = &X86::GR32RegClass;
     switch (I->getOpcode()) {
     case Instruction::LShr: OpReg = X86::SHR32rCL; break;
     case Instruction::AShr: OpReg = X86::SAR32rCL; break;
     case Instruction::Shl:  OpReg = X86::SHL32rCL; break;
     default: return false;
     }
   } else if (I->getType()->isIntegerTy(64)) {
     CReg = X86::RCX;
     RC = &X86::GR64RegClass;
     switch (I->getOpcode()) {
     case Instruction::LShr: OpReg = X86::SHR64rCL; break;
     case Instruction::AShr: OpReg = X86::SAR64rCL; break;
     case Instruction::Shl:  OpReg = X86::SHL64rCL; break;
     default: return false;
     }
   } else {
     return false;
   }
 
   MVT VT;
   if (!isTypeLegal(I->getType(), VT))
     return false;
 
   unsigned Op0Reg = getRegForValue(I->getOperand(0));
   if (Op0Reg == 0) return false;
 
   unsigned Op1Reg = getRegForValue(I->getOperand(1));
   if (Op1Reg == 0) return false;
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
           CReg).addReg(Op1Reg);
 
   // The shift instruction uses X86::CL. If we defined a super-register
   // of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
   if (CReg != X86::CL)
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::KILL), X86::CL)
       .addReg(CReg, RegState::Kill);
 
   unsigned ResultReg = createResultReg(RC);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
     .addReg(Op0Reg);
   updateValueMap(I, ResultReg);
   return true;
 }
 
 /// Lower an integer SDiv/SRem/UDiv/URem instruction to an X86 DIV/IDIV
 /// sequence, driven by the per-type/per-operation table defined below.
 ///
 /// Returns false when the result type is not legal, is not one of
 /// i8/i16/i32/i64, is i64 on a non-64-bit subtarget, or when a register cannot
 /// be obtained for either operand. On success, maps \p I to the virtual
 /// register holding the quotient or remainder and returns true.
 bool X86FastISel::X86SelectDivRem(const Instruction *I) {
   const static unsigned NumTypes = 4; // i8, i16, i32, i64
   const static unsigned NumOps   = 4; // SDiv, SRem, UDiv, URem
   const static bool S = true;  // IsSigned
   const static bool U = false; // !IsSigned
   const static unsigned Copy = TargetOpcode::COPY;
   // For the X86 DIV/IDIV instruction, in most cases the dividend
   // (numerator) must be in a specific register pair highreg:lowreg,
   // producing the quotient in lowreg and the remainder in highreg.
   // For most data types, to set up the instruction, the dividend is
   // copied into lowreg, and lowreg is sign-extended or zero-extended
   // into highreg.  The exception is i8, where the dividend is defined
   // as a single register rather than a register pair, and we
   // therefore directly sign-extend or zero-extend the dividend into
   // lowreg, instead of copying, and ignore the highreg.
   const static struct DivRemEntry {
     // The following portion depends only on the data type.
     const TargetRegisterClass *RC;
     unsigned LowInReg;  // low part of the register pair
     unsigned HighInReg; // high part of the register pair
     // The following portion depends on both the data type and the operation.
     struct DivRemResult {
     unsigned OpDivRem;        // The specific DIV/IDIV opcode to use.
     unsigned OpSignExtend;    // Opcode for sign-extending lowreg into
                               // highreg, or copying a zero into highreg.
     unsigned OpCopy;          // Opcode for copying dividend into lowreg, or
                               // zero/sign-extending into lowreg for i8.
     unsigned DivRemResultReg; // Register containing the desired result.
     bool IsOpSigned;          // Whether to use signed or unsigned form.
     } ResultTable[NumOps];
   } OpTable[NumTypes] = {
     // Rows are indexed by TypeIndex, and the entries within each row by
     // OpIndex; both are computed by the switches further down.
     { &X86::GR8RegClass,  X86::AX,  0, {
         { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AL,  S }, // SDiv
         { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AH,  S }, // SRem
         { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AL,  U }, // UDiv
         { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AH,  U }, // URem
       }
     }, // i8
     { &X86::GR16RegClass, X86::AX,  X86::DX, {
         { X86::IDIV16r, X86::CWD,     Copy,            X86::AX,  S }, // SDiv
         { X86::IDIV16r, X86::CWD,     Copy,            X86::DX,  S }, // SRem
         { X86::DIV16r,  X86::MOV32r0, Copy,            X86::AX,  U }, // UDiv
         { X86::DIV16r,  X86::MOV32r0, Copy,            X86::DX,  U }, // URem
       }
     }, // i16
     { &X86::GR32RegClass, X86::EAX, X86::EDX, {
         { X86::IDIV32r, X86::CDQ,     Copy,            X86::EAX, S }, // SDiv
         { X86::IDIV32r, X86::CDQ,     Copy,            X86::EDX, S }, // SRem
         { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EAX, U }, // UDiv
         { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EDX, U }, // URem
       }
     }, // i32
     { &X86::GR64RegClass, X86::RAX, X86::RDX, {
         { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
         { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
         { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
         { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
       }
     }, // i64
   };
 
   MVT VT;
   if (!isTypeLegal(I->getType(), VT))
     return false;
 
   // Select the OpTable row from the value type; any other type is rejected.
   unsigned TypeIndex, OpIndex;
   switch (VT.SimpleTy) {
   default: return false;
   case MVT::i8:  TypeIndex = 0; break;
   case MVT::i16: TypeIndex = 1; break;
   case MVT::i32: TypeIndex = 2; break;
   case MVT::i64: TypeIndex = 3;
     // The i64 row is only used on 64-bit subtargets.
     if (!Subtarget->is64Bit())
       return false;
     break;
   }
 
   // Select the entry within the row from the IR opcode.
   switch (I->getOpcode()) {
   default: llvm_unreachable("Unexpected div/rem opcode");
   case Instruction::SDiv: OpIndex = 0; break;
   case Instruction::SRem: OpIndex = 1; break;
   case Instruction::UDiv: OpIndex = 2; break;
   case Instruction::URem: OpIndex = 3; break;
   }
 
   const DivRemEntry &TypeEntry = OpTable[TypeIndex];
   const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
   // Op0 is the dividend, Op1 the divisor.
   unsigned Op0Reg = getRegForValue(I->getOperand(0));
   if (Op0Reg == 0)
     return false;
   unsigned Op1Reg = getRegForValue(I->getOperand(1));
   if (Op1Reg == 0)
     return false;
 
   // Move op0 into low-order input register.
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
           TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
   // Zero-extend or sign-extend into high-order input register.
   // The i8 rows have OpSignExtend == 0, so this whole step is skipped for i8.
   if (OpEntry.OpSignExtend) {
     if (OpEntry.IsOpSigned)
       // Signed: the table's CWD/CDQ/CQO opcode is emitted with no explicit
       // operands.
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(OpEntry.OpSignExtend));
     else {
       // Unsigned: materialize a 32-bit zero in a fresh virtual register.
       unsigned Zero32 = createResultReg(&X86::GR32RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(X86::MOV32r0), Zero32);
 
       // Copy the zero into the appropriate sub/super/identical physical
       // register. Unfortunately the operations needed are not uniform enough
       // to fit neatly into the table above.
       if (VT.SimpleTy == MVT::i16) {
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(Copy), TypeEntry.HighInReg)
           .addReg(Zero32, 0, X86::sub_16bit);
       } else if (VT.SimpleTy == MVT::i32) {
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(Copy), TypeEntry.HighInReg)
             .addReg(Zero32);
       } else if (VT.SimpleTy == MVT::i64) {
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
             .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
       }
     }
   }
   // Generate the DIV/IDIV instruction.
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
           TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
   // For i8 remainder, we can't reference AH directly, as we'll end
   // up with bogus copies like %R9B = COPY %AH. Reference AX
   // instead to prevent AH references in a REX instruction.
   //
   // The current assumption of the fast register allocator is that isel
   // won't generate explicit references to the GPR8_NOREX registers. If
   // the allocator and/or the backend get enhanced to be more robust in
   // that regard, this can be, and should be, removed.
   unsigned ResultReg = 0;
   if ((I->getOpcode() == Instruction::SRem ||
        I->getOpcode() == Instruction::URem) &&
       OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
     unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
     unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(Copy), SourceSuperReg).addReg(X86::AX);
 
     // Shift AX right by 8 bits instead of using AH.
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
             ResultSuperReg).addReg(SourceSuperReg).addImm(8);
 
     // Now reference the 8-bit subreg of the result.
     ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
                                            /*Kill=*/true, X86::sub_8bit);
   }
   // Copy the result out of the physreg if we haven't already.
   if (!ResultReg) {
     ResultReg = createResultReg(TypeEntry.RC);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
         .addReg(OpEntry.DivRemResultReg);
   }
   updateValueMap(I, ResultReg);
 
   return true;
 }
 
 /// \brief Lower a select to a real conditional move instruction, if the
 /// subtarget supports them.
 ///
 /// Only i16 through i64 results are handled. Returns true after mapping \p I
 /// to the CMOV result; returns false when this lowering does not apply or a
 /// required register or compare could not be produced.
 bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
   // The subtarget must provide CMOV, and the result must lie in [i16, i64].
   // FIXME: Add support for i8.
   if (!Subtarget->hasCMov() || RetVT < MVT::i16 || RetVT > MVT::i64)
     return false;
 
   const Value *Cond = I->getOperand(0);
   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
   X86::CondCode CC = X86::COND_NE;
   // Becomes true once one of the paths below has taken care of the flags, in
   // which case no explicit TEST of the condition register is emitted.
   bool FlagsAreSet = false;
 
   // A compare feeding the select is folded in directly, but only when both
   // instructions are in the same basic block (values defined in other basic
   // blocks may not have initialized registers).
   const auto *Cmp = dyn_cast<CmpInst>(Cond);
   if (Cmp && Cmp->getParent() == I->getParent()) {
     CmpInst::Predicate Pred = optimizeCmpPredicate(Cmp);
 
     // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. Each
     // row lists two SETcc opcodes followed by the opcode combining them.
     static unsigned SETFOpcTable[2][3] = {
       { X86::SETNPr, X86::SETEr , X86::TEST8rr },
       { X86::SETPr,  X86::SETNEr, X86::OR8rr   }
     };
     unsigned *SETFOpc = nullptr;
     if (Pred == CmpInst::FCMP_OEQ) {
       SETFOpc = SETFOpcTable[0];
       Pred = CmpInst::ICMP_NE;
     } else if (Pred == CmpInst::FCMP_UNE) {
       SETFOpc = SETFOpcTable[1];
       Pred = CmpInst::ICMP_NE;
     }
 
     bool SwapOperands;
     std::tie(CC, SwapOperands) = getX86ConditionCode(Pred);
     assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
 
     const Value *CmpLHS = Cmp->getOperand(SwapOperands ? 1 : 0);
     const Value *CmpRHS = Cmp->getOperand(SwapOperands ? 0 : 1);
 
     // Emit a compare of the LHS and RHS, setting the flags.
     EVT CmpVT = TLI.getValueType(CmpLHS->getType());
     if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT))
       return false;
 
     if (SETFOpc) {
       unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
       unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]),
               FlagReg1);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]),
               FlagReg2);
       // The combining opcode may or may not define a register; only allocate
       // a destination when its descriptor reports at least one def.
       const auto &CombineDesc = TII.get(SETFOpc[2]);
       if (CombineDesc.getNumDefs() == 0) {
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, CombineDesc)
           .addReg(FlagReg2).addReg(FlagReg1);
       } else {
         unsigned CombinedReg = createResultReg(&X86::GR8RegClass);
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, CombineDesc,
                 CombinedReg)
           .addReg(FlagReg2).addReg(FlagReg1);
       }
     }
     FlagsAreSet = true;
   } else if (foldX86XALUIntrinsic(CC, I, Cond)) {
     // Fake request the condition, otherwise the intrinsic might be completely
     // optimized away.
     if (getRegForValue(Cond) == 0)
       return false;
 
     FlagsAreSet = true;
   }
 
   if (!FlagsAreSet) {
     // Selects operate on i1, but the condition register is 8 bits wide and
     // only its least significant bit is meaningful; the upper bits may hold
     // garbage. TEST against 1 so that only the lsb decides the select.
     unsigned CondReg = getRegForValue(Cond);
     if (!CondReg)
       return false;
     bool CondIsKill = hasTrivialKill(Cond);
 
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
       .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
   }
 
   const Value *TrueVal = I->getOperand(1);
   const Value *FalseVal = I->getOperand(2);
 
   // Registers are requested for the false value first, then the true value.
   unsigned FalseReg = getRegForValue(FalseVal);
   bool FalseIsKill = hasTrivialKill(FalseVal);
 
   unsigned TrueReg = getRegForValue(TrueVal);
   bool TrueIsKill = hasTrivialKill(TrueVal);
 
   if (!TrueReg || !FalseReg)
     return false;
 
   unsigned CMovOpc = X86::getCMovFromCond(CC, RC->getSize());
   unsigned ResultReg = fastEmitInst_rr(CMovOpc, RC, FalseReg, FalseIsKill,
                                        TrueReg, TrueIsKill);
   updateValueMap(I, ResultReg);
   return true;
 }
 
 /// \brief Emit SSE instructions to lower the select.
 ///
 /// Try to use SSE1/SSE2 instructions to simulate a select without branches.
 /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary
 /// SSE instructions are available.
 ///
 /// Only applies when the condition is an fcmp in the same basic block as the
 /// select, the select type equals the compared type, and that type is f32
 /// (SSE1) or f64 (SSE2). Returns true after mapping \p I to the result
 /// register; returns false if the lowering does not apply or any operand
 /// could not be assigned a register.
 bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
   // Optimize conditions coming from a compare if both instructions are in the
   // same basic block (values defined in other basic blocks may not have
   // initialized registers).
   const auto *CI = dyn_cast<FCmpInst>(I->getOperand(0));
   if (!CI || (CI->getParent() != I->getParent()))
     return false;
 
   if (I->getType() != CI->getOperand(0)->getType() ||
       !((Subtarget->hasSSE1() && RetVT == MVT::f32) ||
         (Subtarget->hasSSE2() && RetVT == MVT::f64)))
     return false;
 
   const Value *CmpLHS = CI->getOperand(0);
   const Value *CmpRHS = CI->getOperand(1);
   CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
 
   // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0.
   // We don't have to materialize a zero constant for this case and can just use
   // %x again on the RHS.
   if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
     const auto *CmpRHSC = dyn_cast<ConstantFP>(CmpRHS);
     if (CmpRHSC && CmpRHSC->isNullValue())
       CmpRHS = CmpLHS;
   }
 
   // Condition codes above 7 are not handled by this lowering.
   unsigned CC;
   bool NeedSwap;
   std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
   if (CC > 7)
     return false;
 
   if (NeedSwap)
     std::swap(CmpLHS, CmpRHS);
 
   // Indexed as [is f64][has AVX][CMP, AND, ANDN, OR].
   static unsigned OpcTable[2][2][4] = {
     { { X86::CMPSSrr,  X86::FsANDPSrr,  X86::FsANDNPSrr,  X86::FsORPSrr  },
       { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr }  },
     { { X86::CMPSDrr,  X86::FsANDPDrr,  X86::FsANDNPDrr,  X86::FsORPDrr  },
       { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr }  }
   };
 
   bool HasAVX = Subtarget->hasAVX();
   unsigned *Opc = nullptr;
   switch (RetVT.SimpleTy) {
   default: return false;
   case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break;
   case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break;
   }
 
   const Value *LHS = I->getOperand(1);
   const Value *RHS = I->getOperand(2);
 
   unsigned LHSReg = getRegForValue(LHS);
   bool LHSIsKill = hasTrivialKill(LHS);
 
   unsigned RHSReg = getRegForValue(RHS);
   bool RHSIsKill = hasTrivialKill(RHS);
 
   unsigned CmpLHSReg = getRegForValue(CmpLHS);
   bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
 
   unsigned CmpRHSReg = getRegForValue(CmpRHS);
   bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
 
   // Every operand needs a valid register. Check the compare operands'
   // registers, not the Value pointers (which are never null here), so that a
   // failed getRegForValue cannot feed register 0 into the CMP below.
   if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
     return false;
 
   // result = (mask & LHS) | (~mask & RHS), where mask is the compare result.
   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
   unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
                                      CmpRHSReg, CmpRHSIsKill, CC);
   // CmpReg is read again by the ANDN below, so it is not killed here.
   unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false,
                                     LHSReg, LHSIsKill);
   unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true,
                                      RHSReg, RHSIsKill);
   unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true,
                                        AndReg, /*IsKill=*/true);
   updateValueMap(I, ResultReg);
   return true;
 }
 
 /// Lower a select to one of the CMOV_* pseudo instructions.
 ///
 /// Handles i8, i16, i32, f32 and f64 results. Returns true after mapping \p I
 /// to the pseudo's result register; returns false for other types, for an
 /// unsupported compare condition, or when a register or compare could not be
 /// produced.
 bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
   // These are pseudo CMOV instructions and will be later expanded into control-
   // flow.
   const MVT::SimpleValueType Ty = RetVT.SimpleTy;
   unsigned PseudoOpc;
   if (Ty == MVT::i8)
     PseudoOpc = X86::CMOV_GR8;
   else if (Ty == MVT::i16)
     PseudoOpc = X86::CMOV_GR16;
   else if (Ty == MVT::i32)
     PseudoOpc = X86::CMOV_GR32;
   else if (Ty == MVT::f32)
     PseudoOpc = X86::CMOV_FR32;
   else if (Ty == MVT::f64)
     PseudoOpc = X86::CMOV_FR64;
   else
     return false;
 
   const Value *Cond = I->getOperand(0);
   X86::CondCode CC = X86::COND_NE;
 
   const auto *Cmp = dyn_cast<CmpInst>(Cond);
   if (!Cmp || Cmp->getParent() != I->getParent()) {
     // No foldable compare: TEST the low bit of the condition register and
     // keep the default COND_NE.
     unsigned CondReg = getRegForValue(Cond);
     if (!CondReg)
       return false;
     bool CondIsKill = hasTrivialKill(Cond);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
       .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1);
   } else {
     // Optimize conditions coming from a compare if both instructions are in
     // the same basic block (values defined in other basic blocks may not have
     // initialized registers).
     bool SwapOperands;
     std::tie(CC, SwapOperands) = getX86ConditionCode(Cmp->getPredicate());
     if (CC > X86::LAST_VALID_COND)
       return false;
 
     const Value *CmpLHS = Cmp->getOperand(SwapOperands ? 1 : 0);
     const Value *CmpRHS = Cmp->getOperand(SwapOperands ? 0 : 1);
 
     EVT CmpVT = TLI.getValueType(CmpLHS->getType());
     if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT))
       return false;
   }
 
   const Value *TrueVal = I->getOperand(1);
   const Value *FalseVal = I->getOperand(2);
 
   // Registers are requested for the true value first, then the false value.
   unsigned TrueReg = getRegForValue(TrueVal);
   bool TrueIsKill = hasTrivialKill(TrueVal);
 
   unsigned FalseReg = getRegForValue(FalseVal);
   bool FalseIsKill = hasTrivialKill(FalseVal);
 
   if (!TrueReg || !FalseReg)
     return false;
 
   const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
 
   unsigned ResultReg = fastEmitInst_rri(PseudoOpc, RC, FalseReg, FalseIsKill,
                                         TrueReg, TrueIsKill, CC);
   updateValueMap(I, ResultReg);
   return true;
 }
 
 bool X86FastISel::X86SelectSelect(const Instruction *I) {
   MVT RetVT;
   if (!isTypeLegal(I->getType(), RetVT))
     return false;
 
   // Check if we can fold the select.
   if (const auto *CI = dyn_cast<CmpInst>(I->getOperand(0))) {
     CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
     const Value *Opnd = nullptr;
     switch (Predicate) {
     default:                              break;
     case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break;
     case CmpInst::FCMP_TRUE:  Opnd = I->getOperand(1); break;
     }
     // No need for a select anymore - this is an unconditional move.
     if (Opnd) {
       unsigned OpReg = getRegForValue(Opnd);
       if (OpReg == 0)
         return false;
       bool OpIsKill = hasTrivialKill(Opnd);
       const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
       unsigned ResultReg = createResultReg(RC);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::COPY), ResultReg)
         .addReg(OpReg, getKillRegState(OpIsKill));
       updateValueMap(I, ResultReg);
       return true;
     }
   }
 
   // First try to use real conditional move instructions.
   if (X86FastEmitCMoveSelect(RetVT, I))
     return true;
 
   // Try to use a sequence of SSE instructions to simulate a conditional move.
   if (X86FastEmitSSESelect(RetVT, I))
     return true;
 
   // Fall-back to pseudo conditional move instructions, which will be later
   // converted to control-flow.
   if (X86FastEmitPseudoSelect(RetVT, I))
     return true;
 
   return false;
 }
 
 bool X86FastISel::X86SelectFPExt(const Instruction *I) {
   // fpext from float to double.
   if (X86ScalarSSEf64 &&
       I->getType()->isDoubleTy()) {
     const Value *V = I->getOperand(0);
     if (V->getType()->isFloatTy()) {
       unsigned OpReg = getRegForValue(V);
       if (OpReg == 0) return false;
       unsigned ResultReg = createResultReg(&X86::FR64RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(X86::CVTSS2SDrr), ResultReg)
         .addReg(OpReg);
       updateValueMap(I, ResultReg);
       return true;
     }
   }
 
   return false;
 }
 
 bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
   if (X86ScalarSSEf64) {
     if (I->getType()->isFloatTy()) {
       const Value *V = I->getOperand(0);
       if (V->getType()->isDoubleTy()) {
         unsigned OpReg = getRegForValue(V);
         if (OpReg == 0) return false;
         unsigned ResultReg = createResultReg(&X86::FR32RegClass);
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(X86::CVTSD2SSrr), ResultReg)
           .addReg(OpReg);
         updateValueMap(I, ResultReg);
         return true;
       }
     }
   }
 
   return false;
 }
 
 bool X86FastISel::X86SelectTrunc(const Instruction *I) {
   EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
   EVT DstVT = TLI.getValueType(I->getType());
 
   // This code only handles truncation to byte.
   if (DstVT != MVT::i8 && DstVT != MVT::i1)
     return false;
   if (!TLI.isTypeLegal(SrcVT))
     return false;
 
   unsigned InputReg = getRegForValue(I->getOperand(0));
   if (!InputReg)
     // Unhandled operand.  Halt "fast" selection and bail.
     return false;
 
   if (SrcVT == MVT::i8) {
     // Truncate from i8 to i1; no code needed.
     updateValueMap(I, InputReg);
     return true;
   }
 
   if (!Subtarget->is64Bit()) {
     // If we're on x86-32; we can't extract an i8 from a general register.
     // First issue a copy to GR16_ABCD or GR32_ABCD.
     const TargetRegisterClass *CopyRC =
       (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass;
     unsigned CopyReg = createResultReg(CopyRC);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg);
     InputReg = CopyReg;
   }
 
   // Issue an extract_subreg.
   unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
                                                   InputReg, /*Kill=*/true,
                                                   X86::sub_8bit);
   if (!ResultReg)
     return false;
 
   updateValueMap(I, ResultReg);
   return true;
 }
 
 /// Return true if a memcpy of \p Len bytes is within the inline limit: 32
 /// bytes on 64-bit subtargets, 16 bytes otherwise.
 bool X86FastISel::IsMemcpySmall(uint64_t Len) {
   if (Subtarget->is64Bit())
     return Len <= 32;
   return Len <= 16;
 }
 
 /// Expand a memcpy of \p Len bytes from \p SrcAM to \p DestAM inline as a
 /// sequence of integer loads and stores.
 ///
 /// Each step uses the widest of i64 (64-bit subtargets only), i32, i16 or i8
 /// that fits in the remaining length, and both address displacements advance
 /// by the number of bytes copied. Returns false, emitting nothing, when
 /// \p Len exceeds the IsMemcpySmall() limit. Also returns false if a load or
 /// store could not be emitted; that case asserts in builds with assertions
 /// enabled.
 bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
                                      X86AddressMode SrcAM, uint64_t Len) {
 
   // Make sure we don't bloat code by inlining very large memcpy's.
   if (!IsMemcpySmall(Len))
     return false;
 
   bool i64Legal = Subtarget->is64Bit();
 
   // We don't care about alignment here since we just emit integer accesses.
   while (Len) {
     MVT VT;
     if (Len >= 8 && i64Legal)
       VT = MVT::i64;
     else if (Len >= 4)
       VT = MVT::i32;
     else if (Len >= 2)
       VT = MVT::i16;
     else
       VT = MVT::i8;
 
     // Initialize Reg and only emit the store if the load succeeded, so a
     // failed load never feeds an indeterminate register number to the store.
     unsigned Reg = 0;
     bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg);
     RV = RV && X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM);
     assert(RV && "Failed to emit load or store??");
     // In builds without assertions, report the failure to the caller instead
     // of silently continuing (this also keeps RV used under NDEBUG).
     if (!RV)
       return false;
 
     unsigned Size = VT.getSizeInBits()/8;
     Len -= Size;
     DestAM.Disp += Size;
     SrcAM.Disp += Size;
   }
 
   return true;
 }
 
 bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
   // FIXME: Handle more intrinsics.
   switch (II->getIntrinsicID()) {
   default: return false;
   case Intrinsic::frameaddress: {
     Type *RetTy = II->getCalledFunction()->getReturnType();
 
     MVT VT;
     if (!isTypeLegal(RetTy, VT))
       return false;
 
     unsigned Opc;
     const TargetRegisterClass *RC = nullptr;
 
     switch (VT.SimpleTy) {
     default: llvm_unreachable("Invalid result type for frameaddress.");
     case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break;
     case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break;
     }
 
     // This needs to be set before we call getPtrSizedFrameRegister, otherwise
     // we get the wrong frame register.
     MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo();
     MFI->setFrameAddressIsTaken(true);
 
     const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
         TM.getSubtargetImpl()->getRegisterInfo());
     unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*(FuncInfo.MF));
     assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
             (FrameReg == X86::EBP && VT == MVT::i32)) &&
            "Invalid Frame Register!");
 
     // Always make a copy of the frame register to a vreg first, so that we
     // never directly reference the frame register (the TwoAddressInstruction-
     // Pass doesn't like that).
     unsigned SrcReg = createResultReg(RC);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
 
     // Now recursively load from the frame address.
     // movq (%rbp), %rax
     // movq (%rax), %rax
     // movq (%rax), %rax
     // ...
     unsigned DestReg;
     unsigned Depth = cast<ConstantInt>(II->getOperand(0))->getZExtValue();
     while (Depth--) {
       DestReg = createResultReg(RC);
       addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                            TII.get(Opc), DestReg), SrcReg);
       SrcReg = DestReg;
     }
 
     updateValueMap(II, SrcReg);
     return true;
   }
   case Intrinsic::memcpy: {
     const MemCpyInst *MCI = cast<MemCpyInst>(II);
     // Don't handle volatile or variable length memcpys.
     if (MCI->isVolatile())
       return false;
 
     if (isa<ConstantInt>(MCI->getLength())) {
       // Small memcpy's are common enough that we want to do them
       // without a call if possible.
       uint64_t Len = cast<ConstantInt>(MCI->getLength())->getZExtValue();
       if (IsMemcpySmall(Len)) {
         X86AddressMode DestAM, SrcAM;
         if (!X86SelectAddress(MCI->getRawDest(), DestAM) ||
             !X86SelectAddress(MCI->getRawSource(), SrcAM))
           return false;
         TryEmitSmallMemcpy(DestAM, SrcAM, Len);
         return true;
       }
     }
 
     unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
     if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth))
       return false;
 
     if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255)
       return false;
 
     return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2);
   }
   case Intrinsic::memset: {
     const MemSetInst *MSI = cast<MemSetInst>(II);
 
     if (MSI->isVolatile())
       return false;
 
     unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32;
     if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth))
       return false;
 
     if (MSI->getDestAddressSpace() > 255)
       return false;
 
     return lowerCallTo(II, "memset", II->getNumArgOperands() - 2);
   }
   case Intrinsic::stackprotector: {
     // Emit code to store the stack guard onto the stack.
     EVT PtrTy = TLI.getPointerTy();
 
     const Value *Op1 = II->getArgOperand(0); // The guard's value.
     const AllocaInst *Slot = cast<AllocaInst>(II->getArgOperand(1));
 
     MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]);
 
     // Grab the frame index.
     X86AddressMode AM;
     if (!X86SelectAddress(Slot, AM)) return false;
     if (!X86FastEmitStore(PtrTy, Op1, AM)) return false;
     return true;
   }
   case Intrinsic::dbg_declare: {
     const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
     X86AddressMode AM;
     assert(DI->getAddress() && "Null address should be checked earlier!");
     if (!X86SelectAddress(DI->getAddress(), AM))
       return false;
     const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
     // FIXME may need to add RegState::Debug to any registers produced,
     // although ESP/EBP should be the only ones at the moment.
     addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM)
         .addImm(0)
         .addMetadata(DI->getVariable())
         .addMetadata(DI->getExpression());
     return true;
   }
   case Intrinsic::trap: {
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP));
     return true;
   }
   case Intrinsic::sqrt: {
     if (!Subtarget->hasSSE1())
       return false;
 
     Type *RetTy = II->getCalledFunction()->getReturnType();
 
     MVT VT;
     if (!isTypeLegal(RetTy, VT))
       return false;
 
     // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT
     // is not generated by FastISel yet.
     // FIXME: Update this code once tablegen can handle it.
     static const unsigned SqrtOpc[2][2] = {
       {X86::SQRTSSr, X86::VSQRTSSr},
       {X86::SQRTSDr, X86::VSQRTSDr}
     };
     bool HasAVX = Subtarget->hasAVX();
     unsigned Opc;
     const TargetRegisterClass *RC;
     switch (VT.SimpleTy) {
     default: return false;
     case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break;
     case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break;
     }
 
     const Value *SrcVal = II->getArgOperand(0);
     unsigned SrcReg = getRegForValue(SrcVal);
 
     if (SrcReg == 0)
       return false;
 
     unsigned ImplicitDefReg = 0;
     if (HasAVX) {
       ImplicitDefReg = createResultReg(RC);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
     }
 
     unsigned ResultReg = createResultReg(RC);
     MachineInstrBuilder MIB;
     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
                   ResultReg);
 
     if (ImplicitDefReg)
       MIB.addReg(ImplicitDefReg);
 
     MIB.addReg(SrcReg);
 
     updateValueMap(II, ResultReg);
     return true;
   }
   case Intrinsic::sadd_with_overflow:
   case Intrinsic::uadd_with_overflow:
   case Intrinsic::ssub_with_overflow:
   case Intrinsic::usub_with_overflow:
   case Intrinsic::smul_with_overflow:
   case Intrinsic::umul_with_overflow: {
     // This implements the basic lowering of the xalu with overflow intrinsics
     // into add/sub/mul followed by either seto or setb.
     const Function *Callee = II->getCalledFunction();
     auto *Ty = cast<StructType>(Callee->getReturnType());
     Type *RetTy = Ty->getTypeAtIndex(0U);
     Type *CondTy = Ty->getTypeAtIndex(1);
 
     MVT VT;
     if (!isTypeLegal(RetTy, VT))
       return false;
 
     if (VT < MVT::i8 || VT > MVT::i64)
       return false;
 
     const Value *LHS = II->getArgOperand(0);
     const Value *RHS = II->getArgOperand(1);
 
     // Canonicalize immediate to the RHS.
     if (isa<ConstantInt>(LHS) && !isa<ConstantInt>(RHS) &&
         isCommutativeIntrinsic(II))
       std::swap(LHS, RHS);
 
     bool UseIncDec = false;
     if (isa<ConstantInt>(RHS) && cast<ConstantInt>(RHS)->isOne())
       UseIncDec = true;
 
     unsigned BaseOpc, CondOpc;
     switch (II->getIntrinsicID()) {
     default: llvm_unreachable("Unexpected intrinsic!");
     case Intrinsic::sadd_with_overflow:
       BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD);
       CondOpc = X86::SETOr;
       break;
     case Intrinsic::uadd_with_overflow:
       BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break;
     case Intrinsic::ssub_with_overflow:
       BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB);
       CondOpc = X86::SETOr;
       break;
     case Intrinsic::usub_with_overflow:
       BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break;
     case Intrinsic::smul_with_overflow:
       BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break;
     case Intrinsic::umul_with_overflow:
       BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break;
     }
 
     unsigned LHSReg = getRegForValue(LHS);
     if (LHSReg == 0)
       return false;
     bool LHSIsKill = hasTrivialKill(LHS);
 
     unsigned ResultReg = 0;
     // Check if we have an immediate version.
     if (const auto *CI = dyn_cast<ConstantInt>(RHS)) {
       static const unsigned Opc[2][4] = {
         { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r },
         { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r }
       };
 
       if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) {
         ResultReg = createResultReg(TLI.getRegClassFor(VT));
         bool IsDec = BaseOpc == X86ISD::DEC;
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                 TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg)
           .addReg(LHSReg, getKillRegState(LHSIsKill));
       } else
         ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill,
                                 CI->getZExtValue());
     }
 
     unsigned RHSReg;
     bool RHSIsKill;
     if (!ResultReg) {
       RHSReg = getRegForValue(RHS);
       if (RHSReg == 0)
         return false;
       RHSIsKill = hasTrivialKill(RHS);
       ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg,
                               RHSIsKill);
     }
 
     // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit
     // it manually.
     if (BaseOpc == X86ISD::UMUL && !ResultReg) {
       static const unsigned MULOpc[] =
         { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r };
       static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX };
       // First copy the first operand into RAX, which is an implicit input to
       // the X86::MUL*r instruction.
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8])
         .addReg(LHSReg, getKillRegState(LHSIsKill));
       ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8],
                                  TLI.getRegClassFor(VT), RHSReg, RHSIsKill);
     } else if (BaseOpc == X86ISD::SMUL && !ResultReg) {
       static const unsigned MULOpc[] =
         { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr };
       if (VT == MVT::i8) {
         // Copy the first operand into AL, which is an implicit input to the
         // X86::IMUL8r instruction.
         BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                TII.get(TargetOpcode::COPY), X86::AL)
           .addReg(LHSReg, getKillRegState(LHSIsKill));
         ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg,
                                    RHSIsKill);
       } else
         ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8],
                                     TLI.getRegClassFor(VT), LHSReg, LHSIsKill,
                                     RHSReg, RHSIsKill);
     }
 
     if (!ResultReg)
       return false;
 
     unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy);
     assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc),
             ResultReg2);
 
     updateValueMap(II, ResultReg, 2);
     return true;
   }
   case Intrinsic::x86_sse_cvttss2si:
   case Intrinsic::x86_sse_cvttss2si64:
   case Intrinsic::x86_sse2_cvttsd2si:
   case Intrinsic::x86_sse2_cvttsd2si64: {
     bool IsInputDouble;
     switch (II->getIntrinsicID()) {
     default: llvm_unreachable("Unexpected intrinsic.");
     case Intrinsic::x86_sse_cvttss2si:
     case Intrinsic::x86_sse_cvttss2si64:
       if (!Subtarget->hasSSE1())
         return false;
       IsInputDouble = false;
       break;
     case Intrinsic::x86_sse2_cvttsd2si:
     case Intrinsic::x86_sse2_cvttsd2si64:
       if (!Subtarget->hasSSE2())
         return false;
       IsInputDouble = true;
       break;
     }
 
     Type *RetTy = II->getCalledFunction()->getReturnType();
     MVT VT;
     if (!isTypeLegal(RetTy, VT))
       return false;
 
     static const unsigned CvtOpc[2][2][2] = {
       { { X86::CVTTSS2SIrr,   X86::VCVTTSS2SIrr   },
         { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr }  },
       { { X86::CVTTSD2SIrr,   X86::VCVTTSD2SIrr   },
         { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr }  }
     };
     bool HasAVX = Subtarget->hasAVX();
     unsigned Opc;
     switch (VT.SimpleTy) {
     default: llvm_unreachable("Unexpected result type.");
     case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break;
     case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break;
     }
 
     // Check if we can fold insertelement instructions into the convert.
     const Value *Op = II->getArgOperand(0);
     while (auto *IE = dyn_cast<InsertElementInst>(Op)) {
       const Value *Index = IE->getOperand(2);
       if (!isa<ConstantInt>(Index))
         break;
       unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
 
       if (Idx == 0) {
         Op = IE->getOperand(1);
         break;
       }
       Op = IE->getOperand(0);
     }
 
     unsigned Reg = getRegForValue(Op);
     if (Reg == 0)
       return false;
 
     unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
       .addReg(Reg);
 
     updateValueMap(II, ResultReg);
     return true;
   }
   }
 }
 
 bool X86FastISel::fastLowerArguments() {
   if (!FuncInfo.CanLowerReturn)
     return false;
 
   const Function *F = FuncInfo.Fn;
   if (F->isVarArg())
     return false;
 
   CallingConv::ID CC = F->getCallingConv();
   if (CC != CallingConv::C)
     return false;
 
   if (Subtarget->isCallingConvWin64(CC))
     return false;
 
   if (!Subtarget->is64Bit())
     return false;
 
   // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments.
   unsigned GPRCnt = 0;
   unsigned FPRCnt = 0;
   unsigned Idx = 0;
   for (auto const &Arg : F->args()) {
     // The first argument is at index 1.
     ++Idx;
     if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
         F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
         F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
         F->getAttributes().hasAttribute(Idx, Attribute::Nest))
       return false;
 
     Type *ArgTy = Arg.getType();
     if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
       return false;
 
     EVT ArgVT = TLI.getValueType(ArgTy);
     if (!ArgVT.isSimple()) return false;
     switch (ArgVT.getSimpleVT().SimpleTy) {
     default: return false;
     case MVT::i32:
     case MVT::i64:
       ++GPRCnt;
       break;
     case MVT::f32:
     case MVT::f64:
       if (!Subtarget->hasSSE1())
         return false;
       ++FPRCnt;
       break;
     }
 
     if (GPRCnt > 6)
       return false;
 
     if (FPRCnt > 8)
       return false;
   }
 
   static const MCPhysReg GPR32ArgRegs[] = {
     X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
   };
   static const MCPhysReg GPR64ArgRegs[] = {
     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
   };
   static const MCPhysReg XMMArgRegs[] = {
     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
   };
 
   unsigned GPRIdx = 0;
   unsigned FPRIdx = 0;
   for (auto const &Arg : F->args()) {
     MVT VT = TLI.getSimpleValueType(Arg.getType());
     const TargetRegisterClass *RC = TLI.getRegClassFor(VT);
     unsigned SrcReg;
     switch (VT.SimpleTy) {
     default: llvm_unreachable("Unexpected value type.");
     case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break;
     case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break;
     case MVT::f32: // fall-through
     case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
     }
     unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
     // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
     // Without this, EmitLiveInCopies may eliminate the livein if its only
     // use is a bitcast (which isn't turned into an instruction).
     unsigned ResultReg = createResultReg(RC);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), ResultReg)
       .addReg(DstReg, getKillRegState(true));
     updateValueMap(&Arg, ResultReg);
   }
   return true;
 }
 
 /// Compute the byte count passed as the second immediate of the call frame
 /// destroy pseudo. The result is 0 on 64-bit targets, on MSVCRT targets, for
 /// the Fast/GHC/HiPE calling conventions, and for a known call site whose
 /// parameter 1 is not 'sret' or is 'inreg'; otherwise it is 4.
 static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget,
                                            CallingConv::ID CC,
                                            ImmutableCallSite *CS) {
   if (Subtarget->is64Bit() || Subtarget->getTargetTriple().isOSMSVCRT())
     return 0;
 
   const bool IsExemptCC = CC == CallingConv::Fast ||
                           CC == CallingConv::GHC ||
                           CC == CallingConv::HiPE;
   if (IsExemptCC)
     return 0;
 
   // With a call site available, only a non-inreg 'sret' parameter 1 keeps
   // the 4-byte result. Without a call site the 4-byte result is returned.
   if (CS) {
     if (!CS->paramHasAttr(1, Attribute::StructRet))
       return 0;
     if (CS->paramHasAttr(1, Attribute::InReg))
       return 0;
   }
   return 4;
 }
 
 /// Lower the call described by \p CLI directly to machine instructions:
 /// materialize the outgoing arguments, emit CALLSEQ_START, copy/store the
 /// arguments to their assigned locations, emit the call itself, emit
 /// CALLSEQ_END and finally copy the results out of their physical registers.
 ///
 /// Returns false whenever some aspect of the call is not handled here. On
 /// success, CLI.OutRegs and CLI.InRegs have been appended to and
 /// CLI.ResultReg, CLI.NumResultRegs and CLI.Call have been filled in.
 ///
 /// NOTE(review): several of the 'return false' paths are reached after
 /// instructions have already been emitted (e.g. after CALLSEQ_START);
 /// presumably the caller discards the partially emitted code - confirm.
 bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
   auto &OutVals       = CLI.OutVals;
   auto &OutFlags      = CLI.OutFlags;
   auto &OutRegs       = CLI.OutRegs;
   auto &Ins           = CLI.Ins;
   auto &InRegs        = CLI.InRegs;
   CallingConv::ID CC  = CLI.CallConv;
   bool &IsTailCall    = CLI.IsTailCall;
   bool IsVarArg       = CLI.IsVarArg;
   const Value *Callee = CLI.Callee;
   const char *SymName = CLI.SymName;
 
   bool Is64Bit        = Subtarget->is64Bit();
   bool IsWin64        = Subtarget->isCallingConvWin64(CC);
 
   // Handle only the C, fastcc, webkit_js, x86_fastcall, x86_64_win64 and
   // x86_64_sysv calling conventions for now.
   switch (CC) {
   default: return false;
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::WebKit_JS:
   case CallingConv::X86_FastCall:
   case CallingConv::X86_64_Win64:
   case CallingConv::X86_64_SysV:
     break;
   }
 
   // Allow SelectionDAG isel to handle tail calls.
   if (IsTailCall)
     return false;
 
   // fastcc with -tailcallopt is intended to provide a guaranteed
   // tail call optimization. Fastisel doesn't know how to do that.
   if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt)
     return false;
 
   // Don't know how to handle Win64 varargs yet.  Nothing special needed for
   // x86-32. Special handling for x86-64 is implemented.
   if (IsVarArg && IsWin64)
     return false;
 
   // Don't know about inalloca yet.
   if (CLI.CS && CLI.CS->hasInAllocaArgument())
     return false;
 
   // Fast-isel doesn't know about callee-pop yet.
   if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
                        TM.Options.GuaranteedTailCallOpt))
     return false;
 
   // Per-argument value type and virtual register, indexed like OutVals.
   SmallVector<MVT, 16> OutVTs;
   SmallVector<unsigned, 16> ArgRegs;
 
   // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra
   // instruction. This is safe because it is common to all FastISel supported
   // calling conventions on x86.
   for (int i = 0, e = OutVals.size(); i != e; ++i) {
     // Val aliases the OutVals entry, so a promoted constant replaces the
     // original value in CLI.OutVals as well.
     Value *&Val = OutVals[i];
     ISD::ArgFlagsTy Flags = OutFlags[i];
     if (auto *CI = dyn_cast<ConstantInt>(Val)) {
       if (CI->getBitWidth() < 32) {
         if (Flags.isSExt())
           Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext()));
         else
           Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext()));
       }
     }
 
     // Passing bools around ends up doing a trunc to i1 and passing it.
     // Codegen this as an argument + "and 1".
     MVT VT;
     auto *TI = dyn_cast<TruncInst>(Val);
     unsigned ResultReg;
     // Only fold the trunc when it lives in the same block as the call and the
     // call is its single use.
     if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
               (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
               TI->hasOneUse()) {
       Value *PrevVal = TI->getOperand(0);
       ResultReg = getRegForValue(PrevVal);
 
       if (!ResultReg)
         return false;
 
       if (!isTypeLegal(PrevVal->getType(), VT))
         return false;
 
       ResultReg =
         fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
     } else {
       if (!isTypeLegal(Val->getType(), VT))
         return false;
       ResultReg = getRegForValue(Val);
     }
 
     if (!ResultReg)
       return false;
 
     ArgRegs.push_back(ResultReg);
     OutVTs.push_back(VT);
   }
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext());
 
   // Allocate shadow area for Win64
   if (IsWin64)
     CCInfo.AllocateStack(32, 8);
 
   CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
 
   // Issue CALLSEQ_START
   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
-    .addImm(NumBytes);
+    .addImm(NumBytes).addImm(0);
 
   // Walk the register/memloc assignments, inserting copies/loads.
   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
       TM.getSubtargetImpl()->getRegisterInfo());
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign const &VA = ArgLocs[i];
     const Value *ArgVal = OutVals[VA.getValNo()];
     MVT ArgVT = OutVTs[VA.getValNo()];
 
     if (ArgVT == MVT::x86mmx)
       return false;
 
     unsigned ArgReg = ArgRegs[VA.getValNo()];
 
     // Promote the value if needed.
     switch (VA.getLocInfo()) {
     case CCValAssign::Full: break;
     case CCValAssign::SExt: {
       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
              "Unexpected extend");
       bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
                                        ArgVT, ArgReg);
       assert(Emitted && "Failed to emit a sext!"); (void)Emitted;
       ArgVT = VA.getLocVT();
       break;
     }
     case CCValAssign::ZExt: {
       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
              "Unexpected extend");
       bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
                                        ArgVT, ArgReg);
       assert(Emitted && "Failed to emit a zext!"); (void)Emitted;
       ArgVT = VA.getLocVT();
       break;
     }
     case CCValAssign::AExt: {
       assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() &&
              "Unexpected extend");
       // Try an any-extend first, then fall back to zero- and sign-extension.
       bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg,
                                        ArgVT, ArgReg);
       if (!Emitted)
         Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg,
                                     ArgVT, ArgReg);
       if (!Emitted)
         Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg,
                                     ArgVT, ArgReg);
 
       assert(Emitted && "Failed to emit a aext!"); (void)Emitted;
       ArgVT = VA.getLocVT();
       break;
     }
     case CCValAssign::BCvt: {
       ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg,
                           /*TODO: Kill=*/false);
       assert(ArgReg && "Failed to emit a bitcast!");
       ArgVT = VA.getLocVT();
       break;
     }
     case CCValAssign::VExt:
       // VExt has not been implemented, so this should be impossible to reach
       // for now.  However, fallback to Selection DAG isel once implemented.
       return false;
     case CCValAssign::AExtUpper:
     case CCValAssign::SExtUpper:
     case CCValAssign::ZExtUpper:
     case CCValAssign::FPExt:
       llvm_unreachable("Unexpected loc info!");
     case CCValAssign::Indirect:
       // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully
       // support this.
       return false;
     }
 
     if (VA.isRegLoc()) {
       // Register argument: copy into the assigned physical register and
       // remember it so it can be added as an implicit use of the call.
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
       OutRegs.push_back(VA.getLocReg());
     } else {
       assert(VA.isMemLoc());
 
       // Don't emit stores for undef values.
       if (isa<UndefValue>(ArgVal))
         continue;
 
       // Stack argument: address it as [stack register + LocMemOffset].
       unsigned LocMemOffset = VA.getLocMemOffset();
       X86AddressMode AM;
       AM.Base.Reg = RegInfo->getStackRegister();
       AM.Disp = LocMemOffset;
       ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
       unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
       MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
         MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore,
         ArgVT.getStoreSize(), Alignment);
       if (Flags.isByVal()) {
         // byval: ArgReg holds the source address; copy the pointee.
         X86AddressMode SrcAM;
         SrcAM.Base.Reg = ArgReg;
         if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize()))
           return false;
       } else if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal)) {
         // If this is a really simple value, emit this with the Value* version
         // of X86FastEmitStore.  If it isn't simple, we don't want to do this,
         // as it can cause us to reevaluate the argument.
         if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO))
           return false;
       } else {
         bool ValIsKill = hasTrivialKill(ArgVal);
         if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO))
           return false;
       }
     }
   }
 
   // ELF / PIC requires GOT in the EBX register before function calls via PLT
   // GOT pointer.
   if (Subtarget->isPICStyleGOT()) {
     unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base);
   }
 
   if (Is64Bit && IsVarArg && !IsWin64) {
     // From AMD64 ABI document:
     // For calls that may call functions that use varargs or stdargs
     // (prototype-less calls or calls to functions containing ellipsis (...) in
     // the declaration) %al is used as hidden argument to specify the number
     // of SSE registers used. The contents of %al do not need to match exactly
     // the number of registers, but must be an ubound on the number of SSE
     // registers used and is in the range 0 - 8 inclusive.
 
     // Count the number of XMM registers allocated.
     static const MCPhysReg XMMArgRegs[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
     };
     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
     assert((Subtarget->hasSSE1() || !NumXMMRegs)
            && "SSE registers cannot be used when SSE is disabled");
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri),
             X86::AL).addImm(NumXMMRegs);
   }
 
   // Materialize callee address in a register. FIXME: GV address can be
   // handled with a CALLpcrel32 instead.
   X86AddressMode CalleeAM;
   if (!X86SelectCallAddress(Callee, CalleeAM))
     return false;
 
   // The callee is either a global (direct call) or a base register
   // (register-indirect call); anything else is not handled.
   unsigned CalleeOp = 0;
   const GlobalValue *GV = nullptr;
   if (CalleeAM.GV != nullptr) {
     GV = CalleeAM.GV;
   } else if (CalleeAM.Base.Reg != 0) {
     CalleeOp = CalleeAM.Base.Reg;
   } else
     return false;
 
   // Issue the call.
   MachineInstrBuilder MIB;
   if (CalleeOp) {
     // Register-indirect call.
     unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r;
     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc))
       .addReg(CalleeOp);
   } else {
     // Direct call.
     assert(GV && "Not a direct call");
     unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
 
     // See if we need any target-specific flags on the GV operand.
     unsigned char OpFlags = 0;
 
     // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
     // external symbols must go through the PLT in PIC mode.  If the symbol
     // has hidden or protected visibility, or if it is static or local, then
     // we don't need to use the PLT - we can directly call it.
     if (Subtarget->isTargetELF() &&
         TM.getRelocationModel() == Reloc::PIC_ &&
         GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
       OpFlags = X86II::MO_PLT;
     } else if (Subtarget->isPICStyleStubAny() &&
                (GV->isDeclaration() || GV->isWeakForLinker()) &&
                (!Subtarget->getTargetTriple().isMacOSX() ||
                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
       // PC-relative references to external symbols should go through $stub,
       // unless we're building with the leopard linker or later, which
       // automatically synthesizes these stubs.
       OpFlags = X86II::MO_DARWIN_STUB;
     }
 
     // A symbol name supplied by the caller takes precedence over the GV.
     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
     if (SymName)
       MIB.addExternalSymbol(SymName, OpFlags);
     else
       MIB.addGlobalAddress(GV, 0, OpFlags);
   }
 
   // Add a register mask operand representing the call-preserved registers.
   // Proper defs for return values will be added by setPhysRegsDeadExcept().
   MIB.addRegMask(TRI.getCallPreservedMask(CC));
 
   // Add an implicit use GOT pointer in EBX.
   if (Subtarget->isPICStyleGOT())
     MIB.addReg(X86::EBX, RegState::Implicit);
 
   // AL was set above to the number of XMM argument registers used.
   if (Is64Bit && IsVarArg && !IsWin64)
     MIB.addReg(X86::AL, RegState::Implicit);
 
   // Add implicit physical register uses to the call.
   for (auto Reg : OutRegs)
     MIB.addReg(Reg, RegState::Implicit);
 
   // Issue CALLSEQ_END
   unsigned NumBytesForCalleeToPop =
     computeBytesPoppedByCallee(Subtarget, CC, CLI.CS);
   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
     .addImm(NumBytes).addImm(NumBytesForCalleeToPop);
 
   // Now handle call return values.
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs,
                     CLI.RetTy->getContext());
   CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
 
   // Copy all of the result registers out of their specified physreg.
   // Result i is placed in virtual register ResultReg + i.
   unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
     CCValAssign &VA = RVLocs[i];
     EVT CopyVT = VA.getValVT();
     unsigned CopyReg = ResultReg + i;
 
     // If this is x86-64, and we disabled SSE, we can't return FP values
     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
       report_fatal_error("SSE register return with SSE disabled");
     }
 
     // If we prefer to use the value in xmm registers, copy it out as f80 and
     // use a truncate to move it from fp stack reg to xmm reg.
     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
         isScalarFPTypeInSSEReg(VA.getValVT())) {
       CopyVT = MVT::f80;
       CopyReg = createResultReg(&X86::RFP80RegClass);
     }
 
     // Copy out the result.
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
     InRegs.push_back(VA.getLocReg());
 
     // Round the f80 to the right size, which also moves it to the appropriate
     // xmm register. This is accomplished by storing the f80 value in memory
     // and then loading it back.
     if (CopyVT != VA.getValVT()) {
       EVT ResVT = VA.getValVT();
       unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
       unsigned MemSize = ResVT.getSizeInBits()/8;
       int FI = MFI.CreateStackObject(MemSize, MemSize, false);
       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                 TII.get(Opc)), FI)
         .addReg(CopyReg);
       Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
       addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                 TII.get(Opc), ResultReg + i), FI);
     }
   }
 
   // Report the results of the lowering back through CLI.
   CLI.ResultReg = ResultReg;
   CLI.NumResultRegs = RVLocs.size();
   CLI.Call = MIB;
 
   return true;
 }
 
 bool
 X86FastISel::fastSelectInstruction(const Instruction *I)  {
   switch (I->getOpcode()) {
   default: break;
   case Instruction::Load:
     return X86SelectLoad(I);
   case Instruction::Store:
     return X86SelectStore(I);
   case Instruction::Ret:
     return X86SelectRet(I);
   case Instruction::ICmp:
   case Instruction::FCmp:
     return X86SelectCmp(I);
   case Instruction::ZExt:
     return X86SelectZExt(I);
   case Instruction::Br:
     return X86SelectBranch(I);
   case Instruction::LShr:
   case Instruction::AShr:
   case Instruction::Shl:
     return X86SelectShift(I);
   case Instruction::SDiv:
   case Instruction::UDiv:
   case Instruction::SRem:
   case Instruction::URem:
     return X86SelectDivRem(I);
   case Instruction::Select:
     return X86SelectSelect(I);
   case Instruction::Trunc:
     return X86SelectTrunc(I);
   case Instruction::FPExt:
     return X86SelectFPExt(I);
   case Instruction::FPTrunc:
     return X86SelectFPTrunc(I);
   case Instruction::IntToPtr: // Deliberate fall-through.
   case Instruction::PtrToInt: {
     EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
     EVT DstVT = TLI.getValueType(I->getType());
     if (DstVT.bitsGT(SrcVT))
       return X86SelectZExt(I);
     if (DstVT.bitsLT(SrcVT))
       return X86SelectTrunc(I);
     unsigned Reg = getRegForValue(I->getOperand(0));
     if (Reg == 0) return false;
     updateValueMap(I, Reg);
     return true;
   }
   }
 
   return false;
 }
 
 unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
   if (VT > MVT::i64)
     return 0;
 
   uint64_t Imm = CI->getZExtValue();
   if (Imm == 0) {
     unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
     switch (VT.SimpleTy) {
     default: llvm_unreachable("Unexpected value type");
     case MVT::i1:
     case MVT::i8:
       return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
                                         X86::sub_8bit);
     case MVT::i16:
       return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true,
                                         X86::sub_16bit);
     case MVT::i32:
       return SrcReg;
     case MVT::i64: {
       unsigned ResultReg = createResultReg(&X86::GR64RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
         .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
       return ResultReg;
     }
     }
   }
 
   unsigned Opc = 0;
   switch (VT.SimpleTy) {
   default: llvm_unreachable("Unexpected value type");
   case MVT::i1:  VT = MVT::i8; // fall-through
   case MVT::i8:  Opc = X86::MOV8ri;  break;
   case MVT::i16: Opc = X86::MOV16ri; break;
   case MVT::i32: Opc = X86::MOV32ri; break;
   case MVT::i64: {
     if (isUInt<32>(Imm))
       Opc = X86::MOV32ri;
     else if (isInt<32>(Imm))
       Opc = X86::MOV64ri32;
     else
       Opc = X86::MOV64ri;
     break;
   }
   }
   if (VT == MVT::i64 && Opc == X86::MOV32ri) {
     unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm);
     unsigned ResultReg = createResultReg(&X86::GR64RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
       .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
     return ResultReg;
   }
   return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm);
 }
 
 unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
   if (CFP->isNullValue())
     return fastMaterializeFloatZero(CFP);
 
   // Can't handle alternate code models yet.
   CodeModel::Model CM = TM.getCodeModel();
   if (CM != CodeModel::Small && CM != CodeModel::Large)
     return 0;
 
   // Get opcode and regclass of the output for the given load instruction.
   unsigned Opc = 0;
   const TargetRegisterClass *RC = nullptr;
   switch (VT.SimpleTy) {
   default: return 0;
   case MVT::f32:
     if (X86ScalarSSEf32) {
       Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
       RC  = &X86::FR32RegClass;
     } else {
       Opc = X86::LD_Fp32m;
       RC  = &X86::RFP32RegClass;
     }
     break;
   case MVT::f64:
     if (X86ScalarSSEf64) {
       Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
       RC  = &X86::FR64RegClass;
     } else {
       Opc = X86::LD_Fp64m;
       RC  = &X86::RFP64RegClass;
     }
     break;
   case MVT::f80:
     // No f80 support yet.
     return 0;
   }
 
   // MachineConstantPool wants an explicit alignment.
   unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
   if (Align == 0) {
     // Alignment of vector types. FIXME!
     Align = DL.getTypeAllocSize(CFP->getType());
   }
 
   // x86-32 PIC requires a PIC base register for constant pools.
   unsigned PICBase = 0;
   unsigned char OpFlag = 0;
   if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
     OpFlag = X86II::MO_PIC_BASE_OFFSET;
     PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
   } else if (Subtarget->isPICStyleGOT()) {
     OpFlag = X86II::MO_GOTOFF;
     PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
   } else if (Subtarget->isPICStyleRIPRel() &&
              TM.getCodeModel() == CodeModel::Small) {
     PICBase = X86::RIP;
   }
 
   // Create the load from the constant pool.
   unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
   unsigned ResultReg = createResultReg(RC);
 
   if (CM == CodeModel::Large) {
     unsigned AddrReg = createResultReg(&X86::GR64RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
             AddrReg)
       .addConstantPoolIndex(CPI, 0, OpFlag);
     MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                       TII.get(Opc), ResultReg);
     addDirectMem(MIB, AddrReg);
     MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
       MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
       TM.getSubtargetImpl()->getDataLayout()->getPointerSize(), Align);
     MIB->addMemOperand(*FuncInfo.MF, MMO);
     return ResultReg;
   }
 
   addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                                    TII.get(Opc), ResultReg),
                            CPI, PICBase, OpFlag);
   return ResultReg;
 }
 
 /// Materialize the address of the global \p GV into a register of type
 /// \p VT. Returns the register, or 0 when the code model is not small or the
 /// address cannot be selected.
 unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
   // Can't handle alternate code models yet.
   if (TM.getCodeModel() != CodeModel::Small)
     return 0;
 
   // Materialize addresses with LEA/MOV instructions.
   X86AddressMode AM;
   if (!X86SelectAddress(GV, AM))
     return 0;
 
   // An address that is nothing but a base register needs no instruction.
   const bool IsPlainBaseReg = AM.BaseType == X86AddressMode::RegBase &&
                               AM.IndexReg == 0 && AM.Disp == 0 &&
                               AM.GV == nullptr;
   if (IsPlainBaseReg)
     return AM.Base.Reg;
 
   unsigned AddrReg = createResultReg(TLI.getRegClassFor(VT));
   if (TM.getRelocationModel() == Reloc::Static &&
       TLI.getPointerTy() == MVT::i64) {
     // The displacement code could be more than 32 bits away so we need to use
     // an instruction with a 64 bit immediate
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
             AddrReg)
       .addGlobalAddress(GV);
     return AddrReg;
   }
 
   // Otherwise compute the full address with an LEA of the pointer width.
   unsigned LeaOpc;
   if (TLI.getPointerTy() == MVT::i32)
     LeaOpc = X86::LEA32r;
   else
     LeaOpc = X86::LEA64r;
   addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                          TII.get(LeaOpc), AddrReg), AM);
   return AddrReg;
 }
 
 unsigned X86FastISel::fastMaterializeConstant(const Constant *C) {
   EVT CEVT = TLI.getValueType(C->getType(), true);
 
   // Only handle simple types.
   if (!CEVT.isSimple())
     return 0;
   MVT VT = CEVT.getSimpleVT();
 
   if (const auto *CI = dyn_cast<ConstantInt>(C))
     return X86MaterializeInt(CI, VT);
   else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
     return X86MaterializeFP(CFP, VT);
   else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
     return X86MaterializeGV(GV, VT);
 
   return 0;
 }
 
 unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
   // Fail on dynamic allocas. At this point, getRegForValue has already
   // checked its CSE maps, so if we're here trying to handle a dynamic
   // alloca, we're not going to succeed. X86SelectAddress has a
   // check for dynamic allocas, because it's called directly from
   // various places, but targetMaterializeAlloca also needs a check
   // in order to avoid recursion between getRegForValue,
   // X86SelectAddrss, and targetMaterializeAlloca.
   if (!FuncInfo.StaticAllocaMap.count(C))
     return 0;
   assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?");
 
   X86AddressMode AM;
   if (!X86SelectAddress(C, AM))
     return 0;
   unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
   const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
   unsigned ResultReg = createResultReg(RC);
   addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                          TII.get(Opc), ResultReg), AM);
   return ResultReg;
 }
 
 /// Materialize a floating-point zero of \p CF's type into a fresh register.
 /// Returns 0 when the type is not legal or is anything other than f32/f64.
 unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
   MVT VT;
   if (!isTypeLegal(CF->getType(), VT))
     return 0;
 
   // Pick the zero-producing opcode and register class: the SSE pseudo when
   // the type lives in SSE registers, the x87 load-zero otherwise.
   unsigned ZeroOpc = 0;
   const TargetRegisterClass *RC = nullptr;
   if (VT == MVT::f32) {
     if (X86ScalarSSEf32) {
       ZeroOpc = X86::FsFLD0SS;
       RC = &X86::FR32RegClass;
     } else {
       ZeroOpc = X86::LD_Fp032;
       RC = &X86::RFP32RegClass;
     }
   } else if (VT == MVT::f64) {
     if (X86ScalarSSEf64) {
       ZeroOpc = X86::FsFLD0SD;
       RC = &X86::FR64RegClass;
     } else {
       ZeroOpc = X86::LD_Fp064;
       RC = &X86::RFP64RegClass;
     }
   } else {
     // Everything else, including f80, is not supported yet.
     return 0;
   }
 
   unsigned ZeroReg = createResultReg(RC);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ZeroOpc), ZeroReg);
   return ZeroReg;
 }
 
 
 /// Try to fold the load \p LI into operand \p OpNo of \p MI. On success the
 /// folded instruction is inserted at the current insertion point, \p MI is
 /// erased and true is returned; otherwise nothing changes and false is
 /// returned.
 bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
                                       const LoadInst *LI) {
   X86AddressMode AM;
   if (!X86SelectAddress(LI->getPointerOperand(), AM))
     return false;
 
   const auto &XII = static_cast<const X86InstrInfo &>(TII);
 
   const unsigned LoadSize = DL.getTypeAllocSize(LI->getType());
 
   // Ensure that codegen never sees alignment 0: fall back to the ABI
   // alignment of the loaded type.
   unsigned LoadAlign = LI->getAlignment();
   if (LoadAlign == 0)
     LoadAlign = DL.getABITypeAlignment(LI->getType());
 
   SmallVector<MachineOperand, 8> AddrOps;
   AM.getFullAddress(AddrOps);
 
   MachineInstr *Folded =
     XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps,
                               LoadSize, LoadAlign, /*AllowCommute=*/true);
   if (!Folded)
     return false;
 
   // Attach the load's memory operand, place the new instruction and drop the
   // one it replaces.
   Folded->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI));
   FuncInfo.MBB->insert(FuncInfo.InsertPt, Folded);
   MI->eraseFromParent();
   return true;
 }
 
 
 namespace llvm {
 namespace X86 {
   /// Factory entry point: allocate a new X86FastISel for the given function
   /// lowering info and library info. The result is returned as a raw pointer
   /// from 'new'.
   FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                            const TargetLibraryInfo *libInfo) {
     return new X86FastISel(funcInfo, libInfo);
   }
 } // end namespace X86
 } // end namespace llvm
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
index 16aab16d63ee..f2eb6a8ea73e 100644
--- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -1,2034 +1,1951 @@
 //===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file contains the X86 implementation of TargetFrameLowering class.
 //
 //===----------------------------------------------------------------------===//
 
 #include "X86FrameLowering.h"
 #include "X86InstrBuilder.h"
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Support/Debug.h"
 #include <cstdlib>
 
 using namespace llvm;
 
 // FIXME: completely move here.
 extern cl::opt<bool> ForceStackAlign;
 
 bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
-  return !MF.getFrameInfo()->hasVarSizedObjects();
+  return !MF.getFrameInfo()->hasVarSizedObjects() &&
+         !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+}
+
+/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
+/// call frame pseudos can be simplified.  Having a FP, as in the default
+/// implementation, is not sufficient here since we can't always use it.
+/// Use a more nuanced condition.
+bool
+X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
+  const X86RegisterInfo *TRI = static_cast<const X86RegisterInfo *>
+                               (MF.getSubtarget().getRegisterInfo());
+  return hasReservedCallFrame(MF) ||
+         (hasFP(MF) && !TRI->needsStackRealignment(MF))
+         || TRI->hasBasePointer(MF);
+}
+
+// needsFrameIndexResolution - Do we need to perform FI resolution for
+// this function. Normally, this is required only when the function
+// has any stack objects. However, FI resolution actually has another job,
+// not apparent from the title - it resolves callframesetup/destroy 
+// that were not simplified earlier.
+// So, this is required for x86 functions that have push sequences even
+// when there are no stack objects.
+bool
+X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
+  return MF.getFrameInfo()->hasStackObjects() ||
+         MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
 }
 
 /// hasFP - Return true if the specified function should have a dedicated frame
 /// pointer register.  This is true if the function has variable sized allocas
 /// or if frame pointer elimination is disabled.
 bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const MachineModuleInfo &MMI = MF.getMMI();
   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
 
   return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
           RegInfo->needsStackRealignment(MF) ||
           MFI->hasVarSizedObjects() ||
           MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() ||
           MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
           MMI.callsUnwindInit() || MMI.callsEHReturn() ||
           MFI->hasStackMap() || MFI->hasPatchPoint());
 }
 
 /// Select the SUB register/immediate opcode: the 64-bit variants when
 /// \p IsLP64 is nonzero, the 32-bit variants otherwise.  The "ri8" form is
 /// returned when isInt<8>(Imm) holds, else the 32-bit-immediate form.
 static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
   const bool FitsInByte = isInt<8>(Imm);
   if (IsLP64)
     return FitsInByte ? X86::SUB64ri8 : X86::SUB64ri32;
   return FitsInByte ? X86::SUB32ri8 : X86::SUB32ri;
 }
 
 /// Select the ADD register/immediate opcode: the 64-bit variants when
 /// \p IsLP64 is nonzero, the 32-bit variants otherwise.  The "ri8" form is
 /// returned when isInt<8>(Imm) holds, else the 32-bit-immediate form.
 static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
   const bool FitsInByte = isInt<8>(Imm);
   if (IsLP64)
     return FitsInByte ? X86::ADD64ri8 : X86::ADD64ri32;
   return FitsInByte ? X86::ADD32ri8 : X86::ADD32ri;
 }
 
 /// Select the AND register/immediate opcode: the 64-bit variants when
 /// \p IsLP64 is set, the 32-bit variants otherwise.  The "ri8" form is
 /// returned when isInt<8>(Imm) holds, else the 32-bit-immediate form.
 static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
   const bool FitsInByte = isInt<8>(Imm);
   if (!IsLP64)
     return FitsInByte ? X86::AND32ri8 : X86::AND32ri;
   return FitsInByte ? X86::AND64ri8 : X86::AND64ri32;
 }
 
-static unsigned getPUSHiOpcode(bool IsLP64, MachineOperand MO) {
-  // We don't support LP64 for now.
-  assert(!IsLP64);
-
-  if (MO.isImm() && isInt<8>(MO.getImm()))
-    return X86::PUSH32i8;
-
-  return X86::PUSHi32;;
-}
-
 /// Select the LEA opcode: LEA64r when \p IsLP64 is nonzero, LEA32r otherwise.
 static unsigned getLEArOpcode(unsigned IsLP64) {
   if (IsLP64)
     return X86::LEA64r;
   return X86::LEA32r;
 }
 
 /// findDeadCallerSavedReg - Pick a caller-saved register that the return-like
 /// instruction at MBBI does not read, so a stack slot can be popped into it
 /// without clobbering a live value.
 ///
 /// Returns 0 when no register can be offered: the machine function has no IR
 /// function, the module calls eh.return, the instruction at MBBI is not one
 /// of the return / tail-call / EH-return opcodes listed below, or every
 /// candidate register (or one of its aliases) is read by that instruction.
 static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator &MBBI,
                                        const TargetRegisterInfo &TRI,
                                        bool Is64Bit) {
   const MachineFunction *ParentMF = MBB.getParent();
   if (!ParentMF->getFunction() || ParentMF->getMMI().callsEHReturn())
     return 0;
 
   // Zero-terminated candidate lists, in order of preference.
   static const uint16_t CallerSavedRegs32Bit[] = {
     X86::EAX, X86::EDX, X86::ECX, 0
   };
 
   static const uint16_t CallerSavedRegs64Bit[] = {
     X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI,
     X86::R8,  X86::R9,  X86::R10, X86::R11, 0
   };
 
   // Only the opcodes below are handled; anything else yields no register.
   switch (MBBI->getOpcode()) {
   case X86::RETL:
   case X86::RETQ:
   case X86::RETIL:
   case X86::RETIQ:
   case X86::TCRETURNdi:
   case X86::TCRETURNri:
   case X86::TCRETURNmi:
   case X86::TCRETURNdi64:
   case X86::TCRETURNri64:
   case X86::TCRETURNmi64:
   case X86::EH_RETURN:
   case X86::EH_RETURN64:
     break;
   default:
     return 0;
   }
 
   // Collect every register read by the instruction, including all aliases.
   SmallSet<uint16_t, 8> ReadRegs;
   const unsigned NumOps = MBBI->getNumOperands();
   for (unsigned Idx = 0; Idx != NumOps; ++Idx) {
     MachineOperand &Op = MBBI->getOperand(Idx);
     if (Op.isReg() && !Op.isDef()) {
       if (unsigned UsedReg = Op.getReg()) {
         for (MCRegAliasIterator Alias(UsedReg, &TRI, true); Alias.isValid();
              ++Alias)
           ReadRegs.insert(*Alias);
       }
     }
   }
 
   // The first candidate that is not read wins.
   const uint16_t *Candidate =
       Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit;
   while (*Candidate) {
     if (!ReadRegs.count(*Candidate))
       return *Candidate;
     ++Candidate;
   }
 
   return 0;
 }
 
 
 /// emitSPUpdate - Emit a series of instructions to increment / decrement the
 /// stack pointer by a constant value.
 ///
 /// \param MBB, MBBI        Block and position at which instructions are built.
 /// \param StackPtr         Register that is adjusted.
 /// \param NumBytes         Signed adjustment; a negative value subtracts from
 ///                         the stack pointer, a positive value adds to it.
 /// \param Is64BitTarget    Selects the 8-byte (vs. 4-byte) slot size and the
 ///                         64-bit push/pop opcodes and registers.
 /// \param Is64BitStackPtr  Selects the 64-bit (vs. 32-bit) ADD/SUB/LEA opcode.
 /// \param UseLEA           Use LEA instead of ADD/SUB for the adjustment.
 /// \param TII, TRI         Target instruction and register info.
 ///
 /// Instructions emitted for a subtraction are flagged FrameSetup.
 static
 void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
                   unsigned StackPtr, int64_t NumBytes,
                   bool Is64BitTarget, bool Is64BitStackPtr, bool UseLEA,
                   const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) {
   // Work with the magnitude of the adjustment; isSub remembers the direction.
   bool isSub = NumBytes < 0;
   uint64_t Offset = isSub ? -NumBytes : NumBytes;
   // The opcode is picked once from the full offset; it may be replaced by a
   // push/pop opcode inside the loop below.
   unsigned Opc;
   if (UseLEA)
     Opc = getLEArOpcode(Is64BitStackPtr);
   else
     Opc = isSub
       ? getSUBriOpcode(Is64BitStackPtr, Offset)
       : getADDriOpcode(Is64BitStackPtr, Offset);
 
   // Largest amount adjusted by a single instruction: 2^31 - 1.
   uint64_t Chunk = (1LL << 31) - 1;
   DebugLoc DL = MBB.findDebugLoc(MBBI);
 
   // Emit one instruction per chunk until the whole offset is consumed.
   while (Offset) {
     uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
     // An adjustment of exactly 8 bytes (64-bit target) or 4 bytes can be done
     // with a single push or pop.
     if (ThisVal == (Is64BitTarget ? 8 : 4)) {
       // Use push / pop instead.
       // Subtracting pushes RAX/EAX (marked undef below); adding pops into a
       // register returned by findDeadCallerSavedReg, which may be 0.
       unsigned Reg = isSub
         ? (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX)
         : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget);
       // Reg == 0 means no register is available; fall through to the generic
       // ADD/SUB/LEA path in that case.
       if (Reg) {
         Opc = isSub
           ? (Is64BitTarget ? X86::PUSH64r : X86::PUSH32r)
           : (Is64BitTarget ? X86::POP64r  : X86::POP32r);
         MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
           .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
         if (isSub)
           MI->setFlag(MachineInstr::FrameSetup);
         Offset -= ThisVal;
         continue;
       }
     }
 
     MachineInstr *MI = nullptr;
 
     if (UseLEA) {
       // LEA form: StackPtr = StackPtr + (signed) displacement.
       MI =  addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
                           StackPtr, false, isSub ? -ThisVal : ThisVal);
     } else {
       // ADD/SUB form: StackPtr = StackPtr +/- ThisVal.
       MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
             .addReg(StackPtr)
             .addImm(ThisVal);
       MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
     }
 
     if (isSub)
       MI->setFlag(MachineInstr::FrameSetup);
 
     Offset -= ThisVal;
   }
 }
 
 /// mergeSPUpdatesUp - Merge a stack-pointer update that immediately precedes
 /// MBBI into the caller's pending adjustment.
 ///
 /// If the instruction before MBBI is an ADD-immediate, SUB-immediate, or a
 /// simple LEA of the form "StackPtr = lea [StackPtr + disp]" that defines
 /// StackPtr, it is erased.  When NumBytes is non-null the erased adjustment is
 /// folded into *NumBytes: ADD immediates and LEA displacements are added, SUB
 /// immediates are subtracted.  Nothing happens if MBBI is at the start of the
 /// block or the preceding instruction does not match.
 static
 void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
                       unsigned StackPtr, uint64_t *NumBytes = nullptr) {
   if (MBBI == MBB.begin()) return;
 
   MachineBasicBlock::iterator PI = std::prev(MBBI);
   unsigned Opc = PI->getOpcode();
   if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
        Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
       PI->getOperand(0).getReg() == StackPtr) {
     if (NumBytes)
       *NumBytes += PI->getOperand(2).getImm();
     MBB.erase(PI);
   } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
              PI->getOperand(0).getReg() == StackPtr &&
              PI->getOperand(1).getReg() == StackPtr &&
              PI->getOperand(2).getImm() == 1 &&
              PI->getOperand(3).getReg() == X86::NoRegister &&
              PI->getOperand(4).isImm() &&
              PI->getOperand(5).getReg() == X86::NoRegister) {
     // LEA operands are: def, base, scale, index, displacement, segment.  The
     // adjustment is the displacement (operand 4); operand 2 is the scale.
     // Only the plain "SP = SP + disp" shape is a constant SP update, so any
     // other LEA that defines StackPtr is left in place.
     if (NumBytes)
       *NumBytes += PI->getOperand(4).getImm();
     MBB.erase(PI);
   } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
               Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
              PI->getOperand(0).getReg() == StackPtr) {
     if (NumBytes)
       *NumBytes -= PI->getOperand(2).getImm();
     MBB.erase(PI);
   }
 }
 
 /// mergeSPUpdatesDown - Intended to merge a stack-pointer ADD/SUB that
 /// immediately follows MBBI into *NumBytes.
 ///
 /// As written this function is a no-op: it returns unconditionally on entry
 /// (see the FIXME below), so none of the merging code after that runs.
 static
 void mergeSPUpdatesDown(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator &MBBI,
                         unsigned StackPtr, uint64_t *NumBytes = nullptr) {
   // FIXME:  THIS ISN'T RUN!!!
   return;
 
   // ---- Everything below is currently unreachable. ----
 
   if (MBBI == MBB.end()) return;
 
   MachineBasicBlock::iterator NI = std::next(MBBI);
   if (NI == MBB.end()) return;
 
   unsigned Opc = NI->getOpcode();
   // A following ADD of StackPtr is subtracted from *NumBytes ...
   if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
        Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
       NI->getOperand(0).getReg() == StackPtr) {
     if (NumBytes)
       *NumBytes -= NI->getOperand(2).getImm();
     // NOTE(review): NI is assigned to MBBI after it has been erased; revisit
     // before re-enabling this code.
     MBB.erase(NI);
     MBBI = NI;
   // ... and a following SUB of StackPtr is added to it.
   } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
               Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
              NI->getOperand(0).getReg() == StackPtr) {
     if (NumBytes)
       *NumBytes += NI->getOperand(2).getImm();
     // NOTE(review): same erased-iterator assignment as in the ADD case.
     MBB.erase(NI);
     MBBI = NI;
   }
 }
 
 /// mergeSPUpdates - Checks the instruction before MBBI (when
 /// doMergeWithPrevious is true) or the instruction at MBBI (otherwise).  If it
 /// is an ADD-immediate, SUB-immediate, or a simple LEA of the form
 /// "StackPtr = lea [StackPtr + disp]" that defines StackPtr, the instruction
 /// is erased and its stack adjustment is returned: the immediate for ADD, the
 /// displacement for LEA, and the negated immediate for SUB.  Returns 0 when
 /// there is no such instruction.  When merging forward, MBBI is advanced past
 /// the erased instruction.
 static int mergeSPUpdates(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator &MBBI, unsigned StackPtr,
                           bool doMergeWithPrevious) {
   if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
       (!doMergeWithPrevious && MBBI == MBB.end()))
     return 0;
 
   MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
   MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
                                                        : std::next(MBBI);
   unsigned Opc = PI->getOpcode();
   int Offset = 0;
 
   if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
        Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
       PI->getOperand(0).getReg() == StackPtr){
     Offset += PI->getOperand(2).getImm();
     MBB.erase(PI);
     if (!doMergeWithPrevious) MBBI = NI;
   } else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
              PI->getOperand(0).getReg() == StackPtr &&
              PI->getOperand(1).getReg() == StackPtr &&
              PI->getOperand(2).getImm() == 1 &&
              PI->getOperand(3).getReg() == X86::NoRegister &&
              PI->getOperand(4).isImm() &&
              PI->getOperand(5).getReg() == X86::NoRegister) {
     // LEA operands are: def, base, scale, index, displacement, segment.  The
     // adjustment is the displacement (operand 4); operand 2 is the scale.
     // Only the plain "SP = SP + disp" shape is a constant SP update, so any
     // other LEA that defines StackPtr is left in place.
     Offset += PI->getOperand(4).getImm();
     MBB.erase(PI);
     if (!doMergeWithPrevious) MBBI = NI;
   } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
               Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
              PI->getOperand(0).getReg() == StackPtr) {
     Offset -= PI->getOperand(2).getImm();
     MBB.erase(PI);
     if (!doMergeWithPrevious) MBBI = NI;
   }
 
   return Offset;
 }
 
 /// Returns true if EAX, AX, AH or AL appears among the live-in registers
 /// recorded in the function's register info.
 static bool isEAXLiveIn(MachineFunction &MF) {
   MachineRegisterInfo &RegInfo = MF.getRegInfo();
   MachineRegisterInfo::livein_iterator It = RegInfo.livein_begin();
   MachineRegisterInfo::livein_iterator End = RegInfo.livein_end();
 
   for (; It != End; ++It) {
     switch (It->first) {
     case X86::EAX:
     case X86::AX:
     case X86::AH:
     case X86::AL:
       return true;
     default:
       break;
     }
   }
 
   return false;
 }
 
 /// Emit one CFI_INSTRUCTION at MBBI for every callee-saved register recorded
 /// in the frame info.  Each one is an "offset" CFI rule pairing the register's
 /// DWARF number with the object offset of its spill slot.  Emits nothing when
 /// there are no callee-saved registers.
 void
 X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
                                             MachineBasicBlock::iterator MBBI,
                                             DebugLoc DL) const {
   MachineFunction &MF = *MBB.getParent();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
 
   for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
     // Where the register was spilled, and how DWARF names it.
     int64_t SlotOffset = MFI->getObjectOffset(Info.getFrameIdx());
     unsigned DwarfReg = MRI->getDwarfRegNum(Info.getReg(), true);
 
     // Register the CFI rule with the module, then reference it by index.
     unsigned CFIIndex = MMI.addFrameInst(
         MCCFIInstruction::createOffset(nullptr, DwarfReg, SlotOffset));
     BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
         .addCFIIndex(CFIIndex);
   }
 }
 
 /// usesTheStack - This function checks if any of the users of EFLAGS
 /// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has
 /// to use the stack, and if we don't adjust the stack we clobber the first
 /// frame index.
 /// See X86InstrInfo::copyPhysReg.
 static bool usesTheStack(const MachineFunction &MF) {
   const MachineRegisterInfo &MRI = MF.getRegInfo();
 
   for (MachineRegisterInfo::reg_instr_iterator
        ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end();
        ri != re; ++ri)
     if (ri->isCopy())
       return true;
 
   return false;
 }
 
 /// Report the call opcode and symbol name of the stack probe helper for the
 /// given subtarget through the two out-parameters.
 ///
 /// \param STI     Subtarget queried for 64-bit mode and Cygwin/MinGW target.
 /// \param CallOp  Set to W64ALLOCA in 64-bit mode, CALLpcrel32 otherwise.
 /// \param Symbol  Set to "___chkstk_ms" (64-bit Cygwin/MinGW), "__chkstk"
 ///                (other 64-bit), "_alloca" (32-bit Cygwin/MinGW) or
 ///                "_chkstk" (other 32-bit).
 void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI,
                                              unsigned &CallOp,
                                              const char *&Symbol) {
   const bool Is64Bit = STI.is64Bit();
   const bool IsCygMing = STI.isTargetCygMing();
 
   if (Is64Bit) {
     CallOp = X86::W64ALLOCA;
     Symbol = IsCygMing ? "___chkstk_ms" : "__chkstk";
     return;
   }
 
   CallOp = X86::CALLpcrel32;
   Symbol = IsCygMing ? "_alloca" : "_chkstk";
 }
 
 /// emitPrologue - Push callee-saved registers onto the stack, which
 /// automatically adjust the stack pointer. Adjust the stack pointer to allocate
 /// space for local variables. Also emit labels used by the exception handler to
 /// generate the exception handling frames.
 
 /*
   Here's a gist of what gets emitted:
 
   ; Establish frame pointer, if needed
   [if needs FP]
       push  %rbp
       .cfi_def_cfa_offset 16
       .cfi_offset %rbp, -16
       .seh_pushreg %rbp
       mov  %rsp, %rbp
       .cfi_def_cfa_register %rbp
 
   ; Spill general-purpose registers
   [for all callee-saved GPRs]
       pushq %<reg>
       [if not needs FP]
          .cfi_def_cfa_offset (offset from RETADDR)
       .seh_pushreg %<reg>
 
   ; If the required stack alignment > default stack alignment
   ; rsp needs to be re-aligned.  This creates a "re-alignment gap"
   ; of unknown size in the stack frame.
   [if stack needs re-alignment]
       and  $MASK, %rsp
 
   ; Allocate space for locals
   [if target is Windows and allocated space > 4096 bytes]
       ; Windows needs special care for allocations larger
       ; than one page.
       mov $NNN, %rax
       call ___chkstk_ms/___chkstk
       sub  %rax, %rsp
   [else]
       sub  $NNN, %rsp
 
   [if needs FP]
       .seh_stackalloc (size of XMM spill slots)
       .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
   [else]
       .seh_stackalloc NNN
 
   ; Spill XMMs
   ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,
   ; they may get spilled on any platform, if the current function
   ; calls @llvm.eh.unwind.init
   [if needs FP]
       [for all callee-saved XMM registers]
           movaps  %<xmm reg>, -MMM(%rbp)
       [for all callee-saved XMM registers]
           .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
               ; i.e. the offset relative to (%rbp - SEHFrameOffset)
   [else]
       [for all callee-saved XMM registers]
           movaps  %<xmm reg>, KKK(%rsp)
       [for all callee-saved XMM registers]
           .seh_savexmm %<xmm reg>, KKK
 
   .seh_endprologue
 
   [if needs base pointer]
       mov  %rsp, %rbx
       [if needs to restore base pointer]
           mov %rsp, -MMM(%rbp)
 
   ; Emit CFI info
   [if needs FP]
       [for all callee-saved registers]
           .cfi_offset %<reg>, (offset from %rbp)
   [else]
        .cfi_def_cfa_offset (offset from RETADDR)
       [for all callee-saved registers]
           .cfi_offset %<reg>, (offset from %rsp)
 
   Notes:
   - .seh directives are emitted only for Windows 64 ABI
   - .cfi directives are emitted for all other ABIs
   - for 32-bit code, substitute %e?? registers for %r??
 */
 
 void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *Fn = MF.getFunction();
   const X86RegisterInfo *RegInfo =
       static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   uint64_t MaxAlign  = MFI->getMaxAlignment(); // Desired stack alignment.
   uint64_t StackSize = MFI->getStackSize();    // Number of bytes to allocate.
   bool HasFP = hasFP(MF);
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool Is64Bit = STI.is64Bit();
   // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
   const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
   bool IsWin64 = STI.isTargetWin64();
   // Not necessarily synonymous with IsWin64.
   bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
   bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry();
   bool NeedsDwarfCFI =
       !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
   bool UseLEA = STI.useLeaForSP();
   unsigned StackAlign = getStackAlignment();
   unsigned SlotSize = RegInfo->getSlotSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
   const unsigned MachineFramePtr = STI.isTarget64BitILP32() ?
                  getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr;
   unsigned StackPtr = RegInfo->getStackRegister();
   unsigned BasePtr = RegInfo->getBaseRegister();
   DebugLoc DL;
 
   // If we're forcing a stack realignment we can't rely on just the frame
   // info, we need to know the ABI stack alignment as well in case we
   // have a call out.  Otherwise just make sure we have some alignment - we'll
   // go with the minimum SlotSize.
   if (ForceStackAlign) {
     if (MFI->hasCalls())
       MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
     else if (MaxAlign < SlotSize)
       MaxAlign = SlotSize;
   }
 
   // Add RETADDR move area to callee saved frame size.
   int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
   if (TailCallReturnAddrDelta < 0)
     X86FI->setCalleeSavedFrameSize(
       X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
 
   bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO());
 
   // The default stack probe size is 4096 if the function has no stackprobesize
   // attribute.
   unsigned StackProbeSize = 4096;
   if (Fn->hasFnAttribute("stack-probe-size"))
     Fn->getFnAttribute("stack-probe-size")
         .getValueAsString()
         .getAsInteger(0, StackProbeSize);
 
   // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
   // function, and use up to 128 bytes of stack space, don't have a frame
   // pointer, calls, or dynamic alloca then we do not need to adjust the
   // stack pointer (we fit in the Red Zone). We also check that we don't
   // push and pop from the stack.
   if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                                    Attribute::NoRedZone) &&
       !RegInfo->needsStackRealignment(MF) &&
       !MFI->hasVarSizedObjects() &&                     // No dynamic alloca.
       !MFI->adjustsStack() &&                           // No calls.
       !IsWin64 &&                                       // Win64 has no Red Zone
       !usesTheStack(MF) &&                              // Don't push and pop.
       !MF.shouldSplitStack()) {                         // Regular stack
     uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
     if (HasFP) MinSize += SlotSize;
     StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
     MFI->setStackSize(StackSize);
   }
 
   // Insert stack pointer adjustment for later moving of return addr.  Only
   // applies to tail call optimized functions where the callee argument stack
   // size is bigger than the callers.
   if (TailCallReturnAddrDelta < 0) {
     MachineInstr *MI =
       BuildMI(MBB, MBBI, DL,
               TII.get(getSUBriOpcode(Uses64BitFramePtr, -TailCallReturnAddrDelta)),
               StackPtr)
         .addReg(StackPtr)
         .addImm(-TailCallReturnAddrDelta)
         .setMIFlag(MachineInstr::FrameSetup);
     MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
   }
 
   // Mapping for machine moves:
   //
   //   DST: VirtualFP AND
   //        SRC: VirtualFP              => DW_CFA_def_cfa_offset
   //        ELSE                        => DW_CFA_def_cfa
   //
   //   SRC: VirtualFP AND
   //        DST: Register               => DW_CFA_def_cfa_register
   //
   //   ELSE
   //        OFFSET < 0                  => DW_CFA_offset_extended_sf
   //        REG < 64                    => DW_CFA_offset + Reg
   //        ELSE                        => DW_CFA_offset_extended
 
   uint64_t NumBytes = 0;
   int stackGrowth = -SlotSize;
 
   if (HasFP) {
     // Calculate required stack adjustment.
     uint64_t FrameSize = StackSize - SlotSize;
     // If required, include space for extra hidden slot for stashing base pointer.
     if (X86FI->getRestoreBasePointer())
       FrameSize += SlotSize;
     if (RegInfo->needsStackRealignment(MF)) {
       // Callee-saved registers are pushed on stack before the stack
       // is realigned.
       FrameSize -= X86FI->getCalleeSavedFrameSize();
       NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
     } else {
       NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();
     }
 
     // Get the offset of the stack slot for the EBP register, which is
     // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
     // Update the frame offset adjustment.
     MFI->setOffsetAdjustment(-NumBytes);
 
     // Save EBP/RBP into the appropriate stack slot.
     BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
       .addReg(MachineFramePtr, RegState::Kill)
       .setMIFlag(MachineInstr::FrameSetup);
 
     if (NeedsDwarfCFI) {
       // Mark the place where EBP/RBP was saved.
       // Define the current CFA rule to use the provided offset.
       assert(StackSize);
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
       BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
 
       // Change the rule for the FramePtr to be an "offset" rule.
       unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true);
       CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createOffset(nullptr,
                                          DwarfFramePtr, 2 * stackGrowth));
       BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
 
     if (NeedsWinEH) {
       BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
           .addImm(FramePtr)
           .setMIFlag(MachineInstr::FrameSetup);
     }
 
     // Update EBP with the new base value.
     BuildMI(MBB, MBBI, DL,
             TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr)
         .addReg(StackPtr)
         .setMIFlag(MachineInstr::FrameSetup);
 
     if (NeedsDwarfCFI) {
       // Mark effective beginning of when frame pointer becomes valid.
       // Define the current CFA to use the EBP/RBP register.
       unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true);
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
       BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
 
     // Mark the FramePtr as live-in in every block.
     for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
       I->addLiveIn(MachineFramePtr);
   } else {
     NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
   }
 
   // Skip the callee-saved push instructions.
   bool PushedRegs = false;
   int StackOffset = 2 * stackGrowth;
 
   while (MBBI != MBB.end() &&
          (MBBI->getOpcode() == X86::PUSH32r ||
           MBBI->getOpcode() == X86::PUSH64r)) {
     PushedRegs = true;
     unsigned Reg = MBBI->getOperand(0).getReg();
     ++MBBI;
 
     if (!HasFP && NeedsDwarfCFI) {
       // Mark callee-saved push instruction.
       // Define the current CFA rule to use the provided offset.
       assert(StackSize);
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
       BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
       StackOffset += stackGrowth;
     }
 
     if (NeedsWinEH) {
       BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
           MachineInstr::FrameSetup);
     }
   }
 
   // Realign stack after we pushed callee-saved registers (so that we'll be
   // able to calculate their offsets from the frame pointer).
   if (RegInfo->needsStackRealignment(MF)) {
     assert(HasFP && "There should be a frame pointer if stack is realigned.");
     uint64_t Val = -MaxAlign;
     MachineInstr *MI =
       BuildMI(MBB, MBBI, DL,
               TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr)
       .addReg(StackPtr)
       .addImm(Val)
       .setMIFlag(MachineInstr::FrameSetup);
 
     // The EFLAGS implicit def is dead.
     MI->getOperand(3).setIsDead();
   }
 
   // If there is an SUB32ri of ESP immediately before this instruction, merge
   // the two. This can be the case when tail call elimination is enabled and
   // the callee has more arguments than the caller.
   NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);
 
   // If there is an ADD32ri or SUB32ri of ESP immediately after this
   // instruction, merge the two instructions.
   mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);
 
   // Adjust stack pointer: ESP -= numbytes.
 
   // Windows and cygwin/mingw require a prologue helper routine when allocating
   // more than 4K bytes on the stack.  Windows uses __chkstk and cygwin/mingw
   // uses __alloca.  __alloca and the 32-bit version of __chkstk will probe the
   // stack and adjust the stack pointer in one go.  The 64-bit version of
   // __chkstk is only responsible for probing the stack.  The 64-bit prologue is
   // responsible for adjusting the stack pointer.  Touching the stack at 4K
   // increments is necessary to ensure that the guard pages used by the OS
   // virtual memory manager are allocated in correct sequence.
   if (NumBytes >= StackProbeSize && UseStackProbe) {
     const char *StackProbeSymbol;
     unsigned CallOp;
 
     getStackProbeFunction(STI, CallOp, StackProbeSymbol);
 
     // Check whether EAX is livein for this function.
     bool isEAXAlive = isEAXLiveIn(MF);
 
     if (isEAXAlive) {
       // Sanity check that EAX is not livein for this function.
       // It should not be, so throw an assert.
       assert(!Is64Bit && "EAX is livein in x64 case!");
 
       // Save EAX
       BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
         .addReg(X86::EAX, RegState::Kill)
         .setMIFlag(MachineInstr::FrameSetup);
     }
 
     if (Is64Bit) {
       // Handle the 64-bit Windows ABI case where we need to call __chkstk.
       // Function prologue is responsible for adjusting the stack pointer.
       BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX)
         .addImm(NumBytes)
         .setMIFlag(MachineInstr::FrameSetup);
     } else {
       // Allocate NumBytes-4 bytes on stack in case of isEAXAlive.
       // We'll also use 4 already allocated bytes for EAX.
       BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
         .addImm(isEAXAlive ? NumBytes - 4 : NumBytes)
         .setMIFlag(MachineInstr::FrameSetup);
     }
 
     BuildMI(MBB, MBBI, DL,
             TII.get(CallOp))
       .addExternalSymbol(StackProbeSymbol)
       .addReg(StackPtr,    RegState::Define | RegState::Implicit)
       .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
       .setMIFlag(MachineInstr::FrameSetup);
 
     if (Is64Bit) {
       // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp
       // themselves. They also do not clobber %rax, so we can reuse it when
       // adjusting %rsp.
       BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr)
         .addReg(StackPtr)
         .addReg(X86::RAX)
         .setMIFlag(MachineInstr::FrameSetup);
     }
     if (isEAXAlive) {
       // Restore EAX
       MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
                                               X86::EAX),
                                       StackPtr, false, NumBytes - 4);
       MI->setFlag(MachineInstr::FrameSetup);
       MBB.insert(MBBI, MI);
     }
   } else if (NumBytes) {
     emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, Uses64BitFramePtr,
                  UseLEA, TII, *RegInfo);
   }
 
   int SEHFrameOffset = 0;
   if (NeedsWinEH) {
     if (HasFP) {
       // We need to set frame base offset low enough such that all saved
       // register offsets would be positive relative to it, but we can't
       // just use NumBytes, because .seh_setframe offset must be <=240.
       // So we pretend to have only allocated enough space to spill the
       // non-volatile registers.
       // We don't care about the rest of stack allocation, because unwinder
       // will restore SP to (BP - SEHFrameOffset)
       for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
         int offset = MFI->getObjectOffset(Info.getFrameIdx());
         SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset));
       }
       SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant
 
       // This only needs to account for XMM spill slots, GPR slots
       // are covered by the .seh_pushreg's emitted above.
       unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize();
       if (Size) {
         BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
             .addImm(Size)
             .setMIFlag(MachineInstr::FrameSetup);
       }
 
       BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
           .addImm(FramePtr)
           .addImm(SEHFrameOffset)
           .setMIFlag(MachineInstr::FrameSetup);
     } else {
       // SP will be the base register for restoring XMMs
       if (NumBytes) {
         BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc))
             .addImm(NumBytes)
             .setMIFlag(MachineInstr::FrameSetup);
       }
     }
   }
 
   // Skip the rest of register spilling code
   while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
     ++MBBI;
 
   // Emit SEH info for non-GPRs
   if (NeedsWinEH) {
     for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) {
       unsigned Reg = Info.getReg();
       if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
         continue;
       assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class");
 
       int Offset = getFrameIndexOffset(MF, Info.getFrameIdx());
       Offset += SEHFrameOffset;
 
       BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
           .addImm(Reg)
           .addImm(Offset)
           .setMIFlag(MachineInstr::FrameSetup);
     }
 
     BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue))
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
   // If we need a base pointer, set it up here. It's whatever the value
   // of the stack pointer is at this point. Any variable size objects
   // will be allocated after this, so we can still use the base pointer
   // to reference locals.
   if (RegInfo->hasBasePointer(MF)) {
     // Update the base pointer with the current stack pointer.
     unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr;
     BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr)
       .addReg(StackPtr)
       .setMIFlag(MachineInstr::FrameSetup);
     if (X86FI->getRestoreBasePointer()) {
       // Stash value of base pointer.  Saving RSP instead of EBP shortens dependence chain.
       unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
       addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)),
                    FramePtr, true, X86FI->getRestoreBasePointerOffset())
         .addReg(StackPtr)
         .setMIFlag(MachineInstr::FrameSetup);
     }
   }
 
   if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) {
     // Mark end of stack pointer adjustment.
     if (!HasFP && NumBytes) {
       // Define the current CFA rule to use the provided offset.
       assert(StackSize);
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaOffset(nullptr,
                                                -StackSize + stackGrowth));
 
       BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
 
     // Emit DWARF info specifying the offsets of the callee-saved registers.
     if (PushedRegs)
       emitCalleeSavedFrameMoves(MBB, MBBI, DL);
   }
 }
 
 /// Insert the function epilogue into the returning block \p MBB.
 ///
 /// Working backwards from the block's last non-debug instruction (which must
 /// be one of the return / tail-call / eh_return opcodes checked below), this
 /// pops the frame pointer when the function has one, restores the stack
 /// pointer (either from the frame pointer or by adding the frame size back),
 /// emits the SEH epilogue marker when Windows unwind info is needed, and then
 /// handles the specific return opcode: EH_RETURN reloads the stack pointer
 /// from its operand, TCRETURN* pseudos are replaced by TAILJMP* instructions,
 /// and plain returns undo any tail-call return-address area delta.
 void X86FrameLowering::emitEpilogue(MachineFunction &MF,
                                     MachineBasicBlock &MBB) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   const X86RegisterInfo *RegInfo =
       static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   // MBBI starts on the returning instruction itself.
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   assert(MBBI != MBB.end() && "Returning block has no instructions");
   unsigned RetOpcode = MBBI->getOpcode();
   DebugLoc DL = MBBI->getDebugLoc();
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool Is64Bit = STI.is64Bit();
   // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit.
   const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64();
   const bool Is64BitILP32 = STI.isTarget64BitILP32();
   bool UseLEA = STI.useLeaForSP();
   unsigned StackAlign = getStackAlignment();
   unsigned SlotSize = RegInfo->getSlotSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
   // For 64-bit ILP32 the 64-bit super-register of the frame pointer is used
   // as the operand of the POP below.
   unsigned MachineFramePtr = Is64BitILP32 ?
              getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr;
   unsigned StackPtr = RegInfo->getStackRegister();
 
   bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
   bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry();
 
   // Sanity check: only the opcodes listed here may terminate a block that
   // receives an epilogue.
   switch (RetOpcode) {
   default:
     llvm_unreachable("Can only insert epilog into returning blocks");
   case X86::RETQ:
   case X86::RETL:
   case X86::RETIL:
   case X86::RETIQ:
   case X86::TCRETURNdi:
   case X86::TCRETURNri:
   case X86::TCRETURNmi:
   case X86::TCRETURNdi64:
   case X86::TCRETURNri64:
   case X86::TCRETURNmi64:
   case X86::EH_RETURN:
   case X86::EH_RETURN64:
     break;  // These are ok
   }
 
   // Get the number of bytes to allocate from the FrameInfo.
   uint64_t StackSize = MFI->getStackSize();
   uint64_t MaxAlign  = MFI->getMaxAlignment();
   unsigned CSSize = X86FI->getCalleeSavedFrameSize();
   // Number of bytes the stack pointer must be moved back by; computed below.
   uint64_t NumBytes = 0;
 
   // If we're forcing a stack realignment we can't rely on just the frame
   // info, we need to know the ABI stack alignment as well in case we
   // have a call out.  Otherwise just make sure we have some alignment - we'll
   // go with the minimum.
   if (ForceStackAlign) {
     if (MFI->hasCalls())
       MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
     else
       MaxAlign = MaxAlign ? MaxAlign : 4;
   }
 
   if (hasFP(MF)) {
     // Calculate required stack adjustment.
     // One slot is excluded here; it is released by the frame-pointer POP
     // emitted below rather than by the stack-pointer adjustment.
     uint64_t FrameSize = StackSize - SlotSize;
     if (RegInfo->needsStackRealignment(MF)) {
       // Callee-saved registers were pushed on stack before the stack
       // was realigned.
       FrameSize -= CSSize;
       // Round the remaining size up to a multiple of MaxAlign.
       NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;
     } else {
       NumBytes = FrameSize - CSSize;
     }
 
     // Pop EBP.
     BuildMI(MBB, MBBI, DL,
             TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr);
   } else {
     NumBytes = StackSize - CSSize;
   }
 
   // Skip the callee-saved pop instructions.
   // Walk backwards while the preceding instruction is a POP32r/POP64r, a
   // DBG_VALUE or a terminator; MBBI ends up on the first instruction of
   // that trailing run.
   while (MBBI != MBB.begin()) {
     MachineBasicBlock::iterator PI = std::prev(MBBI);
     unsigned Opc = PI->getOpcode();
 
     if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE &&
         !PI->isTerminator())
       break;
 
     --MBBI;
   }
   // Remember this position; it is restored below when the stack was
   // realigned.
   MachineBasicBlock::iterator FirstCSPop = MBBI;
 
   DL = MBBI->getDebugLoc();
 
   // If there is an ADD32ri or SUB32ri of ESP immediately before this
   // instruction, merge the two instructions.
   if (NumBytes || MFI->hasVarSizedObjects())
     mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
 
   // If dynamic alloca is used, then reset esp to point to the last callee-saved
   // slot before popping them off! Same applies for the case, when stack was
   // realigned.
   if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) {
     if (RegInfo->needsStackRealignment(MF))
       MBBI = FirstCSPop;
     if (CSSize != 0) {
       // StackPtr = FramePtr - CSSize, computed with an LEA.
       unsigned Opc = getLEArOpcode(Uses64BitFramePtr);
       addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
                    FramePtr, false, -CSSize);
       // Step back one instruction (presumably onto the LEA just inserted
       // before MBBI) so that later insertions land ahead of it.
       --MBBI;
     } else {
       // No callee-saved area: a plain register move StackPtr = FramePtr.
       unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr);
       BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
         .addReg(FramePtr);
       --MBBI;
     }
   } else if (NumBytes) {
     // Adjust stack pointer back: ESP += numbytes.
     emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA,
                  TII, *RegInfo);
     --MBBI;
   }
 
   // Windows unwinder will not invoke function's exception handler if IP is
   // either in prologue or in epilogue.  This behavior causes a problem when a
   // call immediately precedes an epilogue, because the return address points
   // into the epilogue.  To cope with that, we insert an epilogue marker here,
   // then replace it with a 'nop' if it ends up immediately after a CALL in the
   // final emitted code.
   if (NeedsWinEH)
     BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
 
   // We're returning from function via eh_return.
   if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) {
     MBBI = MBB.getLastNonDebugInstr();
     // Operand 0 of EH_RETURN is the register that supplies the new stack
     // pointer value.
     MachineOperand &DestAddr  = MBBI->getOperand(0);
     assert(DestAddr.isReg() && "Offset should be in register!");
     BuildMI(MBB, MBBI, DL,
             TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr),
             StackPtr).addReg(DestAddr.getReg());
   } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
              RetOpcode == X86::TCRETURNmi ||
              RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 ||
              RetOpcode == X86::TCRETURNmi64) {
     bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64;
     // Tail call return: adjust the stack pointer and jump to callee.
     MBBI = MBB.getLastNonDebugInstr();
     MachineOperand &JumpTarget = MBBI->getOperand(0);
     // The memory forms carry five leading operands (copied verbatim to the
     // TAILJMPm below), so their stack adjustment is operand 5 instead of 1.
     MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
     assert(StackAdjust.isImm() && "Expecting immediate value.");
 
     // Adjust stack pointer.
     int StackAdj = StackAdjust.getImm();
     int MaxTCDelta = X86FI->getTCReturnAddrDelta();
     int Offset = 0;
     assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
 
     // Incoporate the retaddr area.
     Offset = StackAdj-MaxTCDelta;
     assert(Offset >= 0 && "Offset should never be negative");
 
     if (Offset) {
       // Check for possible merge with preceding ADD instruction.
       Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
       emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr,
                    UseLEA, TII, *RegInfo);
     }
 
     // Jump to label or value in register.
     if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) {
       // Direct form: the target is either a global address or an external
       // symbol.
       MachineInstrBuilder MIB =
         BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi)
                                        ? X86::TAILJMPd : X86::TAILJMPd64));
       if (JumpTarget.isGlobal())
         MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
                              JumpTarget.getTargetFlags());
       else {
         assert(JumpTarget.isSymbol());
         MIB.addExternalSymbol(JumpTarget.getSymbolName(),
                               JumpTarget.getTargetFlags());
       }
     } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) {
       // Memory form: forward the first five operands of the pseudo.
       MachineInstrBuilder MIB =
         BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi)
                                        ? X86::TAILJMPm : X86::TAILJMPm64));
       for (unsigned i = 0; i != 5; ++i)
         MIB.addOperand(MBBI->getOperand(i));
     } else if (RetOpcode == X86::TCRETURNri64) {
       BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)).
         addReg(JumpTarget.getReg(), RegState::Kill);
     } else {
       BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)).
         addReg(JumpTarget.getReg(), RegState::Kill);
     }
 
     // The TAILJMP just built sits immediately before the pseudo; carry the
     // pseudo's implicit operands over to it.
     MachineInstr *NewMI = std::prev(MBBI);
     NewMI->copyImplicitOps(MF, MBBI);
 
     // Delete the pseudo instruction TCRETURN.
     MBB.erase(MBBI);
   } else if ((RetOpcode == X86::RETQ || RetOpcode == X86::RETL ||
               RetOpcode == X86::RETIQ || RetOpcode == X86::RETIL) &&
              (X86FI->getTCReturnAddrDelta() < 0)) {
     // Add the return addr area delta back since we are not tail calling.
     int delta = -1*X86FI->getTCReturnAddrDelta();
     MBBI = MBB.getLastNonDebugInstr();
 
     // Check for possible merge with preceding ADD instruction.
     delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
     emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII,
                  *RegInfo);
   }
 }
 
 int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
                                           int FI) const {
   const X86RegisterInfo *RegInfo =
       static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
   uint64_t StackSize = MFI->getStackSize();
 
   if (RegInfo->hasBasePointer(MF)) {
     assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!");
     if (FI < 0) {
       // Skip the saved EBP.
       return Offset + RegInfo->getSlotSize();
     } else {
       assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
       return Offset + StackSize;
     }
   } else if (RegInfo->needsStackRealignment(MF)) {
     if (FI < 0) {
       // Skip the saved EBP.
       return Offset + RegInfo->getSlotSize();
     } else {
       assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0);
       return Offset + StackSize;
     }
     // FIXME: Support tail calls
   } else {
     if (!hasFP(MF))
       return Offset + StackSize;
 
     // Skip the saved EBP.
     Offset += RegInfo->getSlotSize();
 
     // Skip the RETADDR move area
     const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
     int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
     if (TailCallReturnAddrDelta < 0)
       Offset -= TailCallReturnAddrDelta;
   }
 
   return Offset;
 }
 
 int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                              unsigned &FrameReg) const {
   const X86RegisterInfo *RegInfo =
       static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   // We can't calculate offset from frame pointer if the stack is realigned,
   // so enforce usage of stack/base pointer.  The base pointer is used when we
   // have dynamic allocas in addition to dynamic realignment.
   if (RegInfo->hasBasePointer(MF))
     FrameReg = RegInfo->getBaseRegister();
   else if (RegInfo->needsStackRealignment(MF))
     FrameReg = RegInfo->getStackRegister();
   else
     FrameReg = RegInfo->getFrameRegister(MF);
   return getFrameIndexOffset(MF, FI);
 }
 
 // Simplified from getFrameIndexOffset keeping only StackPointer cases
 int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   // Does not include any dynamic realign.
   const uint64_t StackSize = MFI->getStackSize();
   {
 #ifndef NDEBUG
     const X86RegisterInfo *RegInfo =
       static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo());
     // Note: LLVM arranges the stack as:
     // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP)
     //      > "Stack Slots" (<--SP)
     // We can always address StackSlots from RSP.  We can usually (unless
     // needsStackRealignment) address CSRs from RSP, but sometimes need to
     // address them from RBP.  FixedObjects can be placed anywhere in the stack
     // frame depending on their specific requirements (i.e. we can actually
     // refer to arguments to the function which are stored in the *callers*
     // frame).  As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs
     // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject.
 
     assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case");
 
     // We don't handle tail calls, and shouldn't be seeing them
     // either.
     int TailCallReturnAddrDelta =
         MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta();
     assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!");
 #endif
   }
 
   // This is how the math works out:
   //
   //  %rsp grows (i.e. gets lower) left to right. Each box below is
   //  one word (eight bytes).  Obj0 is the stack slot we're trying to
   //  get to.
   //
   //    ----------------------------------
   //    | BP | Obj0 | Obj1 | ... | ObjN |
   //    ----------------------------------
   //    ^    ^      ^                   ^
   //    A    B      C                   E
   //
   // A is the incoming stack pointer.
   // (B - A) is the local area offset (-8 for x86-64) [1]
   // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2]
   //
   // |(E - B)| is the StackSize (absolute value, positive).  For a
   // stack that grown down, this works out to be (B - E). [3]
   //
   // E is also the value of %rsp after stack has been set up, and we
   // want (C - E) -- the value we can add to %rsp to get to Obj0.  Now
   // (C - E) == (C - A) - (B - A) + (B - E)
   //            { Using [1], [2] and [3] above }
   //         == getObjectOffset - LocalAreaOffset + StackSize
   //
 
   // Get the Offset from the StackPointer
   int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea();
 
   return Offset + StackSize;
 }
 // Simplified from getFrameIndexReference keeping only StackPointer cases
 int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
                                                   unsigned &FrameReg) const {
   const X86RegisterInfo *RegInfo =
     static_cast<const X86RegisterInfo*>(MF.getSubtarget().getRegisterInfo());
 
   assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case");
 
   FrameReg = RegInfo->getStackRegister();
   return getFrameIndexOffsetFromSP(MF, FI);
 }
 
 /// Assign fixed spill slots to the callee-saved registers listed in \p CSI.
 ///
 /// Slots are handed out at decreasing offsets starting below the local area
 /// (adjusted by the tail-call return address delta).  When the function has a
 /// frame pointer, the first slot is reserved for it and the frame register is
 /// removed from \p CSI, since emitPrologue/emitEpilogue save and restore it
 /// themselves.  GPRs receive one SlotSize-sized slot each and their total
 /// size is recorded as the callee-saved frame size; all remaining registers
 /// (XMMs) are placed below the GPRs in slots sized and aligned according to
 /// their minimal register class.
 ///
 /// \param MF  Function whose frame is being laid out.
 /// \param TRI Register info used to detect overlap with the frame register.
 /// \param CSI Callee-saved register list; updated with the assigned frame
 ///            indices (and possibly shortened, see above).
 /// \returns true in all cases.
 bool X86FrameLowering::assignCalleeSavedSpillSlots(
     MachineFunction &MF, const TargetRegisterInfo *TRI,
     std::vector<CalleeSavedInfo> &CSI) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const X86RegisterInfo *RegInfo =
       static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   unsigned SlotSize = RegInfo->getSlotSize();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 
   unsigned CalleeSavedFrameSize = 0;
   int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
 
   if (hasFP(MF)) {
     // emitPrologue always spills frame register the first thing.
     SpillSlotOffset -= SlotSize;
     MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
 
     // Since emitPrologue and emitEpilogue will handle spilling and restoring of
     // the frame register, we can delete it from CSI list and not have to worry
     // about avoiding it later.
     unsigned FPReg = RegInfo->getFrameRegister(MF);
     for (unsigned i = 0; i < CSI.size(); ++i) {
       if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) {
         CSI.erase(CSI.begin() + i);
         break;
       }
     }
   }
 
   // Assign slots for GPRs. It increases frame size.
   // CSI is walked back to front, matching the order used by
   // spillCalleeSavedRegisters when it emits the pushes.
   for (unsigned i = CSI.size(); i != 0; --i) {
     unsigned Reg = CSI[i - 1].getReg();
 
     if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
       continue;
 
     SpillSlotOffset -= SlotSize;
     CalleeSavedFrameSize += SlotSize;
 
     int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
     CSI[i - 1].setFrameIdx(SlotIndex);
   }
 
   // Only the GPR slots count towards the callee-saved frame size.
   X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
 
   // Assign slots for XMMs.
   for (unsigned i = CSI.size(); i != 0; --i) {
     unsigned Reg = CSI[i - 1].getReg();
     if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
       continue;
 
     const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
     // Ensure alignment: round the offset down (towards lower addresses) to a
     // multiple of the class alignment.  For a negative offset that means
     // growing its magnitude by (alignment - misalignment); subtracting the
     // misalignment itself would only be correct when the offset is exactly
     // half an alignment unit off (e.g. -12 with alignment 16 must become -16,
     // not -24).
     const unsigned RCAlign = RC->getAlignment();
     const unsigned Misalign = std::abs(SpillSlotOffset) % RCAlign;
     if (Misalign != 0)
       SpillSlotOffset -= static_cast<int>(
           SpillSlotOffset < 0 ? RCAlign - Misalign : Misalign);
     // spill into slot
     SpillSlotOffset -= RC->getSize();
     int SlotIndex =
         MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset);
     CSI[i - 1].setFrameIdx(SlotIndex);
     MFI->ensureMaxAlignment(RC->getAlignment());
   }
 
   return true;
 }
 
 /// Emit the prologue spills for the callee-saved registers in \p CSI before
 /// \p MI in \p MBB.  GPRs are pushed; every other register (XMM) is stored to
 /// the frame slot previously assigned to it.  All emitted instructions are
 /// tagged FrameSetup.  Always returns true.
 bool X86FrameLowering::spillCalleeSavedRegisters(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
     const std::vector<CalleeSavedInfo> &CSI,
     const TargetRegisterInfo *TRI) const {
   DebugLoc DL = MBB.findDebugLoc(MI);
 
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 
   auto IsGPR = [](unsigned R) {
     return X86::GR64RegClass.contains(R) || X86::GR32RegClass.contains(R);
   };
 
   // First pass (last CSI entry first): push the GPRs.  Each push grows the
   // frame.
   const unsigned PushOpc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
   for (auto Info = CSI.rbegin(), End = CSI.rend(); Info != End; ++Info) {
     const unsigned Reg = Info->getReg();
     if (!IsGPR(Reg))
       continue;
 
     // The register is live on entry and dies at the push.
     MBB.addLiveIn(Reg);
     BuildMI(MBB, MI, DL, TII.get(PushOpc))
       .addReg(Reg, RegState::Kill)
       .setMIFlag(MachineInstr::FrameSetup);
   }
 
   // Second pass (same order): XMM registers.  X86 has no push/pop for them,
   // so they are stored into their stack slots instead.
   for (auto Info = CSI.rbegin(), End = CSI.rend(); Info != End; ++Info) {
     const unsigned Reg = Info->getReg();
     if (IsGPR(Reg))
       continue;
 
     // The register is live on entry and dies at the store.
     MBB.addLiveIn(Reg);
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
     TII.storeRegToStackSlot(MBB, MI, Reg, true, Info->getFrameIdx(), RC, TRI);
 
     // Flag the instruction immediately preceding MI as frame setup.
     std::prev(MI)->setFlag(MachineInstr::FrameSetup);
   }
 
   return true;
 }
 
 /// Emit the epilogue restores for the callee-saved registers in \p CSI before
 /// \p MI in \p MBB: XMM registers are reloaded from their frame slots first,
 /// then the GPRs are popped.  Returns false (emitting nothing) when \p CSI is
 /// empty, true otherwise.
 bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
                                                MachineBasicBlock::iterator MI,
                                         const std::vector<CalleeSavedInfo> &CSI,
                                           const TargetRegisterInfo *TRI) const {
   if (CSI.empty())
     return false;
 
   DebugLoc DL = MBB.findDebugLoc(MI);
 
   MachineFunction &MF = *MBB.getParent();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
 
   auto IsGPR = [](unsigned R) {
     return X86::GR64RegClass.contains(R) || X86::GR32RegClass.contains(R);
   };
 
   // Reload every non-GPR (XMM) register from its stack slot, in CSI order.
   for (const CalleeSavedInfo &Info : CSI) {
     const unsigned Reg = Info.getReg();
     if (IsGPR(Reg))
       continue;
 
     const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
     TII.loadRegFromStackSlot(MBB, MI, Reg, Info.getFrameIdx(), RC, TRI);
   }
 
   // Pop the GPRs, again in CSI order.
   const unsigned PopOpc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
   for (const CalleeSavedInfo &Info : CSI) {
     const unsigned Reg = Info.getReg();
     if (IsGPR(Reg))
       BuildMI(MBB, MI, DL, TII.get(PopOpc), Reg);
   }
 
   return true;
 }
 
 /// Hook run before the callee-saved register scan.  Reserves the fixed
 /// RETURNADDR area when the function has a negative tail-call return address
 /// delta, and marks the base register as used when the function needs a base
 /// pointer.  \p RS is not used.
 void
 X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                                        RegScavenger *RS) const {
   const X86RegisterInfo *X86RI =
       static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo());
   const unsigned SlotSize = X86RI->getSlotSize();
 
   const int64_t TCDelta =
       MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta();
 
   // A negative delta calls for a RETURNADDR area, laid out as:
   //   arg
   //   arg
   //   RETADDR
   //   { ...
   //     RETADDR area
   //     ...
   //   }
   //   [EBP]
   if (TCDelta < 0) {
     MachineFrameInfo *FrameInfo = MF.getFrameInfo();
     FrameInfo->CreateFixedObject(-TCDelta, TCDelta - SlotSize, true);
   }
 
   // Spill the BasePtr if it's used.
   if (!X86RI->hasBasePointer(MF))
     return;
   MF.getRegInfo().setPhysRegUsed(X86RI->getBaseRegister());
 }
 
 static bool
 HasNestArgument(const MachineFunction *MF) {
   const Function *F = MF->getFunction();
   for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
        I != E; I++) {
     if (I->hasNestAttr())
       return true;
   }
   return false;
 }
 
 /// GetScratchRegister - Get a temp register for performing work in the
 /// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
 /// and the properties of the function either one or two registers will be
 /// needed. Set primary to true for the first register, false for the second.
 static unsigned
 GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) {
   CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
 
   // Erlang stuff.
   if (CallingConvention == CallingConv::HiPE) {
     if (Is64Bit)
       return Primary ? X86::R14 : X86::R13;
     else
       return Primary ? X86::EBX : X86::EDI;
   }
 
   if (Is64Bit) {
     if (IsLP64)
       return Primary ? X86::R11 : X86::R12;
     else
       return Primary ? X86::R11D : X86::R12D;
   }
 
   bool IsNested = HasNestArgument(&MF);
 
   if (CallingConvention == CallingConv::X86_FastCall ||
       CallingConvention == CallingConv::Fast) {
     if (IsNested)
       report_fatal_error("Segmented stacks does not support fastcall with "
                          "nested function.");
     return Primary ? X86::EAX : X86::ECX;
   }
   if (IsNested)
     return Primary ? X86::EDX : X86::EAX;
   return Primary ? X86::ECX : X86::EAX;
 }
 
 // The stack limit in the TCB is set to this many bytes above the actual stack
 // limit.  The segmented-stack prologue uses it as a threshold: frames smaller
 // than this compare the stack pointer itself against the limit instead of
 // first computing (stack pointer - frame size).
 static const uint64_t kSplitStackAvailable = 256;
 
 void
 X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   MachineBasicBlock &prologueMBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   uint64_t StackSize;
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool Is64Bit = STI.is64Bit();
   const bool IsLP64 = STI.isTarget64BitLP64();
   unsigned TlsReg, TlsOffset;
   DebugLoc DL;
 
   unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
   assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
          "Scratch register is live-in");
 
   if (MF.getFunction()->isVarArg())
     report_fatal_error("Segmented stacks do not support vararg functions.");
   if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
       !STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
       !STI.isTargetDragonFly())
     report_fatal_error("Segmented stacks not supported on this platform.");
 
   // Eventually StackSize will be calculated by a link-time pass; which will
   // also decide whether checking code needs to be injected into this particular
   // prologue.
   StackSize = MFI->getStackSize();
 
   // Do not generate a prologue for functions with a stack of size zero
   if (StackSize == 0)
     return;
 
   MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
   MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   bool IsNested = false;
 
   // We need to know if the function has a nest argument only in 64 bit mode.
   if (Is64Bit)
     IsNested = HasNestArgument(&MF);
 
   // The MOV R10, RAX needs to be in a different block, since the RET we emit in
   // allocMBB needs to be last (terminating) instruction.
 
   for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
          e = prologueMBB.livein_end(); i != e; i++) {
     allocMBB->addLiveIn(*i);
     checkMBB->addLiveIn(*i);
   }
 
   if (IsNested)
     allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);
 
   MF.push_front(allocMBB);
   MF.push_front(checkMBB);
 
   // When the frame size is less than 256 we just compare the stack
   // boundary directly to the value of the stack pointer, per gcc.
   bool CompareStackPointer = StackSize < kSplitStackAvailable;
 
   // Read the limit off the current stacklet off the stack_guard location.
   if (Is64Bit) {
     if (STI.isTargetLinux()) {
       TlsReg = X86::FS;
       TlsOffset = IsLP64 ? 0x70 : 0x40;
     } else if (STI.isTargetDarwin()) {
       TlsReg = X86::GS;
       TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
     } else if (STI.isTargetWin64()) {
       TlsReg = X86::GS;
       TlsOffset = 0x28; // pvArbitrary, reserved for application use
     } else if (STI.isTargetFreeBSD()) {
       TlsReg = X86::FS;
       TlsOffset = 0x18;
     } else if (STI.isTargetDragonFly()) {
       TlsReg = X86::FS;
       TlsOffset = 0x20; // use tls_tcb.tcb_segstack
     } else {
       report_fatal_error("Segmented stacks not supported on this platform.");
     }
 
     if (CompareStackPointer)
       ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
     else
       BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r), ScratchReg).addReg(X86::RSP)
         .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
 
     BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
       .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
   } else {
     if (STI.isTargetLinux()) {
       TlsReg = X86::GS;
       TlsOffset = 0x30;
     } else if (STI.isTargetDarwin()) {
       TlsReg = X86::GS;
       TlsOffset = 0x48 + 90*4;
     } else if (STI.isTargetWin32()) {
       TlsReg = X86::FS;
       TlsOffset = 0x14; // pvArbitrary, reserved for application use
     } else if (STI.isTargetDragonFly()) {
       TlsReg = X86::FS;
       TlsOffset = 0x10; // use tls_tcb.tcb_segstack
     } else if (STI.isTargetFreeBSD()) {
       report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
     } else {
       report_fatal_error("Segmented stacks not supported on this platform.");
     }
 
     if (CompareStackPointer)
       ScratchReg = X86::ESP;
     else
       BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
         .addImm(1).addReg(0).addImm(-StackSize).addReg(0);
 
     if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
         STI.isTargetDragonFly()) {
       BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
         .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
     } else if (STI.isTargetDarwin()) {
 
       // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
       unsigned ScratchReg2;
       bool SaveScratch2;
       if (CompareStackPointer) {
         // The primary scratch register is available for holding the TLS offset.
         ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
         SaveScratch2 = false;
       } else {
         // Need to use a second register to hold the TLS offset
         ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);
 
         // Unfortunately, with fastcc the second scratch register may hold an
         // argument.
         SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
       }
 
       // If Scratch2 is live-in then it needs to be saved.
       assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
              "Scratch register is live-in and not saved");
 
       if (SaveScratch2)
         BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
           .addReg(ScratchReg2, RegState::Kill);
 
       BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
         .addImm(TlsOffset);
       BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
         .addReg(ScratchReg)
         .addReg(ScratchReg2).addImm(1).addReg(0)
         .addImm(0)
         .addReg(TlsReg);
 
       if (SaveScratch2)
         BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
     }
   }
 
   // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
   // It jumps to normal execution of the function body.
   BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&prologueMBB);
 
   // On 32 bit we first push the arguments size and then the frame size. On 64
   // bit, we pass the stack frame size in r10 and the argument size in r11.
   if (Is64Bit) {
     // Functions with nested arguments use R10, so it needs to be saved across
     // the call to _morestack
 
     const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
     const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
     const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
     const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
     const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;
 
     if (IsNested)
       BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);
 
     BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
       .addImm(StackSize);
     BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
       .addImm(X86FI->getArgumentStackSize());
     MF.getRegInfo().setPhysRegUsed(Reg10);
     MF.getRegInfo().setPhysRegUsed(Reg11);
   } else {
     BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
       .addImm(X86FI->getArgumentStackSize());
     BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
       .addImm(StackSize);
   }
 
   // __morestack is in libgcc
   if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
     // Under the large code model, we cannot assume that __morestack lives
     // within 2^31 bytes of the call site, so we cannot use pc-relative
     // addressing. We cannot perform the call via a temporary register,
     // as the rax register may be used to store the static chain, and all
     // other suitable registers may be either callee-save or used for
     // parameter passing. We cannot use the stack at this point either
     // because __morestack manipulates the stack directly.
     //
     // To avoid these issues, perform an indirect call via a read-only memory
     // location containing the address.
     //
     // This solution is not perfect, as it assumes that the .rodata section
     // is laid out within 2^31 bytes of each function body, but this seems
     // to be sufficient for JIT.
     BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
         .addReg(X86::RIP)
         .addImm(0)
         .addReg(0)
         .addExternalSymbol("__morestack_addr")
         .addReg(0);
     MF.getMMI().setUsesMorestackAddr(true);
   } else {
     if (Is64Bit)
       BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
         .addExternalSymbol("__morestack");
     else
       BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
         .addExternalSymbol("__morestack");
   }
 
   if (IsNested)
     BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
   else
     BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));
 
   allocMBB->addSuccessor(&prologueMBB);
 
   checkMBB->addSuccessor(allocMBB);
   checkMBB->addSuccessor(&prologueMBB);
 
 #ifdef XDEBUG
   MF.verify();
 #endif
 }
 
 /// Erlang programs may need a special prologue to handle the stack size they
 /// might need at runtime. That is because Erlang/OTP does not implement a C
 /// stack but uses a custom implementation of hybrid stack/heap architecture.
 /// (for more information see Eric Stenman's Ph.D. thesis:
 /// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
 ///
 /// CheckStack:
 ///       temp0 = sp - MaxStack
 ///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
 /// OldStart:
 ///       ...
 /// IncStack:
 ///       call inc_stack   # doubles the stack space
 ///       temp0 = sp - MaxStack
 ///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
 void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const unsigned SlotSize =
       static_cast<const X86RegisterInfo *>(MF.getSubtarget().getRegisterInfo())
           ->getSlotSize();
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   const bool Is64Bit = STI.is64Bit();
   const bool IsLP64 = STI.isTarget64BitLP64();
   DebugLoc DL;
   // HiPE-specific values
   const unsigned HipeLeafWords = 24;
   const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
   const unsigned Guaranteed = HipeLeafWords * SlotSize;
   unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
                             MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
   unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize;
 
   assert(STI.isTargetLinux() &&
          "HiPE prologue is only supported on Linux operating systems.");
 
   // Compute the largest caller's frame that is needed to fit the callees'
   // frames. This 'MaxStack' is computed from:
   //
   // a) the fixed frame size, which is the space needed for all spilled temps,
   // b) outgoing on-stack parameter areas, and
   // c) the minimum stack space this function needs to make available for the
   //    functions it calls (a tunable ABI property).
   if (MFI->hasCalls()) {
     unsigned MoreStackForCalls = 0;
 
     for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
          MBBI != MBBE; ++MBBI)
       for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end();
            MI != ME; ++MI) {
         if (!MI->isCall())
           continue;
 
         // Get callee operand.
         const MachineOperand &MO = MI->getOperand(0);
 
         // Only take account of global function calls (no closures etc.).
         if (!MO.isGlobal())
           continue;
 
         const Function *F = dyn_cast<Function>(MO.getGlobal());
         if (!F)
           continue;
 
         // Do not update 'MaxStack' for primitive and built-in functions
         // (encoded with names either starting with "erlang."/"bif_" or not
         // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
         // "_", such as the BIF "suspend_0") as they are executed on another
         // stack.
         if (F->getName().find("erlang.") != StringRef::npos ||
             F->getName().find("bif_") != StringRef::npos ||
             F->getName().find_first_of("._") == StringRef::npos)
           continue;
 
         unsigned CalleeStkArity =
           F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
         if (HipeLeafWords - 1 > CalleeStkArity)
           MoreStackForCalls = std::max(MoreStackForCalls,
                                (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
       }
     MaxStack += MoreStackForCalls;
   }
 
   // If the stack frame needed is larger than the guaranteed then runtime checks
   // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue.
   if (MaxStack > Guaranteed) {
     MachineBasicBlock &prologueMBB = MF.front();
     MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
     MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
 
     for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(),
            E = prologueMBB.livein_end(); I != E; I++) {
       stackCheckMBB->addLiveIn(*I);
       incStackMBB->addLiveIn(*I);
     }
 
     MF.push_front(incStackMBB);
     MF.push_front(stackCheckMBB);
 
     unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
     unsigned LEAop, CMPop, CALLop;
     if (Is64Bit) {
       SPReg = X86::RSP;
       PReg  = X86::RBP;
       LEAop = X86::LEA64r;
       CMPop = X86::CMP64rm;
       CALLop = X86::CALL64pcrel32;
       SPLimitOffset = 0x90;
     } else {
       SPReg = X86::ESP;
       PReg  = X86::EBP;
       LEAop = X86::LEA32r;
       CMPop = X86::CMP32rm;
       CALLop = X86::CALLpcrel32;
       SPLimitOffset = 0x4c;
     }
 
     ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
     assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
            "HiPE prologue scratch register is live-in");
 
     // Create new MBB for StackCheck:
     addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
                  SPReg, false, -MaxStack);
     // SPLimitOffset is in a fixed heap location (pointed by BP).
     addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
                  .addReg(ScratchReg), PReg, false, SPLimitOffset);
     BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&prologueMBB);
 
     // Create new MBB for IncStack:
     BuildMI(incStackMBB, DL, TII.get(CALLop)).
       addExternalSymbol("inc_stack_0");
     addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
                  SPReg, false, -MaxStack);
     addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
                  .addReg(ScratchReg), PReg, false, SPLimitOffset);
     BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB);
 
     stackCheckMBB->addSuccessor(&prologueMBB, 99);
     stackCheckMBB->addSuccessor(incStackMBB, 1);
     incStackMBB->addSuccessor(&prologueMBB, 99);
     incStackMBB->addSuccessor(incStackMBB, 1);
   }
 #ifdef XDEBUG
   MF.verify();
 #endif
 }
 
-bool X86FrameLowering::
-convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB,
-                       MachineBasicBlock::iterator I, uint64_t Amount) const {
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
-  const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
-    MF.getSubtarget().getRegisterInfo());
-  unsigned StackPtr = RegInfo.getStackRegister();
-
-  // Scan the call setup sequence for the pattern we're looking for.
-  // We only handle a simple case now - a sequence of MOV32mi or MOV32mr
-  // instructions, that push a sequence of 32-bit values onto the stack, with
-  // no gaps.  
-  std::map<int64_t, MachineBasicBlock::iterator> MovMap;
-  do {
-    int Opcode = I->getOpcode();
-    if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr)
-      break;
- 
-    // We only want movs of the form:
-    // movl imm/r32, k(%ecx)
-    // If we run into something else, bail
-    // Note that AddrBaseReg may, counterintuitively, not be a register...
-    if (!I->getOperand(X86::AddrBaseReg).isReg() || 
-        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
-        !I->getOperand(X86::AddrScaleAmt).isImm() ||
-        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
-        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
-        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) ||
-        !I->getOperand(X86::AddrDisp).isImm())
-      return false;
-
-    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
-    
-    // We don't want to consider the unaligned case.
-    if (StackDisp % 4)
-      return false;
-
-    // If the same stack slot is being filled twice, something's fishy.
-    if (!MovMap.insert(std::pair<int64_t, MachineInstr*>(StackDisp, I)).second)
-      return false;
-
-    ++I;
-  } while (I != MBB.end());
-
-  // We now expect the end of the sequence - a call and a stack adjust.
-  if (I == MBB.end())
-    return false;
-  if (!I->isCall())
-    return false;
-  MachineBasicBlock::iterator Call = I;
-  if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode())
-    return false;
-
-  // Now, go through the map, and see that we don't have any gaps,
-  // but only a series of 32-bit MOVs.
-  // Since std::map provides ordered iteration, the original order
-  // of the MOVs doesn't matter.
-  int64_t ExpectedDist = 0;
-  for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME; 
-       ++MMI, ExpectedDist += 4)
-    if (MMI->first != ExpectedDist)
-      return false;
-
-  // Ok, everything looks fine. Do the transformation.
-  DebugLoc DL = I->getDebugLoc();
-
-  // It's possible the original stack adjustment amount was larger than
-  // that done by the pushes. If so, we still need a SUB.
-  Amount -= ExpectedDist;
-  if (Amount) {
-    MachineInstr* Sub = BuildMI(MBB, Call, DL,
-                          TII.get(getSUBriOpcode(false, Amount)), StackPtr)
-                  .addReg(StackPtr).addImm(Amount);
-    Sub->getOperand(3).setIsDead();
-  }
-
-  // Now, iterate through the map in reverse order, and replace the movs
-  // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses.
-  for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) {
-    MachineBasicBlock::iterator MOV = MMI->second;
-    MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
-
-    // Replace MOVmr with PUSH32r, and MOVmi with PUSHi of appropriate size
-    int PushOpcode = X86::PUSH32r;
-    if (MOV->getOpcode() == X86::MOV32mi)
-      PushOpcode = getPUSHiOpcode(false, PushOp);
-
-    BuildMI(MBB, Call, DL, TII.get(PushOpcode)).addOperand(PushOp);
-    MBB.erase(MOV);
-  }
-
-  return true;
-}
-
 void X86FrameLowering::
 eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
   const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
                                        MF.getSubtarget().getRegisterInfo());
   unsigned StackPtr = RegInfo.getStackRegister();
   bool reserveCallFrame = hasReservedCallFrame(MF);
   int Opcode = I->getOpcode();
   bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
   const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
   bool IsLP64 = STI.isTarget64BitLP64();
   DebugLoc DL = I->getDebugLoc();
   uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
-  uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
+  uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0;
   I = MBB.erase(I);
 
   if (!reserveCallFrame) {
     // If the stack pointer can be changed after prologue, turn the
     // adjcallstackup instruction into a 'sub ESP, <amt>' and the
     // adjcallstackdown instruction into 'add ESP, <amt>'
     if (Amount == 0)
       return;
 
     // We need to keep the stack aligned properly.  To do this, we round the
     // amount of space needed for the outgoing arguments up to the next
     // alignment boundary.
     unsigned StackAlign = MF.getTarget()
                               .getSubtargetImpl()
                               ->getFrameLowering()
                               ->getStackAlignment();
     Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
 
     MachineInstr *New = nullptr;
-    if (Opcode == TII.getCallFrameSetupOpcode()) {
-      // Try to convert movs to the stack into pushes.
-      // We currently only look for a pattern that appears in 32-bit
-      // calling conventions.
-      if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount))
-        return;
-
-      New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
-                    StackPtr)
-        .addReg(StackPtr)
-        .addImm(Amount);
-    } else {
-      assert(Opcode == TII.getCallFrameDestroyOpcode());
 
-      // Factor out the amount the callee already popped.
-      Amount -= CalleeAmt;
+    // Factor out the amount that gets handled inside the call sequence
+    // (pushes of arguments for frame setup, callee pops for frame destroy).
+    Amount -= InternalAmt;
+
+    if (Amount) {
+      if (Opcode == TII.getCallFrameSetupOpcode()) {
+        New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr)
+          .addReg(StackPtr).addImm(Amount);
+      } else {
+        assert(Opcode == TII.getCallFrameDestroyOpcode());
 
-      if (Amount) {
         unsigned Opc = getADDriOpcode(IsLP64, Amount);
         New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
           .addReg(StackPtr).addImm(Amount);
       }
     }
 
     if (New) {
       // The EFLAGS implicit def is dead.
       New->getOperand(3).setIsDead();
 
       // Replace the pseudo instruction with a new instruction.
       MBB.insert(I, New);
     }
 
     return;
   }
 
-  if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
+  if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) {
     // If we are performing frame pointer elimination and if the callee pops
     // something off the stack pointer, add it back.  We do this until we have
     // more advanced stack pointer tracking ability.
-    unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
+    unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt);
     MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
-      .addReg(StackPtr).addImm(CalleeAmt);
+      .addReg(StackPtr).addImm(InternalAmt);
 
     // The EFLAGS implicit def is dead.
     New->getOperand(3).setIsDead();
 
     // We are not tracking the stack pointer adjustment by the callee, so make
     // sure we restore the stack pointer immediately after the call, there may
     // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
     MachineBasicBlock::iterator B = MBB.begin();
     while (I != B && !std::prev(I)->isCall())
       --I;
     MBB.insert(I, New);
   }
 }
 
diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
index ee0ee227cad8..9cb887ac112d 100644
--- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h
@@ -1,93 +1,95 @@
 //===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This class implements X86-specific bits of TargetFrameLowering class.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
 #define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
 
 #include "llvm/Target/TargetFrameLowering.h"
 
 namespace llvm {
 
 class MCSymbol;
 class X86TargetMachine;
 class X86Subtarget;
 
 class X86FrameLowering : public TargetFrameLowering {
 public:
   explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO)
     : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {}
 
   static void getStackProbeFunction(const X86Subtarget &STI,
                                     unsigned &CallOp,
                                     const char *&Symbol);
 
   void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI,
                                  DebugLoc DL) const;
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
   void emitPrologue(MachineFunction &MF) const override;
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   void adjustForSegmentedStacks(MachineFunction &MF) const override;
 
   void adjustForHiPEPrologue(MachineFunction &MF) const override;
 
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                      RegScavenger *RS = nullptr) const override;
 
   bool
   assignCalleeSavedSpillSlots(MachineFunction &MF,
                               const TargetRegisterInfo *TRI,
                               std::vector<CalleeSavedInfo> &CSI) const override;
 
   bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI,
                                  const std::vector<CalleeSavedInfo> &CSI,
                                  const TargetRegisterInfo *TRI) const override;
 
   bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   const std::vector<CalleeSavedInfo> &CSI,
                                   const TargetRegisterInfo *TRI) const override;
 
   bool hasFP(const MachineFunction &MF) const override;
   bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
+  bool needsFrameIndexResolution(const MachineFunction &MF) const override;
 
   int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
   int getFrameIndexReference(const MachineFunction &MF, int FI,
                              unsigned &FrameReg) const override;
 
   int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const;
   int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI,
                                    unsigned &FrameReg) const override;
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                  MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI) const override;
 
 private:
   /// convertArgMovsToPushes - This method tries to convert a call sequence
   /// that uses sub and mov instructions to put the argument onto the stack
   /// into a series of pushes.
   /// Returns true if the transformation succeeded, false if not.
   bool convertArgMovsToPushes(MachineFunction &MF, 
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I, 
                               uint64_t Amount) const;
 };
 
 } // End llvm namespace
 
 #endif
diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
index ed0a6346929b..880c982982a1 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1,1848 +1,1852 @@
 //===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file describes the various pseudo instructions used by the compiler,
 // as well as Pat patterns used during instruction selection.
 //
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
 // Pattern Matching Support
 
 def GetLo32XForm : SDNodeXForm<imm, [{
   // Transformation function: get the low 32 bits.
   return getI32Imm((unsigned)N->getZExtValue());
 }]>;
 
 def GetLo8XForm : SDNodeXForm<imm, [{
   // Transformation function: get the low 8 bits.
   return getI8Imm((uint8_t)N->getZExtValue());
 }]>;
 
 
 //===----------------------------------------------------------------------===//
 // Random Pseudo Instructions.
 
 // PIC base construction.  This expands to code that looks like this:
 //     call  $next_inst
 //     popl %destreg"
 let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
   def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
                       "", []>;
 
 
 // ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
 // a stack adjustment and the codegen must know that they may modify the stack
 // pointer before prolog-epilog rewriting occurs.
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber EFLAGS.
 let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKDOWN",
-                           [(X86callseq_start timm:$amt)]>,
+                           []>,
                           Requires<[NotLP64]>;
 def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKUP",
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
                           Requires<[NotLP64]>;
 }
+def : Pat<(X86callseq_start timm:$amt1),
+          (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+
 
 // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
 // a stack adjustment and the codegen must know that they may modify the stack
 // pointer before prolog-epilog rewriting occurs.
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber EFLAGS.
 let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKDOWN",
-                           [(X86callseq_start timm:$amt)]>,
+                           []>,
                           Requires<[IsLP64]>;
 def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            "#ADJCALLSTACKUP",
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
                           Requires<[IsLP64]>;
 }
-
+def : Pat<(X86callseq_start timm:$amt1),
+          (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
 
 
 // x86-64 va_start lowering magic.
 let usesCustomInserter = 1, Defs = [EFLAGS] in {
 def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
                               (outs),
                               (ins GR8:$al,
                                    i64imm:$regsavefi, i64imm:$offset,
                                    variable_ops),
                               "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
                               [(X86vastart_save_xmm_regs GR8:$al,
                                                          imm:$regsavefi,
                                                          imm:$offset),
                                (implicit EFLAGS)]>;
 
 // The VAARG_64 pseudo-instruction takes the address of the va_list,
 // and places the address of the next argument into a register.
 let Defs = [EFLAGS] in
 def VAARG_64 : I<0, Pseudo,
                  (outs GR64:$dst),
                  (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align),
                  "#VAARG_64 $dst, $ap, $size, $mode, $align",
                  [(set GR64:$dst,
                     (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)),
                   (implicit EFLAGS)]>;
 
 // Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
 // targets.  These calls are needed to probe the stack when allocating more than
 // 4k bytes in one go. Touching the stack at 4K increments is necessary to
 // ensure that the guard pages used by the OS virtual memory manager are
 // allocated in correct sequence.
 // The main point of having separate instruction are extra unmodelled effects
 // (compared to ordinary calls) like stack pointer change.
 
 let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
   def WIN_ALLOCA : I<0, Pseudo, (outs), (ins),
                      "# dynamic stack allocation",
                      [(X86WinAlloca)]>;
 
 // When using segmented stacks these are lowered into instructions which first
 // check if the current stacklet has enough free memory. If it does, memory is
 // allocated by bumping the stack pointer. Otherwise memory is allocated from
 // the heap.
 
 let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
 def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
                       "# variable sized alloca for segmented stacks",
                       [(set GR32:$dst,
                          (X86SegAlloca GR32:$size))]>,
                     Requires<[NotLP64]>;
 
 let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
 def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
                       "# variable sized alloca for segmented stacks",
                       [(set GR64:$dst,
                          (X86SegAlloca GR64:$size))]>,
                     Requires<[In64BitMode]>;
 }
 
 // The MSVC runtime contains an _ftol2 routine for converting floating-point
 // to integer values. It has a strange calling convention: the input is
 // popped from the x87 stack, and the return value is given in EDX:EAX. ECX is
 // used as a temporary register. No other registers (aside from flags) are
 // touched.
 // Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
 // variant is unnecessary.
 
 let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in {
   def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
                       "# win32 fptoui",
                       [(X86WinFTOL RFP32:$src)]>,
                     Requires<[Not64BitMode]>;
 
   def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src),
                       "# win32 fptoui",
                       [(X86WinFTOL RFP64:$src)]>,
                     Requires<[Not64BitMode]>;
 }
 
 //===----------------------------------------------------------------------===//
 // EH Pseudo Instructions
 //
 let SchedRW = [WriteSystem] in {
 let isTerminator = 1, isReturn = 1, isBarrier = 1,
     hasCtrlDep = 1, isCodeGenOnly = 1 in {
 def EH_RETURN   : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
                     "ret\t#eh_return, addr: $addr",
                     [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
 
 }
 
 let isTerminator = 1, isReturn = 1, isBarrier = 1,
     hasCtrlDep = 1, isCodeGenOnly = 1 in {
 def EH_RETURN64   : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
                      "ret\t#eh_return, addr: $addr",
                      [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>;
 
 }
 
 let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
     usesCustomInserter = 1 in {
   def EH_SjLj_SetJmp32  : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
                             "#EH_SJLJ_SETJMP32",
                             [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
                           Requires<[Not64BitMode]>;
   def EH_SjLj_SetJmp64  : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
                             "#EH_SJLJ_SETJMP64",
                             [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
                           Requires<[In64BitMode]>;
   let isTerminator = 1 in {
   def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
                             "#EH_SJLJ_LONGJMP32",
                             [(X86eh_sjlj_longjmp addr:$buf)]>,
                           Requires<[Not64BitMode]>;
   def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
                             "#EH_SJLJ_LONGJMP64",
                             [(X86eh_sjlj_longjmp addr:$buf)]>,
                           Requires<[In64BitMode]>;
   }
 }
 } // SchedRW
 
 let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
   def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
                         "#EH_SjLj_Setup\t$dst", []>;
 }
 
 //===----------------------------------------------------------------------===//
 // Pseudo instructions used by unwind info.
 //
 let isPseudo = 1 in {
   def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
                             "#SEH_PushReg $reg", []>;
   def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
                             "#SEH_SaveReg $reg, $dst", []>;
   def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
                             "#SEH_SaveXMM $reg, $dst", []>;
   def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size),
                             "#SEH_StackAlloc $size", []>;
   def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset),
                             "#SEH_SetFrame $reg, $offset", []>;
   def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode),
                             "#SEH_PushFrame $mode", []>;
   def SEH_EndPrologue : I<0, Pseudo, (outs), (ins),
                             "#SEH_EndPrologue", []>;
   def SEH_Epilogue : I<0, Pseudo, (outs), (ins),
                             "#SEH_Epilogue", []>;
 }
 
 //===----------------------------------------------------------------------===//
 // Pseudo instructions used by segmented stacks.
 //
 
 // This is lowered into a RET instruction by MCInstLower.  We need
 // this so that we don't have to have a MachineBasicBlock which ends
 // with a RET and also has successors.
 // Both defs take no operands, print nothing (empty asm string) and have no
 // selection pattern.
 let isPseudo = 1 in {
 def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
                           "", []>;
 
 // This instruction is lowered to a RET followed by a MOV.  The two
 // instructions are not generated on a higher level since then the
 // verifier sees a MachineBasicBlock ending with a non-terminator.
 def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
                                   "", []>;
 }
 
 //===----------------------------------------------------------------------===//
 // Alias Instructions
 //===----------------------------------------------------------------------===//
 
 // Alias instruction mapping movr0 to xor.
 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 // MOV32r0: pseudo that selects the i32 constant 0 into a GR32.  It is
 // declared as defining EFLAGS, and is marked rematerializable and as cheap
 // as a move.
 let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
     isPseudo = 1 in
 def MOV32r0  : I<0, Pseudo, (outs GR32:$dst), (ins), "",
                  [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
 
 // Other widths can also make use of the 32-bit xor, which may have a smaller
 // encoding and avoid partial register updates.
 // i8/i16 zero: take the low sub-register of a MOV32r0 result.
 def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
 def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
 // i64 zero: place a MOV32r0 result in the low 32 bits via SUBREG_TO_REG.
 // The raised AddedComplexity presumably lets this win over other i64
 // immediate patterns -- confirm which ones it competes with.
 def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
   let AddedComplexity = 20;
 }
 
 // Materialize i64 constant where top 32-bits are zero. This could theoretically
 // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
 // that would make it more difficult to rematerialize.
 // MOV32ri64: writes a GR32 from an i64i32imm operand using opcode 0xb8
 // (AddRegFrm).  It has no pattern of its own and an empty asm string; the
 // mov64imm32 Pat that follows wraps it in SUBREG_TO_REG to produce an i64.
 let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
     isCodeGenOnly = 1, hasSideEffects = 0 in
 def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src),
                      "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>;
 
 // This 64-bit pseudo-move can be used for both a 64-bit constant that is
 // actually the zero-extension of a 32-bit constant, and for labels in the
 // x86-64 small code model.
 // ComplexPattern with one result operand, matched by the C++ routine
 // SelectMOV64Imm32; it may root at either an imm or an X86Wrapper node.
 def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>;
 
 // Materialize the matched value with the 32-bit move and mark the result
 // as the low half of a 64-bit register.
 let AddedComplexity = 1 in
 def : Pat<(i64 mov64imm32:$src),
           (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>;
 
 // Use sbb to materialize carry bit.
 // SETB_C{8,16,32,64}r: operand-less pseudos that produce the value of
 // (X86setcc_c X86_COND_B, EFLAGS) in a register of the given width.  They
 // both read and clobber EFLAGS.
 let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
 // FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
 // However, Pat<> can't replicate the destination reg into the inputs of the
 // result.
 def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
                  [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
 def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
                  [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
 def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
                  [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
 def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
                  [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
 } // Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU]
 
 
 // Any-extension of the i8 carry value to a wider type is selected directly
 // as the wider SETB_C pseudo.
 def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
           (SETB_C16r)>;
 def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
           (SETB_C32r)>;
 def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
           (SETB_C64r)>;
 
 // Sign-extension maps to the same pseudos.
 // NOTE(review): this relies on X86setcc_c yielding 0 or all-ones so that the
 // wider form equals the sign-extended narrow form -- confirm.
 def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
           (SETB_C16r)>;
 def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
           (SETB_C32r)>;
 def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
           (SETB_C64r)>;
 
 // We canonicalize 'setb' to "(and (sbb reg,reg), 1)" in the hope that the
 // 'and' will be eliminated and that the sbb can be extended up to a wider
 // type.  When this happens, it is great.  However, if we are left with an
 // 8-bit sbb and an 'and', we might as well just match it as a setb.
 def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
           (SETBr)>;
 
 // (add OP, SETB) -> (adc OP, 0)
 // Adding the carry bit, masked to 0/1, to OP becomes an add-with-carry of
 // an immediate zero.  Only 8-, 32- and 64-bit forms are provided here.
 def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op),
           (ADC8ri GR8:$op, 0)>;
 def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op),
           (ADC32ri8 GR32:$op, 0)>;
 def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op),
           (ADC64ri8 GR64:$op, 0)>;
 
 // (sub OP, SETB) -> (sbb OP, 0)
 // Subtracting the carry bit, masked to 0/1, from OP becomes a
 // subtract-with-borrow of an immediate zero.  Only 8-, 32- and 64-bit forms
 // are provided here.
 def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
           (SBB8ri GR8:$op, 0)>;
 def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
           (SBB32ri8 GR32:$op, 0)>;
 def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)),
           (SBB64ri8 GR64:$op, 0)>;
 
 // (sub OP, SETCC_CARRY) -> (adc OP, 0)
 // Here the carry value is used unmasked (no 'and ..., 1').
 // NOTE(review): the rewrite is valid if the unmasked value is 0 or -1, since
 // subtracting -1 equals adding 1 -- confirm X86setcc_c's result range.
 def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))),
           (ADC8ri GR8:$op, 0)>;
 def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))),
           (ADC32ri8 GR32:$op, 0)>;
 def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))),
           (ADC64ri8 GR64:$op, 0)>;
 
 //===----------------------------------------------------------------------===//
 // String Pseudo Instructions
 //
 // REP MOVS / REP STOS definitions used by code generation.  None of them has
 // explicit operands: every register involved is listed in Defs/Uses.  Each
 // def matches X86rep_movs / X86rep_stos for one element type, and the
 // _32/_64 variants are selected by the Not64BitMode / In64BitMode
 // predicates.
 let SchedRW = [WriteMicrocoded] in {
 // 32-bit-mode rep movs: ECX, EDI and ESI are both implicit inputs and
 // implicit outputs.
 let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
 def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
                     [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
                    Requires<[Not64BitMode]>;
 def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
                     [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
                    Requires<[Not64BitMode]>;
 def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
                     [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
                    Requires<[Not64BitMode]>;
 }
 
 // 64-bit-mode rep movs: same shape with RCX, RDI and RSI, plus a quadword
 // form defined through RI.
 let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in {
 def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
                     [(X86rep_movs i8)], IIC_REP_MOVS>, REP,
                    Requires<[In64BitMode]>;
 def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
                     [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16,
                    Requires<[In64BitMode]>;
 def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
                     [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32,
                    Requires<[In64BitMode]>;
 def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
                     [(X86rep_movs i64)], IIC_REP_MOVS>, REP,
                    Requires<[In64BitMode]>;
 }
 
 // 32-bit-mode rep stos: ECX and EDI are written; the Uses list of each def
 // adds the accumulator sub-register matching the element size.
 // FIXME: Should use "(X86rep_stos AL)" as the pattern.
 let Defs = [ECX,EDI], isCodeGenOnly = 1 in {
   let Uses = [AL,ECX,EDI] in
   def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
                       [(X86rep_stos i8)], IIC_REP_STOS>, REP,
                      Requires<[Not64BitMode]>;
   let Uses = [AX,ECX,EDI] in
   def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
                       [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
                      Requires<[Not64BitMode]>;
   let Uses = [EAX,ECX,EDI] in
   def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
                       [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
                      Requires<[Not64BitMode]>;
 }
 
 // 64-bit-mode rep stos: RCX and RDI are written.
 let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
   let Uses = [AL,RCX,RDI] in
   def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
                       [(X86rep_stos i8)], IIC_REP_STOS>, REP,
                      Requires<[In64BitMode]>;
   let Uses = [AX,RCX,RDI] in
   def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
                       [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16,
                      Requires<[In64BitMode]>;
   // NOTE(review): this 32-bit-element form lists RAX, whereas REP_STOSD_32
   // lists EAX -- confirm the wider use is intended.
   let Uses = [RAX,RCX,RDI] in
   def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
                       [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32,
                      Requires<[In64BitMode]>;
 
   let Uses = [RAX,RCX,RDI] in
   def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
                       [(X86rep_stos i64)], IIC_REP_STOS>, REP,
                      Requires<[In64BitMode]>;
 }
 } // SchedRW
 
 //===----------------------------------------------------------------------===//
 // Thread Local Storage Instructions
 //
 
 // ELF TLS Support
 // All calls clobber the non-callee saved registers. ESP is marked as
 // a use to prevent stack-pointer assignments that appear immediately
 // before calls from potentially appearing dead.
 // 32-bit ELF TLS pseudos.  Each takes one i32mem symbol operand and matches
 // X86tlsaddr / X86tlsbaseaddr respectively.  The Defs list models them as
 // calls: everything listed is treated as clobbered.
 let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
             ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
             MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
             XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
             XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
     Uses = [ESP] in {
 def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
                   "# TLS_addr32",
                   [(X86tlsaddr tls32addr:$sym)]>,
                   Requires<[Not64BitMode]>;
 def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
                   "# TLS_base_addr32",
                   [(X86tlsbaseaddr tls32baseaddr:$sym)]>,
                   Requires<[Not64BitMode]>;
 }
 
 // All calls clobber the non-callee saved registers. RSP is marked as
 // a use to prevent stack-pointer assignments that appear immediately
 // before calls from potentially appearing dead.
 // 64-bit ELF TLS pseudos.  Each takes one i64mem symbol operand and matches
 // X86tlsaddr / X86tlsbaseaddr respectively.  The Defs list is the 64-bit
 // counterpart of the one on the 32-bit pseudos and additionally names RSI,
 // RDI and R8-R11.
 let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
             FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
             ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7,
             MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
             XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
             XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
     Uses = [RSP] in {
 def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
                    "# TLS_addr64",
                   [(X86tlsaddr tls64addr:$sym)]>,
                   Requires<[In64BitMode]>;
 def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
                    "# TLS_base_addr64",
                   [(X86tlsbaseaddr tls64baseaddr:$sym)]>,
                   Requires<[In64BitMode]>;
 }
 
 // Darwin TLS Support
 // For i386, the address of the thunk is passed on the stack, on return the
 // address of the variable is in %eax.  %ecx is trashed during the function
 // call.  All other registers are preserved.
 // TLSCall_32: matches X86TLSCall on a 32-bit memory operand.  It defines
 // EAX, ECX and EFLAGS, uses ESP, and is expanded by a custom inserter.
 let Defs = [EAX, ECX, EFLAGS],
     Uses = [ESP],
     usesCustomInserter = 1 in
 def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
                 "# TLSCall_32",
                 [(X86TLSCall addr:$sym)]>,
                 Requires<[Not64BitMode]>;
 
 // For x86_64, the address of the thunk is passed in %rdi, on return
 // the address of the variable is in %rax.  All other registers are preserved.
 // TLSCall_64: matches X86TLSCall on a 64-bit memory operand.  It defines
 // RAX and EFLAGS, uses RSP and RDI, and is expanded by a custom inserter.
 let Defs = [RAX, EFLAGS],
     Uses = [RSP, RDI],
     usesCustomInserter = 1 in
 def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
                   "# TLSCall_64",
                   [(X86TLSCall addr:$sym)]>,
                   Requires<[In64BitMode]>;
 
 
 //===----------------------------------------------------------------------===//
 // Conditional Move Pseudo Instructions
 
 // X86 doesn't have 8-bit conditional moves. Use a customInserter to
 // emit control flow. An alternative to this is to mark i8 SELECT as Promote,
 // however that requires promoting the operands, and can induce additional
 // i8 register pressure.
 // CMOV_* pseudos: each matches X86cmov on two same-class register inputs
 // plus an i8 immediate condition code, reads EFLAGS, and is expanded by a
 // custom inserter.
 let usesCustomInserter = 1, Uses = [EFLAGS] in {
 // 8-bit form: available unconditionally (no Predicates).
 def CMOV_GR8 : I<0, Pseudo,
                  (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
                  "#CMOV_GR8 PSEUDO!",
                  [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
                                           imm:$cond, EFLAGS))]>;
 
 // 16/32-bit forms: only selectable when the NoCMov predicate holds.
 let Predicates = [NoCMov] in {
 def CMOV_GR32 : I<0, Pseudo,
                     (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
                     "#CMOV_GR32* PSEUDO!",
                     [(set GR32:$dst,
                       (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
 def CMOV_GR16 : I<0, Pseudo,
                     (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
                     "#CMOV_GR16* PSEUDO!",
                     [(set GR16:$dst,
                       (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
 } // Predicates = [NoCMov]
 
 // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
 // SSE1.
 let Predicates = [FPStackf32] in
 def CMOV_RFP32 : I<0, Pseudo,
                     (outs RFP32:$dst),
                     (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
                     "#CMOV_RFP32 PSEUDO!",
                     [(set RFP32:$dst,
                       (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
                                                   EFLAGS))]>;
 // fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
 // SSE2.
 let Predicates = [FPStackf64] in
 def CMOV_RFP64 : I<0, Pseudo,
                     (outs RFP64:$dst),
                     (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
                     "#CMOV_RFP64 PSEUDO!",
                     [(set RFP64:$dst,
                       (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond,
                                                   EFLAGS))]>;
 // 80-bit x87 form: carries no Predicates of its own.
 def CMOV_RFP80 : I<0, Pseudo,
                     (outs RFP80:$dst),
                     (ins RFP80:$src1, RFP80:$src2, i8imm:$cond),
                     "#CMOV_RFP80 PSEUDO!",
                     [(set RFP80:$dst,
                       (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
                                                   EFLAGS))]>;
 } // usesCustomInserter = 1, Uses = [EFLAGS]
 
 
 //===----------------------------------------------------------------------===//
 // Normal-Instructions-With-Lock-Prefix Pseudo Instructions
 //===----------------------------------------------------------------------===//
 
 // FIXME: Use normal instructions and add lock prefix dynamically.
 
 // Memory barriers
 
 // TODO: Get this to fold the constant into the instruction.
 // OR32mrLocked: LOCK-prefixed 32-bit "or" of a register into memory
 // (opcode 0x09, MRMDestMem).  It has no selection pattern, is restricted to
 // non-64-bit mode, and clobbers EFLAGS.
 let isCodeGenOnly = 1, Defs = [EFLAGS] in
 def OR32mrLocked  : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
                       "or{l}\t{$zero, $dst|$dst, $zero}",
                       [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK,
                     Sched<[WriteALULd, WriteRMW]>;
 
 // Int_MemBarrier: operand-less pseudo matching X86MemBarrier.  It is marked
 // as having side effects.
 let hasSideEffects = 1 in
 def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
                      "#MEMBARRIER",
                      [(X86MemBarrier)]>, Sched<[WriteLoad]>;
 
 // RegOpc corresponds to the mr version of the instruction
 // ImmOpc corresponds to the mi version of the instruction
 // ImmOpc8 corresponds to the mi8 version of the instruction
 // ImmMod corresponds to the instruction format of the mi and mi8 versions
 // LOCK_ArithBinOp: generates the LOCK-prefixed memory-destination forms of a
 // two-operand ALU instruction: NAME#{8,16,32,64}mr, NAME#{8,16,32}mi,
 // NAME#64mi32 and NAME#{16,32,64}mi8.
 //
 // Each opcode is rebuilt from bits 7..1 of the given opcode with bit 0
 // forced: 0 for the byte form, 1 for every wider form.  None of the defs has
 // a selection pattern; all define EFLAGS and are flagged mayLoad/mayStore.
 //
 // NOTE(review): the mr forms use IIC_ALU_NONMEM while the mi forms use
 // IIC_ALU_MEM, although both operate on memory -- confirm this is intended.
 multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
                            Format ImmMod, string mnemonic> {
 let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
     SchedRW = [WriteALULd, WriteRMW] in {
 
 // Register-source forms (RegOpc, MRMDestMem).
 def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
                   MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
                   !strconcat(mnemonic, "{b}\t",
                              "{$src2, $dst|$dst, $src2}"),
                   [], IIC_ALU_NONMEM>, LOCK;
 def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                    MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
                    !strconcat(mnemonic, "{w}\t",
                               "{$src2, $dst|$dst, $src2}"),
                    [], IIC_ALU_NONMEM>, OpSize16, LOCK;
 def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                    MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
                    !strconcat(mnemonic, "{l}\t",
                               "{$src2, $dst|$dst, $src2}"),
                    [], IIC_ALU_NONMEM>, OpSize32, LOCK;
 def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                     RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                     MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
                     !strconcat(mnemonic, "{q}\t",
                                "{$src2, $dst|$dst, $src2}"),
                     [], IIC_ALU_NONMEM>, LOCK;
 
 // Full-width immediate forms (ImmOpc, format ImmMod).
 def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                     ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
                     ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
                     !strconcat(mnemonic, "{b}\t",
                                "{$src2, $dst|$dst, $src2}"),
                     [], IIC_ALU_MEM>, LOCK;
 
 def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                       ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                       ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
                       !strconcat(mnemonic, "{w}\t",
                                  "{$src2, $dst|$dst, $src2}"),
                       [], IIC_ALU_MEM>, OpSize16, LOCK;
 
 def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                       ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                       ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
                       !strconcat(mnemonic, "{l}\t",
                                  "{$src2, $dst|$dst, $src2}"),
                       [], IIC_ALU_MEM>, OpSize32, LOCK;
 
 // 64-bit destination with an i64i32imm immediate operand.
 def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                           ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                           ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
                           !strconcat(mnemonic, "{q}\t",
                                      "{$src2, $dst|$dst, $src2}"),
                           [], IIC_ALU_MEM>, LOCK;
 
 // 8-bit immediate forms for wider destinations (ImmOpc8, format ImmMod).
 def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                       ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
                       !strconcat(mnemonic, "{w}\t",
                                  "{$src2, $dst|$dst, $src2}"),
                       [], IIC_ALU_MEM>, OpSize16, LOCK;
 def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                       ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
                       !strconcat(mnemonic, "{l}\t",
                                  "{$src2, $dst|$dst, $src2}"),
                       [], IIC_ALU_MEM>, OpSize32, LOCK;
 def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                        ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                        ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
                        !strconcat(mnemonic, "{q}\t",
                                   "{$src2, $dst|$dst, $src2}"),
                        [], IIC_ALU_MEM>, LOCK;
 
 }
 
 }
 
 // Instantiations.  Arguments are: register-form opcode, immediate-form
 // opcode, 8-bit-immediate-form opcode, the format used by the immediate
 // forms, and the mnemonic.
 defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
 defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
 defm LOCK_OR  : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
 defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
 defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
 
 // Optimized codegen when the non-memory output is not used.
 // LOCK_ArithUnOp: generates NAME#{8,16,32,64}m, the LOCK-prefixed
 // single-memory-operand forms of a unary ALU instruction.
 //   Opc8     - opcode of the byte form
 //   Opc      - opcode shared by the 16/32/64-bit forms
 //   Form     - instruction format for all four forms
 //   mnemonic - base mnemonic; a size suffix is appended per form
 // None of the defs has a selection pattern; all define EFLAGS.
 multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
                           string mnemonic> {
 let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
     SchedRW = [WriteALULd, WriteRMW] in {
 
 def NAME#8m  : I<Opc8, Form, (outs), (ins i8mem :$dst),
                  !strconcat(mnemonic, "{b}\t$dst"),
                  [], IIC_UNARY_MEM>, LOCK;
 def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
                  !strconcat(mnemonic, "{w}\t$dst"),
                  [], IIC_UNARY_MEM>, OpSize16, LOCK;
 def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
                  !strconcat(mnemonic, "{l}\t$dst"),
                  [], IIC_UNARY_MEM>, OpSize32, LOCK;
 def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
                   !strconcat(mnemonic, "{q}\t$dst"),
                   [], IIC_UNARY_MEM>, LOCK;
 }
 }
 
 // Both share opcodes 0xFE (byte) / 0xFF (wider) and differ only in the
 // format: MRM0m for inc, MRM1m for dec.
 defm LOCK_INC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
 defm LOCK_DEC    : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
 
 // Atomic compare and swap.
 // LCMPXCHG_UnOp: defines a single LOCK-prefixed, TB-encoded instruction NAME
 // that takes only a memory operand and matches (frag addr:$ptr).  Implicit
 // register uses/defs are not set here; they come from the enclosing 'let'
 // at each instantiation.
 multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
                          SDPatternOperator frag, X86MemOperand x86memop,
                          InstrItinClass itin> {
 let isCodeGenOnly = 1 in {
   def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
                !strconcat(mnemonic, "\t$ptr"),
                [(frag addr:$ptr)], itin>, TB, LOCK;
 }
 }
 
 // LCMPXCHG_BinOp: defines NAME#{8,16,32,64}, LOCK-prefixed TB-encoded
 // instructions taking a memory operand and a $swap register.  Each form
 // both uses and defines the accumulator of matching width (AL/AX/EAX/RAX)
 // and defines EFLAGS.  The last operand of the matched frag is the operand
 // size in bytes (1, 2, 4, 8).
 //   Opc8/itin8 apply to the byte form; Opc/itin to the wider forms.
 multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
                           string mnemonic, SDPatternOperator frag,
                           InstrItinClass itin8, InstrItinClass itin> {
 let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
   let Defs = [AL, EFLAGS], Uses = [AL] in
   def NAME#8  : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
                   !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
                   [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
   let Defs = [AX, EFLAGS], Uses = [AX] in
   def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
                   !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
                   [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize16, LOCK;
   let Defs = [EAX, EFLAGS], Uses = [EAX] in
   def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
                   !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
                   [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, OpSize32, LOCK;
   let Defs = [RAX, EFLAGS], Uses = [RAX] in
   def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
                    !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
                    [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
 }
 }
 
 // LCMPXCHG8B: matches X86cas8 on an i64mem operand.  It implicitly uses
 // EAX, EBX, ECX and EDX and defines EAX, EDX and EFLAGS.
 let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
     SchedRW = [WriteALULd, WriteRMW] in {
 defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
                                 X86cas8, i64mem,
                                 IIC_CMPX_LOCK_8B>;
 }
 
 // LCMPXCHG16B: matches X86cas16 on an i128mem operand, with the same opcode
 // and format as LCMPXCHG8B plus REX_W.  It is only available under the
 // HasCmpxchg16b predicate.  It implicitly uses RAX, RBX, RCX and RDX and
 // defines RAX, RDX and EFLAGS.
 let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
     Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
 defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
                                  X86cas16, i128mem,
                                  IIC_CMPX_LOCK_16B>, REX_W;
 }
 
 // Produces LCMPXCHG8/16/32/64, matching X86cas; 0xB0 is the byte-form
 // opcode and 0xB1 the opcode of the wider forms.
 defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
                                X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
 
 // Atomic exchange and add
 // ATOMIC_LOAD_BINOP: defines NAME#{8,16,32,64}, register/memory instructions
 // whose output register is tied to the $val input ("$val = $dst").
 //   opc8/itin8 - opcode and itinerary of the byte form
 //   opc/itin   - opcode and itinerary of the 16/32/64-bit forms
 //   mnemonic   - base mnemonic; a size suffix is appended per form
 //   frag       - name prefix of the PatFrag to match; "_8", "_16", "_32" or
 //                "_64" is appended and the result looked up with !cast
 // All forms define EFLAGS.
 multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
                              string frag,
                              InstrItinClass itin8, InstrItinClass itin> {
   let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
       SchedRW = [WriteALULd, WriteRMW] in {
     def NAME#8  : I<opc8, MRMSrcMem, (outs GR8:$dst),
                     (ins GR8:$val, i8mem:$ptr),
                     !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
                     [(set GR8:$dst,
                           (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
                     itin8>;
     def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
                     (ins GR16:$val, i16mem:$ptr),
                     !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
                     [(set
                        GR16:$dst,
                        (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
                     itin>, OpSize16;
     def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
                     (ins GR32:$val, i32mem:$ptr),
                     !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
                     [(set
                        GR32:$dst,
                        (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
                     itin>, OpSize32;
     def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
                      (ins GR64:$val, i64mem:$ptr),
                      !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
                      [(set
                         GR64:$dst,
                         (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
                      itin>;
   }
 }
 
 // Produces LXADD8/16/32/64, matching atomic_load_add_{8,16,32,64}; the TB
 // and LOCK classes are applied to every generated def.
 defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
                                IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
              TB, LOCK;
 
 /* The following multiclass tries to make sure that in code like
  *    x.store (immediate op x.load(acquire), release)
  * an operation directly on memory is generated instead of wasting a register.
  * It is not automatic as atomic_store/load are only lowered to MOV instructions
  * extremely late to prevent them from being accidentally reordered in the backend
  * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions)
  */
 // RELEASE_BINOP_MI: defines the pseudos NAME#8mi, NAME#32mi and NAME#64mi32.
 // Each matches an atomic store to $dst of (op (atomic load of $dst), imm).
 //   op - name of the PatFrag for the binary operation, looked up with !cast
 multiclass RELEASE_BINOP_MI<string op> {
     def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
         "#RELEASE_BINOP PSEUDO!",
         [(atomic_store_8 addr:$dst, (!cast<PatFrag>(op)
             (atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
     // NAME#16 is not generated as 16-bit arithmetic instructions are considered
     // costly and avoided as far as possible by this backend anyway
     def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
         "#RELEASE_BINOP PSEUDO!",
         [(atomic_store_32 addr:$dst, (!cast<PatFrag>(op)
             (atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
     // The 64-bit form only accepts immediates matching i64immSExt32.
     def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
         "#RELEASE_BINOP PSEUDO!",
         [(atomic_store_64 addr:$dst, (!cast<PatFrag>(op)
             (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
 }
 // One instantiation per supported operation; each yields the 8mi, 32mi and
 // 64mi32 pseudos.
 defm RELEASE_ADD : RELEASE_BINOP_MI<"add">;
 defm RELEASE_AND : RELEASE_BINOP_MI<"and">;
 defm RELEASE_OR  : RELEASE_BINOP_MI<"or">;
 defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">;
 // Note: we don't deal with sub, because subtractions of constants are
 // optimized into additions before this code can run.
 
 // RELEASE_UNOP: defines the pseudos NAME#{8,16,32,64}m.  Each takes only a
 // memory operand and matches an atomic store to $dst of the caller-supplied
 // value dag for that width.
 //   dag8/dag16/dag32/dag64 - the stored-value dag for each width; the
 //   instantiations in this file build it from an atomic load of $dst.
 multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
     def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
         "#RELEASE_UNOP PSEUDO!",
         [(atomic_store_8 addr:$dst, dag8)]>;
     def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
         "#RELEASE_UNOP PSEUDO!",
         [(atomic_store_16 addr:$dst, dag16)]>;
     def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
         "#RELEASE_UNOP PSEUDO!",
         [(atomic_store_32 addr:$dst, dag32)]>;
     def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
         "#RELEASE_UNOP PSEUDO!",
         [(atomic_store_64 addr:$dst, dag64)]>;
 }
 
 // Increment: store (load + 1) back to the same address.  Only available
 // under the NotSlowIncDec predicate.
 defm RELEASE_INC : RELEASE_UNOP<
     (add (atomic_load_8  addr:$dst), (i8 1)),
     (add (atomic_load_16 addr:$dst), (i16 1)),
     (add (atomic_load_32 addr:$dst), (i32 1)),
     (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>;
 // Decrement: expressed as an add of -1, under the same predicate.
 defm RELEASE_DEC : RELEASE_UNOP<
     (add (atomic_load_8  addr:$dst), (i8 -1)),
     (add (atomic_load_16 addr:$dst), (i16 -1)),
     (add (atomic_load_32 addr:$dst), (i32 -1)),
     (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>;
 /*
 TODO: These don't work because the type inference of TableGen fails.
 TODO: find a way to fix it.
 defm RELEASE_NEG : RELEASE_UNOP<
     (ineg (atomic_load_8  addr:$dst)),
     (ineg (atomic_load_16 addr:$dst)),
     (ineg (atomic_load_32 addr:$dst)),
     (ineg (atomic_load_64 addr:$dst))>;
 defm RELEASE_NOT : RELEASE_UNOP<
     (not (atomic_load_8  addr:$dst)),
     (not (atomic_load_16 addr:$dst)),
     (not (atomic_load_32 addr:$dst)),
     (not (atomic_load_64 addr:$dst))>;
 */
 
 // RELEASE_MOV*mi: pseudos matching an atomic store of an immediate.  The
 // 64-bit form only accepts immediates matching i64immSExt32.
 def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
 			"#RELEASE_MOV PSEUDO !",
 			[(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
 def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
 			"#RELEASE_MOV PSEUDO !",
 			[(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
 def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
 			"#RELEASE_MOV PSEUDO !",
 			[(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
 def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
 			"#RELEASE_MOV PSEUDO !",
 			[(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
 
 // RELEASE_MOV*mr: pseudos matching an atomic store of a register value, one
 // per width.
 def RELEASE_MOV8mr  : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
                         "#RELEASE_MOV PSEUDO!",
                         [(atomic_store_8  addr:$dst, GR8 :$src)]>;
 def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
                         "#RELEASE_MOV PSEUDO!",
                         [(atomic_store_16 addr:$dst, GR16:$src)]>;
 def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
                         "#RELEASE_MOV PSEUDO!",
                         [(atomic_store_32 addr:$dst, GR32:$src)]>;
 def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
                         "#RELEASE_MOV PSEUDO!",
                         [(atomic_store_64 addr:$dst, GR64:$src)]>;
 
 // ACQUIRE_MOV*rm: pseudos matching an atomic load into a register, one per
 // width.
 def ACQUIRE_MOV8rm  : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
                       "#ACQUIRE_MOV PSEUDO!",
                       [(set GR8:$dst,  (atomic_load_8  addr:$src))]>;
 def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
                       "#ACQUIRE_MOV PSEUDO!",
                       [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
 def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
                       "#ACQUIRE_MOV PSEUDO!",
                       [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
 def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
                       "#ACQUIRE_MOV PSEUDO!",
                       [(set GR64:$dst, (atomic_load_64 addr:$src))]>;
 //===----------------------------------------------------------------------===//
 // Conditional Move Pseudo Instructions.
 //===----------------------------------------------------------------------===//
 
 // CMOV* - Used to implement the SSE SELECT DAG operation.  Expanded after
 // instruction selection into a branch sequence.
 //
 // Every pseudo below has the same shape: $t and $f are the two candidate
 // values, $cond is an 8-bit immediate condition code, and the pattern matches
 // an X86cmov of the given register class / value type that reads EFLAGS.
 // usesCustomInserter = 1 is what routes them to the custom expansion.
 let Uses = [EFLAGS], usesCustomInserter = 1 in {
   // Scalar floating point registers (FR32 / FR64).
   def CMOV_FR32 : I<0, Pseudo,
                     (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
                     "#CMOV_FR32 PSEUDO!",
                     [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
                                                   EFLAGS))]>;
   def CMOV_FR64 : I<0, Pseudo,
                     (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
                     "#CMOV_FR64 PSEUDO!",
                     [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
                                                   EFLAGS))]>;
   // VR128 vectors: v4f32, v2f64, v2i64.
   def CMOV_V4F32 : I<0, Pseudo,
                     (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
                     "#CMOV_V4F32 PSEUDO!",
                     [(set VR128:$dst,
                       (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
                                           EFLAGS)))]>;
   def CMOV_V2F64 : I<0, Pseudo,
                     (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
                     "#CMOV_V2F64 PSEUDO!",
                     [(set VR128:$dst,
                       (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
                                           EFLAGS)))]>;
   def CMOV_V2I64 : I<0, Pseudo,
                     (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
                     "#CMOV_V2I64 PSEUDO!",
                     [(set VR128:$dst,
                       (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
                                           EFLAGS)))]>;
   // VR256 vectors: v8f32, v4f64, v4i64.
   def CMOV_V8F32 : I<0, Pseudo,
                     (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
                     "#CMOV_V8F32 PSEUDO!",
                     [(set VR256:$dst,
                       (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond,
                                           EFLAGS)))]>;
   def CMOV_V4F64 : I<0, Pseudo,
                     (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
                     "#CMOV_V4F64 PSEUDO!",
                     [(set VR256:$dst,
                       (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
                                           EFLAGS)))]>;
   def CMOV_V4I64 : I<0, Pseudo,
                     (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
                     "#CMOV_V4I64 PSEUDO!",
                     [(set VR256:$dst,
                       (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
                                           EFLAGS)))]>;
   // VR512 vectors: v8i64, v8f64, v16f32.
   def CMOV_V8I64 : I<0, Pseudo,
                     (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
                     "#CMOV_V8I64 PSEUDO!",
                     [(set VR512:$dst,
                       (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
                                           EFLAGS)))]>;
   def CMOV_V8F64 : I<0, Pseudo,
                     (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
                     "#CMOV_V8F64 PSEUDO!",
                     [(set VR512:$dst,
                       (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
                                           EFLAGS)))]>;
   def CMOV_V16F32 : I<0, Pseudo,
                     (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
                     "#CMOV_V16F32 PSEUDO!",
                     [(set VR512:$dst,
                       (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond,
                                           EFLAGS)))]>;
 }
 
 
 //===----------------------------------------------------------------------===//
 // DAG Pattern Matching Rules
 //===----------------------------------------------------------------------===//
 
 // ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
 //
 // A 32-bit X86Wrapper around a target symbol is materialized with MOV32ri,
 // using the symbol as the immediate operand.
 def : Pat<(i32 (X86Wrapper tconstpool  :$dst)), (MOV32ri tconstpool  :$dst)>;
 def : Pat<(i32 (X86Wrapper tjumptable  :$dst)), (MOV32ri tjumptable  :$dst)>;
 def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
 def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
 def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
 def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;
 
 // Adding a wrapped symbol to a 32-bit register folds the symbol into the
 // immediate operand of ADD32ri.
 def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
           (ADD32ri GR32:$src1, tconstpool:$src2)>;
 def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
           (ADD32ri GR32:$src1, tjumptable:$src2)>;
 def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
           (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
 def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
           (ADD32ri GR32:$src1, texternalsym:$src2)>;
 def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
           (ADD32ri GR32:$src1, tblockaddress:$src2)>;
 
 // Storing a wrapped 32-bit symbol address uses the store-immediate form
 // MOV32mi directly, without going through a register.
 def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
           (MOV32mi addr:$dst, tglobaladdr:$src)>;
 def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
           (MOV32mi addr:$dst, texternalsym:$src)>;
 def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
           (MOV32mi addr:$dst, tblockaddress:$src)>;
 
 // ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
 // code model mode, should use 'movabs'.  FIXME: This is really a hack, the
 //  'movabs' predicate should handle this sort of thing.
 // These patterns are guarded by the FarData predicate and select MOV64ri.
 def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
           (MOV64ri tconstpool  :$dst)>, Requires<[FarData]>;
 def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
           (MOV64ri tjumptable  :$dst)>, Requires<[FarData]>;
 def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
           (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
 def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
           (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
 def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
           (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
 
 // In kernel code model, we can get the address of a label
 // into a register with 'movq'.  FIXME: This is a hack, the 'imm' predicate of
 // the MOV64ri32 should accept these.
 // These patterns are guarded by the KernelCode predicate and select
 // MOV64ri32 for the same five symbol kinds as above.
 def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
           (MOV64ri32 tconstpool  :$dst)>, Requires<[KernelCode]>;
 def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
           (MOV64ri32 tjumptable  :$dst)>, Requires<[KernelCode]>;
 def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
           (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
 def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
           (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
 def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
           (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
 
 // If we have small model and -static mode, it is safe to store global addresses
 // directly as immediates.  FIXME: This is really a hack, the 'imm' predicate
 // for MOV64mi32 should handle this sort of thing.
 // Each pattern requires both NearData and IsStatic and stores the wrapped
 // symbol with MOV64mi32.
 def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
           (MOV64mi32 addr:$dst, tconstpool:$src)>,
           Requires<[NearData, IsStatic]>;
 def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
           (MOV64mi32 addr:$dst, tjumptable:$src)>,
           Requires<[NearData, IsStatic]>;
 def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
           (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
           Requires<[NearData, IsStatic]>;
 def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
           (MOV64mi32 addr:$dst, texternalsym:$src)>,
           Requires<[NearData, IsStatic]>;
 def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
           (MOV64mi32 addr:$dst, tblockaddress:$src)>,
           Requires<[NearData, IsStatic]>;
 
 // X86RecoverFrameAlloc of an external symbol is materialized as an immediate
 // move of that symbol: MOV32ri for an i32 result, MOV64ri for an i64 result.
 def : Pat<(i32 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
 def : Pat<(i64 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV64ri texternalsym:$dst)>;
 
 // Calls
 
 // tls has some funny stuff here...
 // This corresponds to movabs $foo@tpoff, %rax
 // NOTE(review): the instruction selected here is MOV64ri32 (32-bit
 // immediate form), so the "movabs" wording above may be stale -- confirm.
 def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
           (MOV64ri32 tglobaltlsaddr :$dst)>;
 // This corresponds to add $foo@tpoff, %rax
 def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
           (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
 
 
 // Direct PC relative function call for small code model. 32-bit displacement
 // sign extended to 64-bit.
 // Both global addresses and external symbols select CALL64pcrel32.
 def : Pat<(X86call (i64 tglobaladdr:$dst)),
           (CALL64pcrel32 tglobaladdr:$dst)>;
 def : Pat<(X86call (i64 texternalsym:$dst)),
           (CALL64pcrel32 texternalsym:$dst)>;
 
 // Tailcall stuff. TCRETURN instructions run after the epilog, which means
 // callee-saved registers are off limits to them; the GR64_TC register classes
 // exist for that reason.
 //
 // %r11 is the only volatile register the calling convention never uses, which
 // matters when a vararg function is called with 6 arguments.
 //
 // X86tcret_6regs matches an X86tcret node that uses at most 6 volatile
 // registers, i.e. one that carries no more than 6 register operands.
 def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
                              (X86tcret node:$ptr, node:$off), [{
   // Operand layout of X86tcret: (*chain, ptr, imm, regs..., glue), so the
   // register operands start at index 3.
   unsigned RegCount = 0;
   const unsigned NumOps = N->getNumOperands();
   for (unsigned Idx = 3; Idx != NumOps; ++Idx) {
     if (!isa<RegisterSDNode>(N->getOperand(Idx)))
       continue;
     // A seventh register operand disqualifies the node.
     if (++RegCount > 6)
       return false;
   }
   return true;
 }]>;
 
 // Outside 64-bit mode, a tail call through a register selects TCRETURNri.
 // The target must be in the ptr_rc_tailcall register class.
 def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
           (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
           Requires<[Not64BitMode]>;
 
 // Tail call through a loaded pointer: the load is folded into TCRETURNmi.
 // FIXME: This is disabled for 32-bit PIC mode because the global base
 // register which is part of the address mode may be assigned a
 // callee-saved register.
 def : Pat<(X86tcret (load addr:$dst), imm:$off),
           (TCRETURNmi addr:$dst, imm:$off)>,
           Requires<[Not64BitMode, IsNotPIC]>;
 
 // Direct tail calls outside 64-bit mode select TCRETURNdi.  The result
 // operand uses the same symbol kind that was matched (tglobaladdr or
 // texternalsym), exactly as the 64-bit TCRETURNdi64 patterns do.
 def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
           (TCRETURNdi tglobaladdr:$dst, imm:$off)>,
           Requires<[Not64BitMode]>;
 
 def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
           (TCRETURNdi texternalsym:$dst, imm:$off)>,
           Requires<[Not64BitMode]>;
 
 // 64-bit mode: tail call through a register in ptr_rc_tailcall.
 def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
           (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
           Requires<[In64BitMode]>;
 
 // Don't fold loads into X86tcret requiring more than 6 regs.
 // There wouldn't be enough scratch registers for base+index.
 // X86tcret_6regs is the fragment that enforces the register limit.
 def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
           (TCRETURNmi64 addr:$dst, imm:$off)>,
           Requires<[In64BitMode]>;
 
 // 64-bit mode: direct tail calls to a global address or external symbol.
 def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
           (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
           Requires<[In64BitMode]>;
 
 def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
           (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
           Requires<[In64BitMode]>;
 
 // Normal calls, with various flavors of addresses.
 // All three forms select CALLpcrel32; the immediate-address form is only
 // available when the CallImmAddr predicate holds.
 def : Pat<(X86call (i32 tglobaladdr:$dst)),
           (CALLpcrel32 tglobaladdr:$dst)>;
 def : Pat<(X86call (i32 texternalsym:$dst)),
           (CALLpcrel32 texternalsym:$dst)>;
 def : Pat<(X86call (i32 imm:$dst)),
           (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
 
 // Comparisons.
 
 // TEST R,R is smaller than CMP R,0
 // A compare of a register against the constant 0 is therefore selected as a
 // TEST of the register with itself, for every register width.
 def : Pat<(X86cmp GR8:$src1, 0),
           (TEST8rr GR8:$src1, GR8:$src1)>;
 def : Pat<(X86cmp GR16:$src1, 0),
           (TEST16rr GR16:$src1, GR16:$src1)>;
 def : Pat<(X86cmp GR32:$src1, 0),
           (TEST32rr GR32:$src1, GR32:$src1)>;
 def : Pat<(X86cmp GR64:$src1, 0),
           (TEST64rr GR64:$src1, GR64:$src1)>;
 
 // Conditional moves with folded loads with operands swapped and conditions
 // inverted.
 //
 // The matched X86cmov has the load as its first operand, while the selected
 // rm instruction takes the register first and the memory operand second.
 // Swapping the operands is compensated for by using the instruction of the
 // opposite condition, so each instantiation pairs the matched condition
 // (InvertedCond) with the CMOV instructions of its inverse.  All patterns
 // require HasCMov.
 multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
                   Instruction Inst64> {
   let Predicates = [HasCMov] in {
     def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
               (Inst16 GR16:$src2, addr:$src1)>;
     def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
               (Inst32 GR32:$src2, addr:$src1)>;
     def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
               (Inst64 GR64:$src2, addr:$src1)>;
   }
 }
 
 // One instantiation per condition code: <matched condition, inverse CMOVs>.
 defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
 defm : CMOVmr<X86_COND_AE, CMOVB16rm , CMOVB32rm , CMOVB64rm>;
 defm : CMOVmr<X86_COND_E , CMOVNE16rm, CMOVNE32rm, CMOVNE64rm>;
 defm : CMOVmr<X86_COND_NE, CMOVE16rm , CMOVE32rm , CMOVE64rm>;
 defm : CMOVmr<X86_COND_BE, CMOVA16rm , CMOVA32rm , CMOVA64rm>;
 defm : CMOVmr<X86_COND_A , CMOVBE16rm, CMOVBE32rm, CMOVBE64rm>;
 defm : CMOVmr<X86_COND_L , CMOVGE16rm, CMOVGE32rm, CMOVGE64rm>;
 defm : CMOVmr<X86_COND_GE, CMOVL16rm , CMOVL32rm , CMOVL64rm>;
 defm : CMOVmr<X86_COND_LE, CMOVG16rm , CMOVG32rm , CMOVG64rm>;
 defm : CMOVmr<X86_COND_G , CMOVLE16rm, CMOVLE32rm, CMOVLE64rm>;
 defm : CMOVmr<X86_COND_P , CMOVNP16rm, CMOVNP32rm, CMOVNP64rm>;
 defm : CMOVmr<X86_COND_NP, CMOVP16rm , CMOVP32rm , CMOVP64rm>;
 defm : CMOVmr<X86_COND_S , CMOVNS16rm, CMOVNS32rm, CMOVNS64rm>;
 defm : CMOVmr<X86_COND_NS, CMOVS16rm , CMOVS32rm , CMOVS64rm>;
 defm : CMOVmr<X86_COND_O , CMOVNO16rm, CMOVNO32rm, CMOVNO64rm>;
 defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
 
 // zextload bool -> zextload byte
 // An i1 in memory is loaded as a whole byte; wider results use the
 // zero-extending byte loads.
 def : Pat<(zextloadi8i1  addr:$src), (MOV8rm     addr:$src)>;
 def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
 def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
 def : Pat<(zextloadi64i1 addr:$src),
           (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
 
 // extload bool -> extload byte
 // When extloading from 16-bit and smaller memory locations into 64-bit
 // registers, use zero-extending loads so that the entire 64-bit register is
 // defined, avoiding partial-register updates.
 
 // Any-extending loads into 8/16/32-bit registers are selected as plain or
 // zero-extending loads.
 def : Pat<(extloadi8i1 addr:$src),   (MOV8rm      addr:$src)>;
 def : Pat<(extloadi16i1 addr:$src),  (MOVZX16rm8  addr:$src)>;
 def : Pat<(extloadi32i1 addr:$src),  (MOVZX32rm8  addr:$src)>;
 def : Pat<(extloadi16i8 addr:$src),  (MOVZX16rm8  addr:$src)>;
 def : Pat<(extloadi32i8 addr:$src),  (MOVZX32rm8  addr:$src)>;
 def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
 
 // For other extloads, use subregs, since the high contents of the register are
 // defined after an extload.
 // The 64-bit results wrap a 32-bit load in SUBREG_TO_REG with a zero
 // immediate.
 def : Pat<(extloadi64i1 addr:$src),
           (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
 def : Pat<(extloadi64i8 addr:$src),
           (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
 def : Pat<(extloadi64i16 addr:$src),
           (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
 def : Pat<(extloadi64i32 addr:$src),
           (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
 
 // anyext. Define these to do an explicit zero-extend to
 // avoid partial-register updates.
 // The i8 -> i16 case zero-extends to 32 bits and then takes the low 16 bits.
 def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
                                      (MOVZX32rr8 GR8 :$src), sub_16bit)>;
 def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>;
 
 // Except for i16 -> i32, since isel expects i16 ops to be promoted to i32.
 // Here the 16-bit value is simply inserted into an undefined 32-bit register.
 def : Pat<(i32 (anyext GR16:$src)),
           (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
 
 // anyext to i64: zero-extend (or reuse) a 32-bit value and wrap it in
 // SUBREG_TO_REG with a zero immediate.
 def : Pat<(i64 (anyext GR8 :$src)),
           (SUBREG_TO_REG (i64 0), (MOVZX32rr8  GR8  :$src), sub_32bit)>;
 def : Pat<(i64 (anyext GR16:$src)),
           (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
 def : Pat<(i64 (anyext GR32:$src)),
           (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
 
 
 // def32 matches an i32 value whose defining node is expected to have
 // zero-extended its result up to 64 bits.  That holds for 32-bit operations
 // in general, with these exceptions, which the predicate rejects:
 //   - Truncate, which can be lowered to EXTRACT_SUBREG (also rejected).
 //   - CopyFromReg, which may be copying from a truncate.
 //   - x86's cmov, which doesn't do anything if the condition is false.
 //   - AssertSext.
 def def32 : PatLeaf<(i32 GR32:$src), [{
   const unsigned Opc = N->getOpcode();
   const bool HighHalfNotKnownZero = Opc == ISD::TRUNCATE ||
                                     Opc == TargetOpcode::EXTRACT_SUBREG ||
                                     Opc == ISD::CopyFromReg ||
                                     Opc == ISD::AssertSext ||
                                     Opc == X86ISD::CMOV;
   return !HighHalfNotKnownZero;
 }]>;
 
 // In the case of a 32-bit def that is known to implicitly zero-extend,
 // we can use a SUBREG_TO_REG.
 // No extension instruction is selected: the GR32 value is placed in the
 // sub_32bit subregister, and the (i64 0) operand describes the other bits.
 def : Pat<(i64 (zext def32:$src)),
           (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
 
 //===----------------------------------------------------------------------===//
 // Pattern match OR as ADD
 //===----------------------------------------------------------------------===//
 
 // If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
 // 3-addressified into an LEA instruction to avoid copies.  However, we also
 // want to finally emit these instructions as an or at the end of the code
 // generator to make the generated code easier to read.  To do this, we select
 // into "disjoint bits" pseudo ops.
 
 // Treat an 'or' node as an 'add' when the operands cannot have a set bit in
 // the same position: either the right operand is a constant whose set bits
 // are all known zero in the left operand, or every bit position is known to
 // be zero in at least one of the two operands.
 def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
   // Constant right-hand side: ask directly whether its bits are clear in LHS.
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS))
     return CurDAG->MaskedValueIsZero(LHS, C->getAPIntValue());
 
   // General case: combine the known-zero masks of both operands.
   APInt LHSZero, LHSOne, RHSZero, RHSOne;
   CurDAG->computeKnownBits(LHS, LHSZero, LHSOne, 0);
   CurDAG->computeKnownBits(RHS, RHSZero, RHSOne, 0);
   return (LHSZero | RHSZero).isAllOnesValue();
 }]>;
 
 
 // (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
 // Try this before selecting to OR.
 //
 // All *_DB ("disjoint bits") pseudos below match the or_is_add fragment, tie
 // $src1 to $dst, define EFLAGS and are marked convertible to three-address
 // form.  Their asm strings are empty; the trailing comment on each shows the
 // real instruction forms they stand for.
 let AddedComplexity = 5, SchedRW = [WriteALU] in {
 
 let isConvertibleToThreeAddress = 1,
     Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
 // Register-register forms; these are the only commutable ones.
 let isCommutable = 1 in {
 def ADD16rr_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
                     "", // orw/addw REG, REG
                     [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
 def ADD32rr_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
                     "", // orl/addl REG, REG
                     [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
 def ADD64rr_DB  : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                     "", // orq/addq REG, REG
                     [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
 } // isCommutable
 
 // NOTE: These are order specific, we want the ri8 forms to be listed
 // first so that they are slightly preferred to the ri forms.
 
 def ADD16ri8_DB : I<0, Pseudo,
                     (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
                     "", // orw/addw REG, imm8
                     [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
 def ADD16ri_DB  : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
                     "", // orw/addw REG, imm
                     [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;
 
 def ADD32ri8_DB : I<0, Pseudo,
                     (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
                     "", // orl/addl REG, imm8
                     [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
 def ADD32ri_DB  : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
                     "", // orl/addl REG, imm
                     [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;
 
 
 def ADD64ri8_DB : I<0, Pseudo,
                     (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
                     "", // orq/addq REG, imm8
                     [(set GR64:$dst, (or_is_add GR64:$src1,
                                                 i64immSExt8:$src2))]>;
 def ADD64ri32_DB : I<0, Pseudo,
                      (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
                       "", // orq/addq REG, imm
                       [(set GR64:$dst, (or_is_add GR64:$src1,
                                                   i64immSExt32:$src2))]>;
 }
 } // AddedComplexity, SchedRW
 
 
 //===----------------------------------------------------------------------===//
 // Some peepholes
 //===----------------------------------------------------------------------===//
 
 // Odd encoding trick: -128 fits into an 8-bit immediate field while
 // +128 doesn't, so in this special case use a sub instead of an add.
 // Each width gets a register form and a load-add-store form on the same
 // address.
 def : Pat<(add GR16:$src1, 128),
           (SUB16ri8 GR16:$src1, -128)>;
 def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
           (SUB16mi8 addr:$dst, -128)>;
 
 def : Pat<(add GR32:$src1, 128),
           (SUB32ri8 GR32:$src1, -128)>;
 def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
           (SUB32mi8 addr:$dst, -128)>;
 
 def : Pat<(add GR64:$src1, 128),
           (SUB64ri8 GR64:$src1, -128)>;
 def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
           (SUB64mi8 addr:$dst, -128)>;
 
 // The same trick applies for 32-bit immediate fields in 64-bit
 // instructions: +2^31 (0x80000000) does not fit in a sign-extended 32-bit
 // immediate but -2^31 (0xffffffff80000000) does, so the add is selected as a
 // sub of the negated constant.  The register and memory forms must match
 // exactly the same constant.
 def : Pat<(add GR64:$src1, 0x0000000080000000),
           (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
 def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
           (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
 
 // To avoid needing to materialize an immediate in a register, use a 32-bit and
 // with implicit zero-extension instead of a 64-bit and if the immediate has at
 // least 32 bits of leading zeros. If in addition the last 32 bits can be
 // represented with a sign extension of a 8 bit constant, use that.
 
 // 8-bit immediate form: the AND is done on the low 32 bits with AND32ri8 and
 // the result is wrapped in SUBREG_TO_REG with a zero immediate.
 def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
           (SUBREG_TO_REG
             (i64 0),
             (AND32ri8
               (EXTRACT_SUBREG GR64:$src, sub_32bit),
               (i32 (GetLo8XForm imm:$imm))),
             sub_32bit)>;
 
 // 32-bit immediate form: same shape, using AND32ri.
 def : Pat<(and GR64:$src, i64immZExt32:$imm),
           (SUBREG_TO_REG
             (i64 0),
             (AND32ri
               (EXTRACT_SUBREG GR64:$src, sub_32bit),
               (i32 (GetLo32XForm imm:$imm))),
             sub_32bit)>;
 
 
 // r & (2^16-1) ==> movz
 def : Pat<(and GR32:$src1, 0xffff),
           (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
 // r & (2^8-1) ==> movz
 // Outside 64-bit mode the source is first copied into GR32_ABCD before its
 // sub_8bit subregister is taken.
 def : Pat<(and GR32:$src1, 0xff),
           (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
                                                              GR32_ABCD)),
                                       sub_8bit))>,
       Requires<[Not64BitMode]>;
 // r & (2^8-1) ==> movz
 // 16-bit variant: zero-extend to 32 bits, then take the low 16 bits.
 def : Pat<(and GR16:$src1, 0xff),
            (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
             (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
              sub_16bit)>,
       Requires<[Not64BitMode]>;
 
 // r & (2^32-1) ==> movz
 // Implemented as a 32-bit register move wrapped in SUBREG_TO_REG.
 def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
           (SUBREG_TO_REG (i64 0),
                          (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
                          sub_32bit)>;
 // r & (2^16-1) ==> movz
 def : Pat<(and GR64:$src, 0xffff),
           (SUBREG_TO_REG (i64 0),
                       (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
                       sub_32bit)>;
 // r & (2^8-1) ==> movz
 def : Pat<(and GR64:$src, 0xff),
           (SUBREG_TO_REG (i64 0),
                          (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
                          sub_32bit)>;
 // r & (2^8-1) ==> movz
 // In 64-bit mode sub_8bit is taken directly, without the ABCD copy.
 def : Pat<(and GR32:$src1, 0xff),
            (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
       Requires<[In64BitMode]>;
 // r & (2^8-1) ==> movz
 def : Pat<(and GR16:$src1, 0xff),
            (EXTRACT_SUBREG (MOVZX32rr8 (i8
             (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
       Requires<[In64BitMode]>;
 
 
 // sext_inreg patterns
 //
 // Sign-extending the low 8/16/32 bits of a register in place is done by
 // extracting the narrow subregister and feeding it to the matching MOVSX.
 // Outside 64-bit mode the 8-bit forms first copy the source into an *_ABCD
 // register class before taking sub_8bit; in 64-bit mode sub_8bit is taken
 // directly.
 def : Pat<(sext_inreg GR32:$src, i16),
           (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
 def : Pat<(sext_inreg GR32:$src, i8),
           (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
                                                              GR32_ABCD)),
                                       sub_8bit))>,
       Requires<[Not64BitMode]>;
 
 // The copy into GR16_ABCD holds a 16-bit value, so it is typed i16 like the
 // other GR16_ABCD copies in this file.  The MOVSX32rr8 result is 32 bits
 // wide and is narrowed back with sub_16bit.
 def : Pat<(sext_inreg GR16:$src, i8),
            (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
             (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
              sub_16bit)>,
       Requires<[Not64BitMode]>;
 
 def : Pat<(sext_inreg GR64:$src, i32),
           (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
 def : Pat<(sext_inreg GR64:$src, i16),
           (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
 def : Pat<(sext_inreg GR64:$src, i8),
           (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
 def : Pat<(sext_inreg GR32:$src, i8),
           (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
       Requires<[In64BitMode]>;
 def : Pat<(sext_inreg GR16:$src, i8),
            (EXTRACT_SUBREG (MOVSX32rr8
             (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
       Requires<[In64BitMode]>;
 
 // sext, sext_load, zext, zext_load
 // i8 -> i16 extensions are performed with the 32-bit MOVSX/MOVZX forms and
 // the low 16 bits of the result are then extracted.
 def: Pat<(i16 (sext GR8:$src)),
           (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
 def: Pat<(sextloadi16i8 addr:$src),
           (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
 def: Pat<(i16 (zext GR8:$src)),
           (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
 def: Pat<(zextloadi16i8 addr:$src),
           (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;
 
 // trunc patterns
 // Truncation is a plain subregister extract.  Outside 64-bit mode,
 // truncating to i8 first copies the source into the matching *_ABCD register
 // class; in 64-bit mode sub_8bit is extracted directly.
 def : Pat<(i16 (trunc GR32:$src)),
           (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
 def : Pat<(i8 (trunc GR32:$src)),
           (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
                           sub_8bit)>,
       Requires<[Not64BitMode]>;
 def : Pat<(i8 (trunc GR16:$src)),
           (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                           sub_8bit)>,
       Requires<[Not64BitMode]>;
 def : Pat<(i32 (trunc GR64:$src)),
           (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
 def : Pat<(i16 (trunc GR64:$src)),
           (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
 def : Pat<(i8 (trunc GR64:$src)),
           (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
 def : Pat<(i8 (trunc GR32:$src)),
           (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
       Requires<[In64BitMode]>;
 def : Pat<(i8 (trunc GR16:$src)),
           (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
       Requires<[In64BitMode]>;
 
 // h-register tricks
 // Outside 64-bit mode, bits 8..15 of a value are read through the
 // sub_8bit_hi subregister of an *_ABCD register instead of shifting.
 //
 // Truncating (x >> 8) to i8 is just the high-byte subregister.
 def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
           (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                           sub_8bit_hi)>,
       Requires<[Not64BitMode]>;
 def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
           (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
                           sub_8bit_hi)>,
       Requires<[Not64BitMode]>;
 // A 16-bit (x >> 8) zero-extends the high byte to 32 bits and keeps the low
 // 16 bits of that.
 def : Pat<(srl GR16:$src, (i8 8)),
           (EXTRACT_SUBREG
             (MOVZX32rr8
               (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                               sub_8bit_hi)),
             sub_16bit)>,
       Requires<[Not64BitMode]>;
 // zext and anyext of a 16-bit (x >> 8) both select the same MOVZX32rr8 of
 // the high byte.
 def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
           (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
                                                              GR16_ABCD)),
                                       sub_8bit_hi))>,
       Requires<[Not64BitMode]>;
 def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
           (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
                                                              GR16_ABCD)),
                                       sub_8bit_hi))>,
       Requires<[Not64BitMode]>;
 // (x >> 8) & 255 and (x & 0xff00) >> 8 are two spellings of the same
 // extraction of bits 8..15.
 def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
           (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
                                                              GR32_ABCD)),
                                       sub_8bit_hi))>,
       Requires<[Not64BitMode]>;
 def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
           (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
                                                              GR32_ABCD)),
                                       sub_8bit_hi))>,
       Requires<[Not64BitMode]>;
 
 // h-register tricks.
 // For now, be conservative on x86-64 and use an h-register extract only if the
 // value is immediately zero-extended or stored, which are somewhat common
 // cases. This uses a bunch of code to prevent a register requiring a REX prefix
 // from being allocated in the same instruction as the h register, as there's
 // currently no way to describe this requirement to the register allocator.
 
 // h-register extract and zero-extend.
 // All of these use MOVZX32_NOREXrr8 on the sub_8bit_hi subregister of an
 // *_ABCD copy of the source; 64-bit results are wrapped in SUBREG_TO_REG
 // with a zero immediate.
 def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
           (SUBREG_TO_REG
             (i64 0),
             (MOVZX32_NOREXrr8
               (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
                               sub_8bit_hi)),
             sub_32bit)>;
 def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
           (MOVZX32_NOREXrr8
             (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
                             sub_8bit_hi))>,
       Requires<[In64BitMode]>;
 def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
           (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
                                                                    GR32_ABCD)),
                                              sub_8bit_hi))>,
       Requires<[In64BitMode]>;
 def : Pat<(srl GR16:$src, (i8 8)),
           (EXTRACT_SUBREG
             (MOVZX32_NOREXrr8
               (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                               sub_8bit_hi)),
             sub_16bit)>,
       Requires<[In64BitMode]>;
 def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
           (MOVZX32_NOREXrr8
             (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                             sub_8bit_hi))>,
       Requires<[In64BitMode]>;
 def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
           (MOVZX32_NOREXrr8
             (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                             sub_8bit_hi))>,
       Requires<[In64BitMode]>;
 def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
           (SUBREG_TO_REG
             (i64 0),
             (MOVZX32_NOREXrr8
               (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                               sub_8bit_hi)),
             sub_32bit)>;
 def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
           (SUBREG_TO_REG
             (i64 0),
             (MOVZX32_NOREXrr8
               (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                               sub_8bit_hi)),
             sub_32bit)>;
 
 // h-register extract and store.
 // Storing the i8 truncation of (x >> 8) writes the sub_8bit_hi subregister
 // of an *_ABCD copy of the source with MOV8mr_NOREX.
 def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
           (MOV8mr_NOREX
             addr:$dst,
             (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
                             sub_8bit_hi))>;
 def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
           (MOV8mr_NOREX
             addr:$dst,
             (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
                             sub_8bit_hi))>,
       Requires<[In64BitMode]>;
 def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
           (MOV8mr_NOREX
             addr:$dst,
             (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                             sub_8bit_hi))>,
       Requires<[In64BitMode]>;
 
 
 // (shl x, 1) ==> (add x, x)
 // Note that if x is undef (immediate or otherwise), we could theoretically
 // end up with the two uses of x getting different values, producing a result
 // where the least significant bit is not 0. However, the probability of this
 // happening is considered low enough that this is officially not a
 // "real problem".
 // The same source register is passed as both ADD operands, for every width.
 def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr  GR8 :$src1, GR8 :$src1)>;
 def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
 def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
 def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
 
 // Helper imms that check if a mask doesn't change significant shift bits.
 // immShift32 accepts any i8 mask whose low 5 bits are all ones, immShift64
 // any mask whose low 6 bits are all ones.
 def immShift32 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 5; }]>;
 def immShift64 : ImmLeaf<i8, [{ return CountTrailingOnes_32(Imm) >= 6; }]>;
 
 // Shift amount is implicitly masked.
 //
 // For the shift/rotate node `frag` and the instruction name prefix `name`
 // (e.g. "SHL"), drop an explicit 'and' on the CL shift count when the mask
 // keeps all the count bits that matter.  Register forms select
 // <name><width>rCL, load-op-store forms on the same address select
 // <name><width>mCL.
 multiclass MaskedShiftAmountPats<SDNode frag, string name> {
   // (shift x (and y, 31)) ==> (shift x, y)
   def : Pat<(frag GR8:$src1, (and CL, immShift32)),
             (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
   def : Pat<(frag GR16:$src1, (and CL, immShift32)),
             (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
   def : Pat<(frag GR32:$src1, (and CL, immShift32)),
             (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
   def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
             (!cast<Instruction>(name # "8mCL") addr:$dst)>;
   def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
             (!cast<Instruction>(name # "16mCL") addr:$dst)>;
   def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
             (!cast<Instruction>(name # "32mCL") addr:$dst)>;
 
   // (shift x (and y, 63)) ==> (shift x, y)
   // The memory form uses immShift64 like the register form, so that any mask
   // with at least 6 trailing ones is folded, not just the literal 63.
   def : Pat<(frag GR64:$src1, (and CL, immShift64)),
             (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
   def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
             (!cast<Instruction>(name # "64mCL") addr:$dst)>;
 }
 
 // Instantiate the patterns for shifts and rotates.
 defm : MaskedShiftAmountPats<shl, "SHL">;
 defm : MaskedShiftAmountPats<srl, "SHR">;
 defm : MaskedShiftAmountPats<sra, "SAR">;
 defm : MaskedShiftAmountPats<rotl, "ROL">;
 defm : MaskedShiftAmountPats<rotr, "ROR">;
 
 // (anyext (setcc_carry)) -> (setcc_carry)
 // An any-extended carry-based setcc (X86_COND_B on EFLAGS) is selected
 // directly as the SETB_C pseudo of the destination width (16 or 32 bits)
 // rather than as a narrower result followed by an extension.
 def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
           (SETB_C16r)>;
 def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
           (SETB_C32r)>;
 def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
           (SETB_C32r)>;
 
 
 
 
 //===----------------------------------------------------------------------===//
 // EFLAGS-defining Patterns
 //===----------------------------------------------------------------------===//
 
 // Plain `add` nodes for 8/16/32-bit operands, selected to the ADD
 // instructions in their register, memory and immediate operand forms.
 
 // add reg, reg
 def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr  GR8 :$src1, GR8 :$src2)>;
 def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
 def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;
 
 // add reg, mem
 // The second operand is a load that is folded into the instruction.
 def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
           (ADD8rm GR8:$src1, addr:$src2)>;
 def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
           (ADD16rm GR16:$src1, addr:$src2)>;
 def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
           (ADD32rm GR32:$src1, addr:$src2)>;
 
 // add reg, imm
 // The ri8 forms take immediates accepted by the i16immSExt8 / i32immSExt8
 // predicates; the ri forms take any immediate.
 def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri  GR8:$src1 , imm:$src2)>;
 def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>;
 def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>;
 def : Pat<(add GR16:$src1, i16immSExt8:$src2),
           (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
 def : Pat<(add GR32:$src1, i32immSExt8:$src2),
           (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
 
 // Plain `sub` nodes for 8/16/32-bit operands, selected to the SUB
 // instructions, followed by the negate patterns for all four widths.
 
 // sub reg, reg
 def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr  GR8 :$src1, GR8 :$src2)>;
 def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>;
 def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>;
 
 // sub reg, mem
 // Only the subtrahend (second operand) may be the folded load.
 def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
           (SUB8rm GR8:$src1, addr:$src2)>;
 def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
           (SUB16rm GR16:$src1, addr:$src2)>;
 def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
           (SUB32rm GR32:$src1, addr:$src2)>;
 
 // sub reg, imm
 // The ri8 forms take immediates accepted by the i16immSExt8 / i32immSExt8
 // predicates; the ri forms take any immediate.
 def : Pat<(sub GR8:$src1, imm:$src2),
           (SUB8ri GR8:$src1, imm:$src2)>;
 def : Pat<(sub GR16:$src1, imm:$src2),
           (SUB16ri GR16:$src1, imm:$src2)>;
 def : Pat<(sub GR32:$src1, imm:$src2),
           (SUB32ri GR32:$src1, imm:$src2)>;
 def : Pat<(sub GR16:$src1, i16immSExt8:$src2),
           (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
 def : Pat<(sub GR32:$src1, i32immSExt8:$src2),
           (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
 
 // sub 0, reg
 // A flag-producing subtraction from the constant 0 is selected as NEG.
 def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r  GR8 :$src)>;
 def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
 def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
 def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
 
 // Plain `mul` nodes for 16/32-bit operands, selected to the IMUL
 // instructions. No 8-bit patterns are provided in this group.
 
 // mul reg, reg
 def : Pat<(mul GR16:$src1, GR16:$src2),
           (IMUL16rr GR16:$src1, GR16:$src2)>;
 def : Pat<(mul GR32:$src1, GR32:$src2),
           (IMUL32rr GR32:$src1, GR32:$src2)>;
 
 // mul reg, mem
 def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
           (IMUL16rm GR16:$src1, addr:$src2)>;
 def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
           (IMUL32rm GR32:$src1, addr:$src2)>;
 
 // mul reg, imm
 // The rri8 forms take immediates accepted by the i16immSExt8 / i32immSExt8
 // predicates; the rri forms take any immediate.
 def : Pat<(mul GR16:$src1, imm:$src2),
           (IMUL16rri GR16:$src1, imm:$src2)>;
 def : Pat<(mul GR32:$src1, imm:$src2),
           (IMUL32rri GR32:$src1, imm:$src2)>;
 def : Pat<(mul GR16:$src1, i16immSExt8:$src2),
           (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
 def : Pat<(mul GR32:$src1, i32immSExt8:$src2),
           (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
 
 // reg = mul mem, imm
 // Here the multiplicand itself is the folded load and the multiplier is an
 // immediate; the result goes to a register.
 def : Pat<(mul (loadi16 addr:$src1), imm:$src2),
           (IMUL16rmi addr:$src1, imm:$src2)>;
 def : Pat<(mul (loadi32 addr:$src1), imm:$src2),
           (IMUL32rmi addr:$src1, imm:$src2)>;
 def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2),
           (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
 def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2),
           (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
 
 // Patterns for nodes that do not produce flags, for instructions that do.
 
 // 64-bit counterparts of the add/sub/mul patterns. Immediates are limited
 // to those accepted by the i64immSExt8 and i64immSExt32 predicates.
 
 // addition
 def : Pat<(add GR64:$src1, GR64:$src2),
           (ADD64rr GR64:$src1, GR64:$src2)>;
 def : Pat<(add GR64:$src1, i64immSExt8:$src2),
           (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
 def : Pat<(add GR64:$src1, i64immSExt32:$src2),
           (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
 def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
           (ADD64rm GR64:$src1, addr:$src2)>;
 
 // subtraction
 def : Pat<(sub GR64:$src1, GR64:$src2),
           (SUB64rr GR64:$src1, GR64:$src2)>;
 def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
           (SUB64rm GR64:$src1, addr:$src2)>;
 def : Pat<(sub GR64:$src1, i64immSExt8:$src2),
           (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
 def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
           (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
 
 // Multiply
 // Covers reg*reg, reg*mem, reg*imm and mem*imm (result in a register).
 def : Pat<(mul GR64:$src1, GR64:$src2),
           (IMUL64rr GR64:$src1, GR64:$src2)>;
 def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
           (IMUL64rm GR64:$src1, addr:$src2)>;
 def : Pat<(mul GR64:$src1, i64immSExt8:$src2),
           (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
 def : Pat<(mul GR64:$src1, i64immSExt32:$src2),
           (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
 def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2),
           (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
 def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
           (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
 
 // Increment/Decrement reg.
 // Do not make INC/DEC if it is slow
 // Under the NotSlowIncDec predicate, an add of the constant 1 is selected as
 // INC and an add of the constant -1 as DEC, for all four GPR widths.
 let Predicates = [NotSlowIncDec] in {
   def : Pat<(add GR8:$src, 1),   (INC8r GR8:$src)>;
   def : Pat<(add GR16:$src, 1),  (INC16r GR16:$src)>;
   def : Pat<(add GR32:$src, 1),  (INC32r GR32:$src)>;
   def : Pat<(add GR64:$src, 1),  (INC64r GR64:$src)>;
   def : Pat<(add GR8:$src, -1),  (DEC8r GR8:$src)>;
   def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>;
   def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>;
   def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>;
 }
 
 // Plain `or` nodes for all four GPR widths, selected to the OR instructions.
 
 // or reg/reg.
 def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr  GR8 :$src1, GR8 :$src2)>;
 def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>;
 def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>;
 def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>;
 
 // or reg/mem
 def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
           (OR8rm GR8:$src1, addr:$src2)>;
 def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
           (OR16rm GR16:$src1, addr:$src2)>;
 def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
           (OR32rm GR32:$src1, addr:$src2)>;
 def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
           (OR64rm GR64:$src1, addr:$src2)>;
 
 // or reg/imm
 // 8/16/32-bit operands accept any immediate; 64-bit operands only those
 // accepted by the i64immSExt8 / i64immSExt32 predicates.
 def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri  GR8 :$src1, imm:$src2)>;
 def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>;
 def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>;
 def : Pat<(or GR16:$src1, i16immSExt8:$src2),
           (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
 def : Pat<(or GR32:$src1, i32immSExt8:$src2),
           (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
 def : Pat<(or GR64:$src1, i64immSExt8:$src2),
           (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
 def : Pat<(or GR64:$src1, i64immSExt32:$src2),
           (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
 
 // Plain `xor` nodes for all four GPR widths, selected to the XOR
 // instructions.
 
 // xor reg/reg
 def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr  GR8 :$src1, GR8 :$src2)>;
 def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>;
 def : Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>;
 def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>;
 
 // xor reg/mem
 def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
           (XOR8rm GR8:$src1, addr:$src2)>;
 def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
           (XOR16rm GR16:$src1, addr:$src2)>;
 def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
           (XOR32rm GR32:$src1, addr:$src2)>;
 def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
           (XOR64rm GR64:$src1, addr:$src2)>;
 
 // xor reg/imm
 // 8/16/32-bit operands accept any immediate; 64-bit operands only those
 // accepted by the i64immSExt8 / i64immSExt32 predicates.
 def : Pat<(xor GR8:$src1, imm:$src2),
           (XOR8ri GR8:$src1, imm:$src2)>;
 def : Pat<(xor GR16:$src1, imm:$src2),
           (XOR16ri GR16:$src1, imm:$src2)>;
 def : Pat<(xor GR32:$src1, imm:$src2),
           (XOR32ri GR32:$src1, imm:$src2)>;
 def : Pat<(xor GR16:$src1, i16immSExt8:$src2),
           (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
 def : Pat<(xor GR32:$src1, i32immSExt8:$src2),
           (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
 def : Pat<(xor GR64:$src1, i64immSExt8:$src2),
           (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
 def : Pat<(xor GR64:$src1, i64immSExt32:$src2),
           (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
 
 // Plain `and` nodes for all four GPR widths, selected to the AND
 // instructions.
 
 // and reg/reg
 def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr  GR8 :$src1, GR8 :$src2)>;
 def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>;
 def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>;
 def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>;
 
 // and reg/mem
 def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
           (AND8rm GR8:$src1, addr:$src2)>;
 def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
           (AND16rm GR16:$src1, addr:$src2)>;
 def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
           (AND32rm GR32:$src1, addr:$src2)>;
 def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
           (AND64rm GR64:$src1, addr:$src2)>;
 
 // and reg/imm
 // 8/16/32-bit operands accept any immediate; 64-bit operands only those
 // accepted by the i64immSExt8 / i64immSExt32 predicates.
 def : Pat<(and GR8:$src1, imm:$src2),
           (AND8ri GR8:$src1, imm:$src2)>;
 def : Pat<(and GR16:$src1, imm:$src2),
           (AND16ri GR16:$src1, imm:$src2)>;
 def : Pat<(and GR32:$src1, imm:$src2),
           (AND32ri GR32:$src1, imm:$src2)>;
 def : Pat<(and GR16:$src1, i16immSExt8:$src2),
           (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
 def : Pat<(and GR32:$src1, i32immSExt8:$src2),
           (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
 def : Pat<(and GR64:$src1, i64immSExt8:$src2),
           (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
 def : Pat<(and GR64:$src1, i64immSExt32:$src2),
           (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
 
 // Bit scan instruction patterns to match explicit zero-undef behavior.
 // cttz_zero_undef is selected straight to BSF with no additional handling of
 // a zero input, for 16/32/64-bit register operands and folded loads. No 8-bit
 // forms are provided.
 def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>;
 def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>;
 def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>;
 def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>;
 def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>;
 def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>;
 
 // When HasMOVBE is enabled it is possible to get a non-legalized
 // register-register 16 bit bswap. This maps it to a ROL instruction.
 // Rotating a 16-bit value left by 8 exchanges its two bytes, which is
 // exactly the byte swap being requested.
 let Predicates = [HasMOVBE] in {
  def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>;
 }
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
index 461569345a11..6b6b8aedc9c6 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1,5922 +1,5974 @@
 //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file contains the X86 implementation of the TargetInstrInfo class.
 //
 //===----------------------------------------------------------------------===//
 
 #include "X86InstrInfo.h"
 #include "X86.h"
 #include "X86InstrBuilder.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
 #include <limits>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "x86-instr-info"
 
 #define GET_INSTRINFO_CTOR_DTOR
 #include "X86GenInstrInfo.inc"
 
 // -disable-spill-fusing: boolean switch described as disabling the fusing of
 // spill code into instructions. No explicit initial value is given here.
 static cl::opt<bool>
 NoFusing("disable-spill-fusing",
          cl::desc("Disable fusing of spill code into instructions"));
 // -print-failed-fuse-candidates: hidden diagnostic switch described as
 // printing the instructions the allocator wanted to fuse but this backend
 // could not.
 static cl::opt<bool>
 PrintFailedFusing("print-failed-fuse-candidates",
                   cl::desc("Print instructions that the allocator wants to"
                            " fuse, but the X86 backend currently can't"),
                   cl::Hidden);
 // -remat-pic-stub-load: hidden switch, initialised to false, described as
 // enabling re-materialization of loads from a stub in PIC mode.
 static cl::opt<bool>
 ReMatPICStubLoad("remat-pic-stub-load",
                  cl::desc("Re-materialize load from stub in PIC mode"),
                  cl::init(false), cl::Hidden);
 
 // Flag values attached to each register-form/memory-form opcode pair in the
 // folding tables of this file.
 enum {
   // Bits 0-3: selects which memory operand is being unfolded.
   TB_INDEX_0    = 0,
   TB_INDEX_1    = 1,
   TB_INDEX_2    = 2,
   TB_INDEX_3    = 3,
   TB_INDEX_4    = 4,
   TB_INDEX_MASK = 0xf,
 
   // Bit 4: do not insert the reverse map (MemOp -> RegOp) into the table.
   // This may be needed because there is a many -> one mapping.
   TB_NO_REVERSE   = 0x10,
 
   // Bit 5: do not insert the forward map (RegOp -> MemOp) into the table.
   // This is needed for Native Client, which prohibits branch
   // instructions from using a memory operand.
   TB_NO_FORWARD   = 0x20,
 
   // Bits 6 and 7: folded-load / folded-store markers.
   TB_FOLDED_LOAD  = 0x40,
   TB_FOLDED_STORE = 0x80,
 
   // Bits 8-15: minimum alignment required for load/store.
   // Used for RegOp->MemOp conversion.
   TB_ALIGN_SHIFT = 8,
   TB_ALIGN_NONE  = 0,
   TB_ALIGN_16    = (16 << TB_ALIGN_SHIFT),
   TB_ALIGN_32    = (32 << TB_ALIGN_SHIFT),
   TB_ALIGN_64    = (64 << TB_ALIGN_SHIFT),
   TB_ALIGN_MASK  = 0xff00
 };
 
 // One row of the static folding tables in this file: a register-form opcode,
 // its memory-form counterpart, and the TB_* flags that describe the pair.
 // Rows are written as positional aggregate initialisers, so member order
 // matters.
 struct X86OpTblEntry {
   uint16_t RegOp; // Register-form opcode (an X86::* value).
   uint16_t MemOp; // Memory-form opcode (an X86::* value).
   uint16_t Flags; // Bitwise OR of TB_* flag values.
 };
 
 // Pin the vtable to this file.
 // The body is intentionally empty; the function exists only to be defined
 // out of line here.
 void X86InstrInfo::anchor() {}
 
 X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     : X86GenInstrInfo(
           (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32),
           (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)),
       Subtarget(STI), RI(STI) {
 
   static const X86OpTblEntry OpTbl2Addr[] = {
     { X86::ADC32ri,     X86::ADC32mi,    0 },
     { X86::ADC32ri8,    X86::ADC32mi8,   0 },
     { X86::ADC32rr,     X86::ADC32mr,    0 },
     { X86::ADC64ri32,   X86::ADC64mi32,  0 },
     { X86::ADC64ri8,    X86::ADC64mi8,   0 },
     { X86::ADC64rr,     X86::ADC64mr,    0 },
     { X86::ADD16ri,     X86::ADD16mi,    0 },
     { X86::ADD16ri8,    X86::ADD16mi8,   0 },
     { X86::ADD16ri_DB,  X86::ADD16mi,    TB_NO_REVERSE },
     { X86::ADD16ri8_DB, X86::ADD16mi8,   TB_NO_REVERSE },
     { X86::ADD16rr,     X86::ADD16mr,    0 },
     { X86::ADD16rr_DB,  X86::ADD16mr,    TB_NO_REVERSE },
     { X86::ADD32ri,     X86::ADD32mi,    0 },
     { X86::ADD32ri8,    X86::ADD32mi8,   0 },
     { X86::ADD32ri_DB,  X86::ADD32mi,    TB_NO_REVERSE },
     { X86::ADD32ri8_DB, X86::ADD32mi8,   TB_NO_REVERSE },
     { X86::ADD32rr,     X86::ADD32mr,    0 },
     { X86::ADD32rr_DB,  X86::ADD32mr,    TB_NO_REVERSE },
     { X86::ADD64ri32,   X86::ADD64mi32,  0 },
     { X86::ADD64ri8,    X86::ADD64mi8,   0 },
     { X86::ADD64ri32_DB,X86::ADD64mi32,  TB_NO_REVERSE },
     { X86::ADD64ri8_DB, X86::ADD64mi8,   TB_NO_REVERSE },
     { X86::ADD64rr,     X86::ADD64mr,    0 },
     { X86::ADD64rr_DB,  X86::ADD64mr,    TB_NO_REVERSE },
     { X86::ADD8ri,      X86::ADD8mi,     0 },
     { X86::ADD8rr,      X86::ADD8mr,     0 },
     { X86::AND16ri,     X86::AND16mi,    0 },
     { X86::AND16ri8,    X86::AND16mi8,   0 },
     { X86::AND16rr,     X86::AND16mr,    0 },
     { X86::AND32ri,     X86::AND32mi,    0 },
     { X86::AND32ri8,    X86::AND32mi8,   0 },
     { X86::AND32rr,     X86::AND32mr,    0 },
     { X86::AND64ri32,   X86::AND64mi32,  0 },
     { X86::AND64ri8,    X86::AND64mi8,   0 },
     { X86::AND64rr,     X86::AND64mr,    0 },
     { X86::AND8ri,      X86::AND8mi,     0 },
     { X86::AND8rr,      X86::AND8mr,     0 },
     { X86::DEC16r,      X86::DEC16m,     0 },
     { X86::DEC32r,      X86::DEC32m,     0 },
     { X86::DEC64r,      X86::DEC64m,     0 },
     { X86::DEC8r,       X86::DEC8m,      0 },
     { X86::INC16r,      X86::INC16m,     0 },
     { X86::INC32r,      X86::INC32m,     0 },
     { X86::INC64r,      X86::INC64m,     0 },
     { X86::INC8r,       X86::INC8m,      0 },
     { X86::NEG16r,      X86::NEG16m,     0 },
     { X86::NEG32r,      X86::NEG32m,     0 },
     { X86::NEG64r,      X86::NEG64m,     0 },
     { X86::NEG8r,       X86::NEG8m,      0 },
     { X86::NOT16r,      X86::NOT16m,     0 },
     { X86::NOT32r,      X86::NOT32m,     0 },
     { X86::NOT64r,      X86::NOT64m,     0 },
     { X86::NOT8r,       X86::NOT8m,      0 },
     { X86::OR16ri,      X86::OR16mi,     0 },
     { X86::OR16ri8,     X86::OR16mi8,    0 },
     { X86::OR16rr,      X86::OR16mr,     0 },
     { X86::OR32ri,      X86::OR32mi,     0 },
     { X86::OR32ri8,     X86::OR32mi8,    0 },
     { X86::OR32rr,      X86::OR32mr,     0 },
     { X86::OR64ri32,    X86::OR64mi32,   0 },
     { X86::OR64ri8,     X86::OR64mi8,    0 },
     { X86::OR64rr,      X86::OR64mr,     0 },
     { X86::OR8ri,       X86::OR8mi,      0 },
     { X86::OR8rr,       X86::OR8mr,      0 },
     { X86::ROL16r1,     X86::ROL16m1,    0 },
     { X86::ROL16rCL,    X86::ROL16mCL,   0 },
     { X86::ROL16ri,     X86::ROL16mi,    0 },
     { X86::ROL32r1,     X86::ROL32m1,    0 },
     { X86::ROL32rCL,    X86::ROL32mCL,   0 },
     { X86::ROL32ri,     X86::ROL32mi,    0 },
     { X86::ROL64r1,     X86::ROL64m1,    0 },
     { X86::ROL64rCL,    X86::ROL64mCL,   0 },
     { X86::ROL64ri,     X86::ROL64mi,    0 },
     { X86::ROL8r1,      X86::ROL8m1,     0 },
     { X86::ROL8rCL,     X86::ROL8mCL,    0 },
     { X86::ROL8ri,      X86::ROL8mi,     0 },
     { X86::ROR16r1,     X86::ROR16m1,    0 },
     { X86::ROR16rCL,    X86::ROR16mCL,   0 },
     { X86::ROR16ri,     X86::ROR16mi,    0 },
     { X86::ROR32r1,     X86::ROR32m1,    0 },
     { X86::ROR32rCL,    X86::ROR32mCL,   0 },
     { X86::ROR32ri,     X86::ROR32mi,    0 },
     { X86::ROR64r1,     X86::ROR64m1,    0 },
     { X86::ROR64rCL,    X86::ROR64mCL,   0 },
     { X86::ROR64ri,     X86::ROR64mi,    0 },
     { X86::ROR8r1,      X86::ROR8m1,     0 },
     { X86::ROR8rCL,     X86::ROR8mCL,    0 },
     { X86::ROR8ri,      X86::ROR8mi,     0 },
     { X86::SAR16r1,     X86::SAR16m1,    0 },
     { X86::SAR16rCL,    X86::SAR16mCL,   0 },
     { X86::SAR16ri,     X86::SAR16mi,    0 },
     { X86::SAR32r1,     X86::SAR32m1,    0 },
     { X86::SAR32rCL,    X86::SAR32mCL,   0 },
     { X86::SAR32ri,     X86::SAR32mi,    0 },
     { X86::SAR64r1,     X86::SAR64m1,    0 },
     { X86::SAR64rCL,    X86::SAR64mCL,   0 },
     { X86::SAR64ri,     X86::SAR64mi,    0 },
     { X86::SAR8r1,      X86::SAR8m1,     0 },
     { X86::SAR8rCL,     X86::SAR8mCL,    0 },
     { X86::SAR8ri,      X86::SAR8mi,     0 },
     { X86::SBB32ri,     X86::SBB32mi,    0 },
     { X86::SBB32ri8,    X86::SBB32mi8,   0 },
     { X86::SBB32rr,     X86::SBB32mr,    0 },
     { X86::SBB64ri32,   X86::SBB64mi32,  0 },
     { X86::SBB64ri8,    X86::SBB64mi8,   0 },
     { X86::SBB64rr,     X86::SBB64mr,    0 },
     { X86::SHL16rCL,    X86::SHL16mCL,   0 },
     { X86::SHL16ri,     X86::SHL16mi,    0 },
     { X86::SHL32rCL,    X86::SHL32mCL,   0 },
     { X86::SHL32ri,     X86::SHL32mi,    0 },
     { X86::SHL64rCL,    X86::SHL64mCL,   0 },
     { X86::SHL64ri,     X86::SHL64mi,    0 },
     { X86::SHL8rCL,     X86::SHL8mCL,    0 },
     { X86::SHL8ri,      X86::SHL8mi,     0 },
     { X86::SHLD16rrCL,  X86::SHLD16mrCL, 0 },
     { X86::SHLD16rri8,  X86::SHLD16mri8, 0 },
     { X86::SHLD32rrCL,  X86::SHLD32mrCL, 0 },
     { X86::SHLD32rri8,  X86::SHLD32mri8, 0 },
     { X86::SHLD64rrCL,  X86::SHLD64mrCL, 0 },
     { X86::SHLD64rri8,  X86::SHLD64mri8, 0 },
     { X86::SHR16r1,     X86::SHR16m1,    0 },
     { X86::SHR16rCL,    X86::SHR16mCL,   0 },
     { X86::SHR16ri,     X86::SHR16mi,    0 },
     { X86::SHR32r1,     X86::SHR32m1,    0 },
     { X86::SHR32rCL,    X86::SHR32mCL,   0 },
     { X86::SHR32ri,     X86::SHR32mi,    0 },
     { X86::SHR64r1,     X86::SHR64m1,    0 },
     { X86::SHR64rCL,    X86::SHR64mCL,   0 },
     { X86::SHR64ri,     X86::SHR64mi,    0 },
     { X86::SHR8r1,      X86::SHR8m1,     0 },
     { X86::SHR8rCL,     X86::SHR8mCL,    0 },
     { X86::SHR8ri,      X86::SHR8mi,     0 },
     { X86::SHRD16rrCL,  X86::SHRD16mrCL, 0 },
     { X86::SHRD16rri8,  X86::SHRD16mri8, 0 },
     { X86::SHRD32rrCL,  X86::SHRD32mrCL, 0 },
     { X86::SHRD32rri8,  X86::SHRD32mri8, 0 },
     { X86::SHRD64rrCL,  X86::SHRD64mrCL, 0 },
     { X86::SHRD64rri8,  X86::SHRD64mri8, 0 },
     { X86::SUB16ri,     X86::SUB16mi,    0 },
     { X86::SUB16ri8,    X86::SUB16mi8,   0 },
     { X86::SUB16rr,     X86::SUB16mr,    0 },
     { X86::SUB32ri,     X86::SUB32mi,    0 },
     { X86::SUB32ri8,    X86::SUB32mi8,   0 },
     { X86::SUB32rr,     X86::SUB32mr,    0 },
     { X86::SUB64ri32,   X86::SUB64mi32,  0 },
     { X86::SUB64ri8,    X86::SUB64mi8,   0 },
     { X86::SUB64rr,     X86::SUB64mr,    0 },
     { X86::SUB8ri,      X86::SUB8mi,     0 },
     { X86::SUB8rr,      X86::SUB8mr,     0 },
     { X86::XOR16ri,     X86::XOR16mi,    0 },
     { X86::XOR16ri8,    X86::XOR16mi8,   0 },
     { X86::XOR16rr,     X86::XOR16mr,    0 },
     { X86::XOR32ri,     X86::XOR32mi,    0 },
     { X86::XOR32ri8,    X86::XOR32mi8,   0 },
     { X86::XOR32rr,     X86::XOR32mr,    0 },
     { X86::XOR64ri32,   X86::XOR64mi32,  0 },
     { X86::XOR64ri8,    X86::XOR64mi8,   0 },
     { X86::XOR64rr,     X86::XOR64mr,    0 },
     { X86::XOR8ri,      X86::XOR8mi,     0 },
     { X86::XOR8rr,      X86::XOR8mr,     0 }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
     unsigned RegOp = OpTbl2Addr[i].RegOp;
     unsigned MemOp = OpTbl2Addr[i].MemOp;
     unsigned Flags = OpTbl2Addr[i].Flags;
     AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
                   RegOp, MemOp,
                   // Index 0, folded load and store, no alignment requirement.
                   Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
   }
 
   static const X86OpTblEntry OpTbl0[] = {
     { X86::BT16ri8,     X86::BT16mi8,       TB_FOLDED_LOAD },
     { X86::BT32ri8,     X86::BT32mi8,       TB_FOLDED_LOAD },
     { X86::BT64ri8,     X86::BT64mi8,       TB_FOLDED_LOAD },
     { X86::CALL32r,     X86::CALL32m,       TB_FOLDED_LOAD },
     { X86::CALL64r,     X86::CALL64m,       TB_FOLDED_LOAD },
     { X86::CMP16ri,     X86::CMP16mi,       TB_FOLDED_LOAD },
     { X86::CMP16ri8,    X86::CMP16mi8,      TB_FOLDED_LOAD },
     { X86::CMP16rr,     X86::CMP16mr,       TB_FOLDED_LOAD },
     { X86::CMP32ri,     X86::CMP32mi,       TB_FOLDED_LOAD },
     { X86::CMP32ri8,    X86::CMP32mi8,      TB_FOLDED_LOAD },
     { X86::CMP32rr,     X86::CMP32mr,       TB_FOLDED_LOAD },
     { X86::CMP64ri32,   X86::CMP64mi32,     TB_FOLDED_LOAD },
     { X86::CMP64ri8,    X86::CMP64mi8,      TB_FOLDED_LOAD },
     { X86::CMP64rr,     X86::CMP64mr,       TB_FOLDED_LOAD },
     { X86::CMP8ri,      X86::CMP8mi,        TB_FOLDED_LOAD },
     { X86::CMP8rr,      X86::CMP8mr,        TB_FOLDED_LOAD },
     { X86::DIV16r,      X86::DIV16m,        TB_FOLDED_LOAD },
     { X86::DIV32r,      X86::DIV32m,        TB_FOLDED_LOAD },
     { X86::DIV64r,      X86::DIV64m,        TB_FOLDED_LOAD },
     { X86::DIV8r,       X86::DIV8m,         TB_FOLDED_LOAD },
     { X86::EXTRACTPSrr, X86::EXTRACTPSmr,   TB_FOLDED_STORE },
     { X86::IDIV16r,     X86::IDIV16m,       TB_FOLDED_LOAD },
     { X86::IDIV32r,     X86::IDIV32m,       TB_FOLDED_LOAD },
     { X86::IDIV64r,     X86::IDIV64m,       TB_FOLDED_LOAD },
     { X86::IDIV8r,      X86::IDIV8m,        TB_FOLDED_LOAD },
     { X86::IMUL16r,     X86::IMUL16m,       TB_FOLDED_LOAD },
     { X86::IMUL32r,     X86::IMUL32m,       TB_FOLDED_LOAD },
     { X86::IMUL64r,     X86::IMUL64m,       TB_FOLDED_LOAD },
     { X86::IMUL8r,      X86::IMUL8m,        TB_FOLDED_LOAD },
     { X86::JMP32r,      X86::JMP32m,        TB_FOLDED_LOAD },
     { X86::JMP64r,      X86::JMP64m,        TB_FOLDED_LOAD },
     { X86::MOV16ri,     X86::MOV16mi,       TB_FOLDED_STORE },
     { X86::MOV16rr,     X86::MOV16mr,       TB_FOLDED_STORE },
     { X86::MOV32ri,     X86::MOV32mi,       TB_FOLDED_STORE },
     { X86::MOV32rr,     X86::MOV32mr,       TB_FOLDED_STORE },
     { X86::MOV64ri32,   X86::MOV64mi32,     TB_FOLDED_STORE },
     { X86::MOV64rr,     X86::MOV64mr,       TB_FOLDED_STORE },
     { X86::MOV8ri,      X86::MOV8mi,        TB_FOLDED_STORE },
     { X86::MOV8rr,      X86::MOV8mr,        TB_FOLDED_STORE },
     { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
     { X86::MOVAPDrr,    X86::MOVAPDmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::MOVAPSrr,    X86::MOVAPSmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::MOVDQArr,    X86::MOVDQAmr,      TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::MOVPDI2DIrr, X86::MOVPDI2DImr,   TB_FOLDED_STORE },
     { X86::MOVPQIto64rr,X86::MOVPQI2QImr,   TB_FOLDED_STORE },
     { X86::MOVSDto64rr, X86::MOVSDto64mr,   TB_FOLDED_STORE },
     { X86::MOVSS2DIrr,  X86::MOVSS2DImr,    TB_FOLDED_STORE },
     { X86::MOVUPDrr,    X86::MOVUPDmr,      TB_FOLDED_STORE },
     { X86::MOVUPSrr,    X86::MOVUPSmr,      TB_FOLDED_STORE },
     { X86::MUL16r,      X86::MUL16m,        TB_FOLDED_LOAD },
     { X86::MUL32r,      X86::MUL32m,        TB_FOLDED_LOAD },
     { X86::MUL64r,      X86::MUL64m,        TB_FOLDED_LOAD },
     { X86::MUL8r,       X86::MUL8m,         TB_FOLDED_LOAD },
     { X86::SETAEr,      X86::SETAEm,        TB_FOLDED_STORE },
     { X86::SETAr,       X86::SETAm,         TB_FOLDED_STORE },
     { X86::SETBEr,      X86::SETBEm,        TB_FOLDED_STORE },
     { X86::SETBr,       X86::SETBm,         TB_FOLDED_STORE },
     { X86::SETEr,       X86::SETEm,         TB_FOLDED_STORE },
     { X86::SETGEr,      X86::SETGEm,        TB_FOLDED_STORE },
     { X86::SETGr,       X86::SETGm,         TB_FOLDED_STORE },
     { X86::SETLEr,      X86::SETLEm,        TB_FOLDED_STORE },
     { X86::SETLr,       X86::SETLm,         TB_FOLDED_STORE },
     { X86::SETNEr,      X86::SETNEm,        TB_FOLDED_STORE },
     { X86::SETNOr,      X86::SETNOm,        TB_FOLDED_STORE },
     { X86::SETNPr,      X86::SETNPm,        TB_FOLDED_STORE },
     { X86::SETNSr,      X86::SETNSm,        TB_FOLDED_STORE },
     { X86::SETOr,       X86::SETOm,         TB_FOLDED_STORE },
     { X86::SETPr,       X86::SETPm,         TB_FOLDED_STORE },
     { X86::SETSr,       X86::SETSm,         TB_FOLDED_STORE },
     { X86::TAILJMPr,    X86::TAILJMPm,      TB_FOLDED_LOAD },
     { X86::TAILJMPr64,  X86::TAILJMPm64,    TB_FOLDED_LOAD },
     { X86::TEST16ri,    X86::TEST16mi,      TB_FOLDED_LOAD },
     { X86::TEST32ri,    X86::TEST32mi,      TB_FOLDED_LOAD },
     { X86::TEST64ri32,  X86::TEST64mi32,    TB_FOLDED_LOAD },
     { X86::TEST8ri,     X86::TEST8mi,       TB_FOLDED_LOAD },
     // AVX 128-bit versions of foldable instructions
     { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr,  TB_FOLDED_STORE  },
     { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVAPDrr,   X86::VMOVAPDmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVAPSrr,   X86::VMOVAPSmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVDQArr,   X86::VMOVDQAmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr,  TB_FOLDED_STORE },
     { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
     { X86::VMOVSDto64rr,X86::VMOVSDto64mr,  TB_FOLDED_STORE },
     { X86::VMOVSS2DIrr, X86::VMOVSS2DImr,   TB_FOLDED_STORE },
     { X86::VMOVUPDrr,   X86::VMOVUPDmr,     TB_FOLDED_STORE },
     { X86::VMOVUPSrr,   X86::VMOVUPSmr,     TB_FOLDED_STORE },
     // AVX 256-bit foldable instructions
     { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVAPDYrr,  X86::VMOVAPDYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVAPSYrr,  X86::VMOVAPSYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVDQAYrr,  X86::VMOVDQAYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVUPDYrr,  X86::VMOVUPDYmr,    TB_FOLDED_STORE },
     { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE },
     // AVX-512 foldable instructions
     { X86::VMOVPDI2DIZrr,   X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
     { X86::VMOVAPDZrr,      X86::VMOVAPDZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
     { X86::VMOVAPSZrr,      X86::VMOVAPSZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
     { X86::VMOVDQA32Zrr,    X86::VMOVDQA32Zmr,  TB_FOLDED_STORE | TB_ALIGN_64 },
     { X86::VMOVDQA64Zrr,    X86::VMOVDQA64Zmr,  TB_FOLDED_STORE | TB_ALIGN_64 },
     { X86::VMOVUPDZrr,      X86::VMOVUPDZmr,    TB_FOLDED_STORE },
     { X86::VMOVUPSZrr,      X86::VMOVUPSZmr,    TB_FOLDED_STORE },
     { X86::VMOVDQU8Zrr,     X86::VMOVDQU8Zmr,   TB_FOLDED_STORE },
     { X86::VMOVDQU16Zrr,    X86::VMOVDQU16Zmr,  TB_FOLDED_STORE },
     { X86::VMOVDQU32Zrr,    X86::VMOVDQU32Zmr,  TB_FOLDED_STORE },
     { X86::VMOVDQU64Zrr,    X86::VMOVDQU64Zmr,  TB_FOLDED_STORE },
     // AVX-512 foldable instructions (256-bit versions)
     { X86::VMOVAPDZ256rr,      X86::VMOVAPDZ256mr,    TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVAPSZ256rr,      X86::VMOVAPSZ256mr,    TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVDQA32Z256rr,    X86::VMOVDQA32Z256mr,  TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVDQA64Z256rr,    X86::VMOVDQA64Z256mr,  TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVUPDZ256rr,      X86::VMOVUPDZ256mr,    TB_FOLDED_STORE },
     { X86::VMOVUPSZ256rr,      X86::VMOVUPSZ256mr,    TB_FOLDED_STORE },
     { X86::VMOVDQU8Z256rr,     X86::VMOVDQU8Z256mr,   TB_FOLDED_STORE },
     { X86::VMOVDQU16Z256rr,    X86::VMOVDQU16Z256mr,  TB_FOLDED_STORE },
     { X86::VMOVDQU32Z256rr,    X86::VMOVDQU32Z256mr,  TB_FOLDED_STORE },
     { X86::VMOVDQU64Z256rr,    X86::VMOVDQU64Z256mr,  TB_FOLDED_STORE },
     // AVX-512 foldable instructions (128-bit versions)
     { X86::VMOVAPDZ128rr,      X86::VMOVAPDZ128mr,    TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVAPSZ128rr,      X86::VMOVAPSZ128mr,    TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVDQA32Z128rr,    X86::VMOVDQA32Z128mr,  TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVDQA64Z128rr,    X86::VMOVDQA64Z128mr,  TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVUPDZ128rr,      X86::VMOVUPDZ128mr,    TB_FOLDED_STORE },
     { X86::VMOVUPSZ128rr,      X86::VMOVUPSZ128mr,    TB_FOLDED_STORE },
     { X86::VMOVDQU8Z128rr,     X86::VMOVDQU8Z128mr,   TB_FOLDED_STORE },
     { X86::VMOVDQU16Z128rr,    X86::VMOVDQU16Z128mr,  TB_FOLDED_STORE },
     { X86::VMOVDQU32Z128rr,    X86::VMOVDQU32Z128mr,  TB_FOLDED_STORE },
     { X86::VMOVDQU64Z128rr,    X86::VMOVDQU64Z128mr,  TB_FOLDED_STORE }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
     unsigned RegOp      = OpTbl0[i].RegOp;
     unsigned MemOp      = OpTbl0[i].MemOp;
     unsigned Flags      = OpTbl0[i].Flags;
     AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
                   RegOp, MemOp, TB_INDEX_0 | Flags);
   }
 
   // Register-form / memory-form opcode pairs for folding a load into
   // operand 1.  Each entry is { RegOp, MemOp, Flags }; the loop following this
   // table passes every entry to AddTableEntry with TB_INDEX_1 | TB_FOLDED_LOAD
   // OR-ed into Flags.  Entries carry a TB_ALIGN_* flag where the memory form
   // is expected to see an aligned operand, TB_NO_REVERSE where the pair is
   // flagged as one-directional, and 0 otherwise.
   static const X86OpTblEntry OpTbl1[] = {
     { X86::CMP16rr,         X86::CMP16rm,             0 },
     { X86::CMP32rr,         X86::CMP32rm,             0 },
     { X86::CMP64rr,         X86::CMP64rm,             0 },
     { X86::CMP8rr,          X86::CMP8rm,              0 },
     { X86::CVTSD2SSrr,      X86::CVTSD2SSrm,          0 },
     { X86::CVTSI2SD64rr,    X86::CVTSI2SD64rm,        0 },
     { X86::CVTSI2SDrr,      X86::CVTSI2SDrm,          0 },
     { X86::CVTSI2SS64rr,    X86::CVTSI2SS64rm,        0 },
     { X86::CVTSI2SSrr,      X86::CVTSI2SSrm,          0 },
     { X86::CVTSS2SDrr,      X86::CVTSS2SDrm,          0 },
     { X86::CVTTSD2SI64rr,   X86::CVTTSD2SI64rm,       0 },
     { X86::CVTTSD2SIrr,     X86::CVTTSD2SIrm,         0 },
     { X86::CVTTSS2SI64rr,   X86::CVTTSS2SI64rm,       0 },
     { X86::CVTTSS2SIrr,     X86::CVTTSS2SIrm,         0 },
     { X86::IMUL16rri,       X86::IMUL16rmi,           0 },
     { X86::IMUL16rri8,      X86::IMUL16rmi8,          0 },
     { X86::IMUL32rri,       X86::IMUL32rmi,           0 },
     { X86::IMUL32rri8,      X86::IMUL32rmi8,          0 },
     { X86::IMUL64rri32,     X86::IMUL64rmi32,         0 },
     { X86::IMUL64rri8,      X86::IMUL64rmi8,          0 },
     { X86::Int_COMISDrr,    X86::Int_COMISDrm,        0 },
     { X86::Int_COMISSrr,    X86::Int_COMISSrm,        0 },
     { X86::CVTSD2SI64rr,    X86::CVTSD2SI64rm,        0 },
     { X86::CVTSD2SIrr,      X86::CVTSD2SIrm,          0 },
     { X86::CVTSS2SI64rr,    X86::CVTSS2SI64rm,        0 },
     { X86::CVTSS2SIrr,      X86::CVTSS2SIrm,          0 },
     { X86::CVTDQ2PSrr,      X86::CVTDQ2PSrm,          TB_ALIGN_16 },
     { X86::CVTPD2DQrr,      X86::CVTPD2DQrm,          TB_ALIGN_16 },
     { X86::CVTPD2PSrr,      X86::CVTPD2PSrm,          TB_ALIGN_16 },
     { X86::CVTPS2DQrr,      X86::CVTPS2DQrm,          TB_ALIGN_16 },
     { X86::CVTTPD2DQrr,     X86::CVTTPD2DQrm,         TB_ALIGN_16 },
     { X86::CVTTPS2DQrr,     X86::CVTTPS2DQrm,         TB_ALIGN_16 },
     { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm,  0 },
     { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm,     0 },
     { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm,  0 },
     { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm,     0 },
     { X86::Int_UCOMISDrr,   X86::Int_UCOMISDrm,       0 },
     { X86::Int_UCOMISSrr,   X86::Int_UCOMISSrm,       0 },
     { X86::MOV16rr,         X86::MOV16rm,             0 },
     { X86::MOV32rr,         X86::MOV32rm,             0 },
     { X86::MOV64rr,         X86::MOV64rm,             0 },
     { X86::MOV64toPQIrr,    X86::MOVQI2PQIrm,         0 },
     { X86::MOV64toSDrr,     X86::MOV64toSDrm,         0 },
     { X86::MOV8rr,          X86::MOV8rm,              0 },
     { X86::MOVAPDrr,        X86::MOVAPDrm,            TB_ALIGN_16 },
     { X86::MOVAPSrr,        X86::MOVAPSrm,            TB_ALIGN_16 },
     { X86::MOVDDUPrr,       X86::MOVDDUPrm,           0 },
     { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm,         0 },
     { X86::MOVDI2SSrr,      X86::MOVDI2SSrm,          0 },
     { X86::MOVDQArr,        X86::MOVDQArm,            TB_ALIGN_16 },
     { X86::MOVSHDUPrr,      X86::MOVSHDUPrm,          TB_ALIGN_16 },
     { X86::MOVSLDUPrr,      X86::MOVSLDUPrm,          TB_ALIGN_16 },
     { X86::MOVSX16rr8,      X86::MOVSX16rm8,          0 },
     { X86::MOVSX32rr16,     X86::MOVSX32rm16,         0 },
     { X86::MOVSX32rr8,      X86::MOVSX32rm8,          0 },
     { X86::MOVSX64rr16,     X86::MOVSX64rm16,         0 },
     { X86::MOVSX64rr32,     X86::MOVSX64rm32,         0 },
     { X86::MOVSX64rr8,      X86::MOVSX64rm8,          0 },
     // MOVUPD is the unaligned packed-double move, so it carries no alignment
     // flag -- consistent with MOVUPS below and the VMOVUPD entries.
     { X86::MOVUPDrr,        X86::MOVUPDrm,            0 },
     { X86::MOVUPSrr,        X86::MOVUPSrm,            0 },
     { X86::MOVZQI2PQIrr,    X86::MOVZQI2PQIrm,        0 },
     { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm,     TB_ALIGN_16 },
     { X86::MOVZX16rr8,      X86::MOVZX16rm8,          0 },
     { X86::MOVZX32rr16,     X86::MOVZX32rm16,         0 },
     { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8,   0 },
     { X86::MOVZX32rr8,      X86::MOVZX32rm8,          0 },
     { X86::PABSBrr128,      X86::PABSBrm128,          TB_ALIGN_16 },
     { X86::PABSDrr128,      X86::PABSDrm128,          TB_ALIGN_16 },
     { X86::PABSWrr128,      X86::PABSWrm128,          TB_ALIGN_16 },
     { X86::PSHUFDri,        X86::PSHUFDmi,            TB_ALIGN_16 },
     { X86::PSHUFHWri,       X86::PSHUFHWmi,           TB_ALIGN_16 },
     { X86::PSHUFLWri,       X86::PSHUFLWmi,           TB_ALIGN_16 },
     { X86::RCPPSr,          X86::RCPPSm,              TB_ALIGN_16 },
     { X86::RCPPSr_Int,      X86::RCPPSm_Int,          TB_ALIGN_16 },
     { X86::RSQRTPSr,        X86::RSQRTPSm,            TB_ALIGN_16 },
     { X86::RSQRTPSr_Int,    X86::RSQRTPSm_Int,        TB_ALIGN_16 },
     { X86::RSQRTSSr,        X86::RSQRTSSm,            0 },
     { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int,        0 },
     { X86::SQRTPDr,         X86::SQRTPDm,             TB_ALIGN_16 },
     { X86::SQRTPSr,         X86::SQRTPSm,             TB_ALIGN_16 },
     { X86::SQRTSDr,         X86::SQRTSDm,             0 },
     { X86::SQRTSDr_Int,     X86::SQRTSDm_Int,         0 },
     { X86::SQRTSSr,         X86::SQRTSSm,             0 },
     { X86::SQRTSSr_Int,     X86::SQRTSSm_Int,         0 },
     { X86::TEST16rr,        X86::TEST16rm,            0 },
     { X86::TEST32rr,        X86::TEST32rm,            0 },
     { X86::TEST64rr,        X86::TEST64rm,            0 },
     { X86::TEST8rr,         X86::TEST8rm,             0 },
     // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
     { X86::UCOMISDrr,       X86::UCOMISDrm,           0 },
     { X86::UCOMISSrr,       X86::UCOMISSrm,           0 },
     // AVX 128-bit versions of foldable instructions
     { X86::Int_VCOMISDrr,   X86::Int_VCOMISDrm,       0 },
     { X86::Int_VCOMISSrr,   X86::Int_VCOMISSrm,       0 },
     { X86::Int_VUCOMISDrr,  X86::Int_VUCOMISDrm,      0 },
     { X86::Int_VUCOMISSrr,  X86::Int_VUCOMISSrm,      0 },
     { X86::VCVTTSD2SI64rr,  X86::VCVTTSD2SI64rm,      0 },
     { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 },
     { X86::VCVTTSD2SIrr,    X86::VCVTTSD2SIrm,        0 },
     { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm,    0 },
     { X86::VCVTTSS2SI64rr,  X86::VCVTTSS2SI64rm,      0 },
     { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 },
     { X86::VCVTTSS2SIrr,    X86::VCVTTSS2SIrm,        0 },
     { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm,    0 },
     { X86::VCVTSD2SI64rr,   X86::VCVTSD2SI64rm,       0 },
     { X86::VCVTSD2SIrr,     X86::VCVTSD2SIrm,         0 },
     { X86::VCVTSS2SI64rr,   X86::VCVTSS2SI64rm,       0 },
     { X86::VCVTSS2SIrr,     X86::VCVTSS2SIrm,         0 },
     { X86::VCVTDQ2PSrr,     X86::VCVTDQ2PSrm,         0 },
     { X86::VCVTPD2DQrr,     X86::VCVTPD2DQXrm,        0 },
     { X86::VCVTPD2PSrr,     X86::VCVTPD2PSXrm,        0 },
     { X86::VCVTPS2DQrr,     X86::VCVTPS2DQrm,         0 },
     { X86::VCVTTPD2DQrr,    X86::VCVTTPD2DQXrm,       0 },
     { X86::VCVTTPS2DQrr,    X86::VCVTTPS2DQrm,        0 },
     { X86::VMOV64toPQIrr,   X86::VMOVQI2PQIrm,        0 },
     { X86::VMOV64toSDrr,    X86::VMOV64toSDrm,        0 },
     { X86::VMOVAPDrr,       X86::VMOVAPDrm,           TB_ALIGN_16 },
     { X86::VMOVAPSrr,       X86::VMOVAPSrm,           TB_ALIGN_16 },
     { X86::VMOVDDUPrr,      X86::VMOVDDUPrm,          0 },
     { X86::VMOVDI2PDIrr,    X86::VMOVDI2PDIrm,        0 },
     { X86::VMOVDI2SSrr,     X86::VMOVDI2SSrm,         0 },
     { X86::VMOVDQArr,       X86::VMOVDQArm,           TB_ALIGN_16 },
     { X86::VMOVSLDUPrr,     X86::VMOVSLDUPrm,         TB_ALIGN_16 },
     { X86::VMOVSHDUPrr,     X86::VMOVSHDUPrm,         TB_ALIGN_16 },
     { X86::VMOVUPDrr,       X86::VMOVUPDrm,           0 },
     { X86::VMOVUPSrr,       X86::VMOVUPSrm,           0 },
     { X86::VMOVZQI2PQIrr,   X86::VMOVZQI2PQIrm,       0 },
     { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm,    TB_ALIGN_16 },
     { X86::VPABSBrr128,     X86::VPABSBrm128,         0 },
     { X86::VPABSDrr128,     X86::VPABSDrm128,         0 },
     { X86::VPABSWrr128,     X86::VPABSWrm128,         0 },
     { X86::VPERMILPDri,     X86::VPERMILPDmi,         0 },
     { X86::VPERMILPSri,     X86::VPERMILPSmi,         0 },
     { X86::VPSHUFDri,       X86::VPSHUFDmi,           0 },
     { X86::VPSHUFHWri,      X86::VPSHUFHWmi,          0 },
     { X86::VPSHUFLWri,      X86::VPSHUFLWmi,          0 },
     { X86::VRCPPSr,         X86::VRCPPSm,             0 },
     { X86::VRCPPSr_Int,     X86::VRCPPSm_Int,         0 },
     { X86::VRSQRTPSr,       X86::VRSQRTPSm,           0 },
     { X86::VRSQRTPSr_Int,   X86::VRSQRTPSm_Int,       0 },
     { X86::VSQRTPDr,        X86::VSQRTPDm,            0 },
     { X86::VSQRTPSr,        X86::VSQRTPSm,            0 },
     { X86::VUCOMISDrr,      X86::VUCOMISDrm,          0 },
     { X86::VUCOMISSrr,      X86::VUCOMISSrm,          0 },
     { X86::VBROADCASTSSrr,  X86::VBROADCASTSSrm,      TB_NO_REVERSE },
 
     // AVX 256-bit foldable instructions
     { X86::VCVTDQ2PSYrr,    X86::VCVTDQ2PSYrm,        0 },
     { X86::VCVTPD2DQYrr,    X86::VCVTPD2DQYrm,        0 },
     { X86::VCVTPD2PSYrr,    X86::VCVTPD2PSYrm,        0 },
     { X86::VCVTPS2DQYrr,    X86::VCVTPS2DQYrm,        0 },
     { X86::VCVTTPD2DQYrr,   X86::VCVTTPD2DQYrm,       0 },
     { X86::VCVTTPS2DQYrr,   X86::VCVTTPS2DQYrm,       0 },
     { X86::VMOVAPDYrr,      X86::VMOVAPDYrm,          TB_ALIGN_32 },
     { X86::VMOVAPSYrr,      X86::VMOVAPSYrm,          TB_ALIGN_32 },
     { X86::VMOVDQAYrr,      X86::VMOVDQAYrm,          TB_ALIGN_32 },
     { X86::VMOVUPDYrr,      X86::VMOVUPDYrm,          0 },
     { X86::VMOVUPSYrr,      X86::VMOVUPSYrm,          0 },
     { X86::VPERMILPDYri,    X86::VPERMILPDYmi,        0 },
     { X86::VPERMILPSYri,    X86::VPERMILPSYmi,        0 },
     { X86::VRCPPSYr,        X86::VRCPPSYm,            0 },
     { X86::VRCPPSYr_Int,    X86::VRCPPSYm_Int,        0 },
     { X86::VRSQRTPSYr,      X86::VRSQRTPSYm,          0 },
     { X86::VSQRTPDYr,       X86::VSQRTPDYm,           0 },
     { X86::VSQRTPSYr,       X86::VSQRTPSYm,           0 },
     { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
     { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
 
     // AVX2 foldable instructions
     { X86::VPABSBrr256,     X86::VPABSBrm256,         0 },
     { X86::VPABSDrr256,     X86::VPABSDrm256,         0 },
     { X86::VPABSWrr256,     X86::VPABSWrm256,         0 },
     { X86::VPSHUFDYri,      X86::VPSHUFDYmi,          0 },
     { X86::VPSHUFHWYri,     X86::VPSHUFHWYmi,         0 },
     { X86::VPSHUFLWYri,     X86::VPSHUFLWYmi,         0 },
 
     // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
     { X86::BEXTR32rr,       X86::BEXTR32rm,           0 },
     { X86::BEXTR64rr,       X86::BEXTR64rm,           0 },
     { X86::BEXTRI32ri,      X86::BEXTRI32mi,          0 },
     { X86::BEXTRI64ri,      X86::BEXTRI64mi,          0 },
     { X86::BLCFILL32rr,     X86::BLCFILL32rm,         0 },
     { X86::BLCFILL64rr,     X86::BLCFILL64rm,         0 },
     { X86::BLCI32rr,        X86::BLCI32rm,            0 },
     { X86::BLCI64rr,        X86::BLCI64rm,            0 },
     { X86::BLCIC32rr,       X86::BLCIC32rm,           0 },
     { X86::BLCIC64rr,       X86::BLCIC64rm,           0 },
     { X86::BLCMSK32rr,      X86::BLCMSK32rm,          0 },
     { X86::BLCMSK64rr,      X86::BLCMSK64rm,          0 },
     { X86::BLCS32rr,        X86::BLCS32rm,            0 },
     { X86::BLCS64rr,        X86::BLCS64rm,            0 },
     { X86::BLSFILL32rr,     X86::BLSFILL32rm,         0 },
     { X86::BLSFILL64rr,     X86::BLSFILL64rm,         0 },
     { X86::BLSI32rr,        X86::BLSI32rm,            0 },
     { X86::BLSI64rr,        X86::BLSI64rm,            0 },
     { X86::BLSIC32rr,       X86::BLSIC32rm,           0 },
     { X86::BLSIC64rr,       X86::BLSIC64rm,           0 },
     { X86::BLSMSK32rr,      X86::BLSMSK32rm,          0 },
     { X86::BLSMSK64rr,      X86::BLSMSK64rm,          0 },
     { X86::BLSR32rr,        X86::BLSR32rm,            0 },
     { X86::BLSR64rr,        X86::BLSR64rm,            0 },
     { X86::BZHI32rr,        X86::BZHI32rm,            0 },
     { X86::BZHI64rr,        X86::BZHI64rm,            0 },
     { X86::LZCNT16rr,       X86::LZCNT16rm,           0 },
     { X86::LZCNT32rr,       X86::LZCNT32rm,           0 },
     { X86::LZCNT64rr,       X86::LZCNT64rm,           0 },
     { X86::POPCNT16rr,      X86::POPCNT16rm,          0 },
     { X86::POPCNT32rr,      X86::POPCNT32rm,          0 },
     { X86::POPCNT64rr,      X86::POPCNT64rm,          0 },
     { X86::RORX32ri,        X86::RORX32mi,            0 },
     { X86::RORX64ri,        X86::RORX64mi,            0 },
     { X86::SARX32rr,        X86::SARX32rm,            0 },
     { X86::SARX64rr,        X86::SARX64rm,            0 },
     { X86::SHRX32rr,        X86::SHRX32rm,            0 },
     { X86::SHRX64rr,        X86::SHRX64rm,            0 },
     { X86::SHLX32rr,        X86::SHLX32rm,            0 },
     { X86::SHLX64rr,        X86::SHLX64rm,            0 },
     { X86::T1MSKC32rr,      X86::T1MSKC32rm,          0 },
     { X86::T1MSKC64rr,      X86::T1MSKC64rm,          0 },
     { X86::TZCNT16rr,       X86::TZCNT16rm,           0 },
     { X86::TZCNT32rr,       X86::TZCNT32rm,           0 },
     { X86::TZCNT64rr,       X86::TZCNT64rm,           0 },
     { X86::TZMSK32rr,       X86::TZMSK32rm,           0 },
     { X86::TZMSK64rr,       X86::TZMSK64rm,           0 },
 
     // AVX-512 foldable instructions
     { X86::VMOV64toPQIZrr,  X86::VMOVQI2PQIZrm,       0 },
     { X86::VMOVDI2SSZrr,    X86::VMOVDI2SSZrm,        0 },
     { X86::VMOVAPDZrr,      X86::VMOVAPDZrm,          TB_ALIGN_64 },
     { X86::VMOVAPSZrr,      X86::VMOVAPSZrm,          TB_ALIGN_64 },
     { X86::VMOVDQA32Zrr,    X86::VMOVDQA32Zrm,        TB_ALIGN_64 },
     { X86::VMOVDQA64Zrr,    X86::VMOVDQA64Zrm,        TB_ALIGN_64 },
     { X86::VMOVDQU8Zrr,     X86::VMOVDQU8Zrm,         0 },
     { X86::VMOVDQU16Zrr,    X86::VMOVDQU16Zrm,        0 },
     { X86::VMOVDQU32Zrr,    X86::VMOVDQU32Zrm,        0 },
     { X86::VMOVDQU64Zrr,    X86::VMOVDQU64Zrm,        0 },
     { X86::VMOVUPDZrr,      X86::VMOVUPDZrm,          0 },
     { X86::VMOVUPSZrr,      X86::VMOVUPSZrm,          0 },
     { X86::VPABSDZrr,       X86::VPABSDZrm,           0 },
     { X86::VPABSQZrr,       X86::VPABSQZrm,           0 },
     { X86::VBROADCASTSSZr,  X86::VBROADCASTSSZm,      TB_NO_REVERSE },
     { X86::VBROADCASTSDZr,  X86::VBROADCASTSDZm,      TB_NO_REVERSE },
     // AVX-512 foldable instructions (256-bit versions)
     { X86::VMOVAPDZ256rr,      X86::VMOVAPDZ256rm,          TB_ALIGN_32 },
     { X86::VMOVAPSZ256rr,      X86::VMOVAPSZ256rm,          TB_ALIGN_32 },
     { X86::VMOVDQA32Z256rr,    X86::VMOVDQA32Z256rm,        TB_ALIGN_32 },
     { X86::VMOVDQA64Z256rr,    X86::VMOVDQA64Z256rm,        TB_ALIGN_32 },
     { X86::VMOVDQU8Z256rr,     X86::VMOVDQU8Z256rm,         0 },
     { X86::VMOVDQU16Z256rr,    X86::VMOVDQU16Z256rm,        0 },
     { X86::VMOVDQU32Z256rr,    X86::VMOVDQU32Z256rm,        0 },
     { X86::VMOVDQU64Z256rr,    X86::VMOVDQU64Z256rm,        0 },
     { X86::VMOVUPDZ256rr,      X86::VMOVUPDZ256rm,          0 },
     { X86::VMOVUPSZ256rr,      X86::VMOVUPSZ256rm,          0 },
     { X86::VBROADCASTSSZ256r,  X86::VBROADCASTSSZ256m,      TB_NO_REVERSE },
     { X86::VBROADCASTSDZ256r,  X86::VBROADCASTSDZ256m,      TB_NO_REVERSE },
     // AVX-512 foldable instructions (128-bit versions)
     { X86::VMOVAPDZ128rr,      X86::VMOVAPDZ128rm,          TB_ALIGN_16 },
     { X86::VMOVAPSZ128rr,      X86::VMOVAPSZ128rm,          TB_ALIGN_16 },
     { X86::VMOVDQA32Z128rr,    X86::VMOVDQA32Z128rm,        TB_ALIGN_16 },
     { X86::VMOVDQA64Z128rr,    X86::VMOVDQA64Z128rm,        TB_ALIGN_16 },
     { X86::VMOVDQU8Z128rr,     X86::VMOVDQU8Z128rm,         0 },
     { X86::VMOVDQU16Z128rr,    X86::VMOVDQU16Z128rm,        0 },
     { X86::VMOVDQU32Z128rr,    X86::VMOVDQU32Z128rm,        0 },
     { X86::VMOVDQU64Z128rr,    X86::VMOVDQU64Z128rm,        0 },
     { X86::VMOVUPDZ128rr,      X86::VMOVUPDZ128rm,          0 },
     { X86::VMOVUPSZ128rr,      X86::VMOVUPSZ128rm,          0 },
     { X86::VBROADCASTSSZ128r,  X86::VBROADCASTSSZ128m,      TB_NO_REVERSE },
 
     // AES foldable instructions
     // NOTE(review): the VAES* forms keep TB_ALIGN_16 like their SSE
     // counterparts, although most other AVX entries in this table use 0 --
     // confirm whether the restriction is intended.
     { X86::AESIMCrr,              X86::AESIMCrm,              TB_ALIGN_16 },
     { X86::AESKEYGENASSIST128rr,  X86::AESKEYGENASSIST128rm,  TB_ALIGN_16 },
     { X86::VAESIMCrr,             X86::VAESIMCrm,             TB_ALIGN_16 },
     { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
     unsigned RegOp = OpTbl1[i].RegOp;
     unsigned MemOp = OpTbl1[i].MemOp;
     unsigned Flags = OpTbl1[i].Flags;
     AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
                   RegOp, MemOp,
                   // Index 1, folded load
                   Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
   }
 
   static const X86OpTblEntry OpTbl2[] = {
     { X86::ADC32rr,         X86::ADC32rm,       0 },
     { X86::ADC64rr,         X86::ADC64rm,       0 },
     { X86::ADD16rr,         X86::ADD16rm,       0 },
     { X86::ADD16rr_DB,      X86::ADD16rm,       TB_NO_REVERSE },
     { X86::ADD32rr,         X86::ADD32rm,       0 },
     { X86::ADD32rr_DB,      X86::ADD32rm,       TB_NO_REVERSE },
     { X86::ADD64rr,         X86::ADD64rm,       0 },
     { X86::ADD64rr_DB,      X86::ADD64rm,       TB_NO_REVERSE },
     { X86::ADD8rr,          X86::ADD8rm,        0 },
     { X86::ADDPDrr,         X86::ADDPDrm,       TB_ALIGN_16 },
     { X86::ADDPSrr,         X86::ADDPSrm,       TB_ALIGN_16 },
     { X86::ADDSDrr,         X86::ADDSDrm,       0 },
     { X86::ADDSSrr,         X86::ADDSSrm,       0 },
     { X86::ADDSUBPDrr,      X86::ADDSUBPDrm,    TB_ALIGN_16 },
     { X86::ADDSUBPSrr,      X86::ADDSUBPSrm,    TB_ALIGN_16 },
     { X86::AND16rr,         X86::AND16rm,       0 },
     { X86::AND32rr,         X86::AND32rm,       0 },
     { X86::AND64rr,         X86::AND64rm,       0 },
     { X86::AND8rr,          X86::AND8rm,        0 },
     { X86::ANDNPDrr,        X86::ANDNPDrm,      TB_ALIGN_16 },
     { X86::ANDNPSrr,        X86::ANDNPSrm,      TB_ALIGN_16 },
     { X86::ANDPDrr,         X86::ANDPDrm,       TB_ALIGN_16 },
     { X86::ANDPSrr,         X86::ANDPSrm,       TB_ALIGN_16 },
     { X86::BLENDPDrri,      X86::BLENDPDrmi,    TB_ALIGN_16 },
     { X86::BLENDPSrri,      X86::BLENDPSrmi,    TB_ALIGN_16 },
     { X86::BLENDVPDrr0,     X86::BLENDVPDrm0,   TB_ALIGN_16 },
     { X86::BLENDVPSrr0,     X86::BLENDVPSrm0,   TB_ALIGN_16 },
     { X86::CMOVA16rr,       X86::CMOVA16rm,     0 },
     { X86::CMOVA32rr,       X86::CMOVA32rm,     0 },
     { X86::CMOVA64rr,       X86::CMOVA64rm,     0 },
     { X86::CMOVAE16rr,      X86::CMOVAE16rm,    0 },
     { X86::CMOVAE32rr,      X86::CMOVAE32rm,    0 },
     { X86::CMOVAE64rr,      X86::CMOVAE64rm,    0 },
     { X86::CMOVB16rr,       X86::CMOVB16rm,     0 },
     { X86::CMOVB32rr,       X86::CMOVB32rm,     0 },
     { X86::CMOVB64rr,       X86::CMOVB64rm,     0 },
     { X86::CMOVBE16rr,      X86::CMOVBE16rm,    0 },
     { X86::CMOVBE32rr,      X86::CMOVBE32rm,    0 },
     { X86::CMOVBE64rr,      X86::CMOVBE64rm,    0 },
     { X86::CMOVE16rr,       X86::CMOVE16rm,     0 },
     { X86::CMOVE32rr,       X86::CMOVE32rm,     0 },
     { X86::CMOVE64rr,       X86::CMOVE64rm,     0 },
     { X86::CMOVG16rr,       X86::CMOVG16rm,     0 },
     { X86::CMOVG32rr,       X86::CMOVG32rm,     0 },
     { X86::CMOVG64rr,       X86::CMOVG64rm,     0 },
     { X86::CMOVGE16rr,      X86::CMOVGE16rm,    0 },
     { X86::CMOVGE32rr,      X86::CMOVGE32rm,    0 },
     { X86::CMOVGE64rr,      X86::CMOVGE64rm,    0 },
     { X86::CMOVL16rr,       X86::CMOVL16rm,     0 },
     { X86::CMOVL32rr,       X86::CMOVL32rm,     0 },
     { X86::CMOVL64rr,       X86::CMOVL64rm,     0 },
     { X86::CMOVLE16rr,      X86::CMOVLE16rm,    0 },
     { X86::CMOVLE32rr,      X86::CMOVLE32rm,    0 },
     { X86::CMOVLE64rr,      X86::CMOVLE64rm,    0 },
     { X86::CMOVNE16rr,      X86::CMOVNE16rm,    0 },
     { X86::CMOVNE32rr,      X86::CMOVNE32rm,    0 },
     { X86::CMOVNE64rr,      X86::CMOVNE64rm,    0 },
     { X86::CMOVNO16rr,      X86::CMOVNO16rm,    0 },
     { X86::CMOVNO32rr,      X86::CMOVNO32rm,    0 },
     { X86::CMOVNO64rr,      X86::CMOVNO64rm,    0 },
     { X86::CMOVNP16rr,      X86::CMOVNP16rm,    0 },
     { X86::CMOVNP32rr,      X86::CMOVNP32rm,    0 },
     { X86::CMOVNP64rr,      X86::CMOVNP64rm,    0 },
     { X86::CMOVNS16rr,      X86::CMOVNS16rm,    0 },
     { X86::CMOVNS32rr,      X86::CMOVNS32rm,    0 },
     { X86::CMOVNS64rr,      X86::CMOVNS64rm,    0 },
     { X86::CMOVO16rr,       X86::CMOVO16rm,     0 },
     { X86::CMOVO32rr,       X86::CMOVO32rm,     0 },
     { X86::CMOVO64rr,       X86::CMOVO64rm,     0 },
     { X86::CMOVP16rr,       X86::CMOVP16rm,     0 },
     { X86::CMOVP32rr,       X86::CMOVP32rm,     0 },
     { X86::CMOVP64rr,       X86::CMOVP64rm,     0 },
     { X86::CMOVS16rr,       X86::CMOVS16rm,     0 },
     { X86::CMOVS32rr,       X86::CMOVS32rm,     0 },
     { X86::CMOVS64rr,       X86::CMOVS64rm,     0 },
     { X86::CMPPDrri,        X86::CMPPDrmi,      TB_ALIGN_16 },
     { X86::CMPPSrri,        X86::CMPPSrmi,      TB_ALIGN_16 },
     { X86::CMPSDrr,         X86::CMPSDrm,       0 },
     { X86::CMPSSrr,         X86::CMPSSrm,       0 },
     { X86::DIVPDrr,         X86::DIVPDrm,       TB_ALIGN_16 },
     { X86::DIVPSrr,         X86::DIVPSrm,       TB_ALIGN_16 },
     { X86::DIVSDrr,         X86::DIVSDrm,       0 },
     { X86::DIVSSrr,         X86::DIVSSrm,       0 },
     { X86::FsANDNPDrr,      X86::FsANDNPDrm,    TB_ALIGN_16 },
     { X86::FsANDNPSrr,      X86::FsANDNPSrm,    TB_ALIGN_16 },
     { X86::FsANDPDrr,       X86::FsANDPDrm,     TB_ALIGN_16 },
     { X86::FsANDPSrr,       X86::FsANDPSrm,     TB_ALIGN_16 },
     { X86::FsORPDrr,        X86::FsORPDrm,      TB_ALIGN_16 },
     { X86::FsORPSrr,        X86::FsORPSrm,      TB_ALIGN_16 },
     { X86::FsXORPDrr,       X86::FsXORPDrm,     TB_ALIGN_16 },
     { X86::FsXORPSrr,       X86::FsXORPSrm,     TB_ALIGN_16 },
     { X86::HADDPDrr,        X86::HADDPDrm,      TB_ALIGN_16 },
     { X86::HADDPSrr,        X86::HADDPSrm,      TB_ALIGN_16 },
     { X86::HSUBPDrr,        X86::HSUBPDrm,      TB_ALIGN_16 },
     { X86::HSUBPSrr,        X86::HSUBPSrm,      TB_ALIGN_16 },
     { X86::IMUL16rr,        X86::IMUL16rm,      0 },
     { X86::IMUL32rr,        X86::IMUL32rm,      0 },
     { X86::IMUL64rr,        X86::IMUL64rm,      0 },
     { X86::Int_CMPSDrr,     X86::Int_CMPSDrm,   0 },
     { X86::Int_CMPSSrr,     X86::Int_CMPSSrm,   0 },
     { X86::Int_CVTSD2SSrr,  X86::Int_CVTSD2SSrm,      0 },
     { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm,    0 },
     { X86::Int_CVTSI2SDrr,  X86::Int_CVTSI2SDrm,      0 },
     { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm,    0 },
     { X86::Int_CVTSI2SSrr,  X86::Int_CVTSI2SSrm,      0 },
     { X86::Int_CVTSS2SDrr,  X86::Int_CVTSS2SDrm,      0 },
     { X86::MAXPDrr,         X86::MAXPDrm,       TB_ALIGN_16 },
     { X86::MAXPSrr,         X86::MAXPSrm,       TB_ALIGN_16 },
     { X86::MAXSDrr,         X86::MAXSDrm,       0 },
     { X86::MAXSSrr,         X86::MAXSSrm,       0 },
     { X86::MINPDrr,         X86::MINPDrm,       TB_ALIGN_16 },
     { X86::MINPSrr,         X86::MINPSrm,       TB_ALIGN_16 },
     { X86::MINSDrr,         X86::MINSDrm,       0 },
     { X86::MINSSrr,         X86::MINSSrm,       0 },
     { X86::MPSADBWrri,      X86::MPSADBWrmi,    TB_ALIGN_16 },
     { X86::MULPDrr,         X86::MULPDrm,       TB_ALIGN_16 },
     { X86::MULPSrr,         X86::MULPSrm,       TB_ALIGN_16 },
     { X86::MULSDrr,         X86::MULSDrm,       0 },
     { X86::MULSSrr,         X86::MULSSrm,       0 },
     { X86::OR16rr,          X86::OR16rm,        0 },
     { X86::OR32rr,          X86::OR32rm,        0 },
     { X86::OR64rr,          X86::OR64rm,        0 },
     { X86::OR8rr,           X86::OR8rm,         0 },
     { X86::ORPDrr,          X86::ORPDrm,        TB_ALIGN_16 },
     { X86::ORPSrr,          X86::ORPSrm,        TB_ALIGN_16 },
     { X86::PACKSSDWrr,      X86::PACKSSDWrm,    TB_ALIGN_16 },
     { X86::PACKSSWBrr,      X86::PACKSSWBrm,    TB_ALIGN_16 },
     { X86::PACKUSDWrr,      X86::PACKUSDWrm,    TB_ALIGN_16 },
     { X86::PACKUSWBrr,      X86::PACKUSWBrm,    TB_ALIGN_16 },
     { X86::PADDBrr,         X86::PADDBrm,       TB_ALIGN_16 },
     { X86::PADDDrr,         X86::PADDDrm,       TB_ALIGN_16 },
     { X86::PADDQrr,         X86::PADDQrm,       TB_ALIGN_16 },
     { X86::PADDSBrr,        X86::PADDSBrm,      TB_ALIGN_16 },
     { X86::PADDSWrr,        X86::PADDSWrm,      TB_ALIGN_16 },
     { X86::PADDUSBrr,       X86::PADDUSBrm,     TB_ALIGN_16 },
     { X86::PADDUSWrr,       X86::PADDUSWrm,     TB_ALIGN_16 },
     { X86::PADDWrr,         X86::PADDWrm,       TB_ALIGN_16 },
     { X86::PALIGNR128rr,    X86::PALIGNR128rm,  TB_ALIGN_16 },
     { X86::PANDNrr,         X86::PANDNrm,       TB_ALIGN_16 },
     { X86::PANDrr,          X86::PANDrm,        TB_ALIGN_16 },
     { X86::PAVGBrr,         X86::PAVGBrm,       TB_ALIGN_16 },
     { X86::PAVGWrr,         X86::PAVGWrm,       TB_ALIGN_16 },
     { X86::PBLENDWrri,      X86::PBLENDWrmi,    TB_ALIGN_16 },
     { X86::PCMPEQBrr,       X86::PCMPEQBrm,     TB_ALIGN_16 },
     { X86::PCMPEQDrr,       X86::PCMPEQDrm,     TB_ALIGN_16 },
     { X86::PCMPEQQrr,       X86::PCMPEQQrm,     TB_ALIGN_16 },
     { X86::PCMPEQWrr,       X86::PCMPEQWrm,     TB_ALIGN_16 },
     { X86::PCMPGTBrr,       X86::PCMPGTBrm,     TB_ALIGN_16 },
     { X86::PCMPGTDrr,       X86::PCMPGTDrm,     TB_ALIGN_16 },
     { X86::PCMPGTQrr,       X86::PCMPGTQrm,     TB_ALIGN_16 },
     { X86::PCMPGTWrr,       X86::PCMPGTWrm,     TB_ALIGN_16 },
     { X86::PHADDDrr,        X86::PHADDDrm,      TB_ALIGN_16 },
     { X86::PHADDWrr,        X86::PHADDWrm,      TB_ALIGN_16 },
     { X86::PHADDSWrr128,    X86::PHADDSWrm128,  TB_ALIGN_16 },
     { X86::PHSUBDrr,        X86::PHSUBDrm,      TB_ALIGN_16 },
     { X86::PHSUBSWrr128,    X86::PHSUBSWrm128,  TB_ALIGN_16 },
     { X86::PHSUBWrr,        X86::PHSUBWrm,      TB_ALIGN_16 },
     { X86::PINSRWrri,       X86::PINSRWrmi,     TB_ALIGN_16 },
     { X86::PMADDUBSWrr128,  X86::PMADDUBSWrm128, TB_ALIGN_16 },
     { X86::PMADDWDrr,       X86::PMADDWDrm,     TB_ALIGN_16 },
     { X86::PMAXSWrr,        X86::PMAXSWrm,      TB_ALIGN_16 },
     { X86::PMAXUBrr,        X86::PMAXUBrm,      TB_ALIGN_16 },
     { X86::PMINSWrr,        X86::PMINSWrm,      TB_ALIGN_16 },
     { X86::PMINUBrr,        X86::PMINUBrm,      TB_ALIGN_16 },
     { X86::PMINSBrr,        X86::PMINSBrm,      TB_ALIGN_16 },
     { X86::PMINSDrr,        X86::PMINSDrm,      TB_ALIGN_16 },
     { X86::PMINUDrr,        X86::PMINUDrm,      TB_ALIGN_16 },
     { X86::PMINUWrr,        X86::PMINUWrm,      TB_ALIGN_16 },
     { X86::PMAXSBrr,        X86::PMAXSBrm,      TB_ALIGN_16 },
     { X86::PMAXSDrr,        X86::PMAXSDrm,      TB_ALIGN_16 },
     { X86::PMAXUDrr,        X86::PMAXUDrm,      TB_ALIGN_16 },
     { X86::PMAXUWrr,        X86::PMAXUWrm,      TB_ALIGN_16 },
     { X86::PMULDQrr,        X86::PMULDQrm,      TB_ALIGN_16 },
     { X86::PMULHRSWrr128,   X86::PMULHRSWrm128, TB_ALIGN_16 },
     { X86::PMULHUWrr,       X86::PMULHUWrm,     TB_ALIGN_16 },
     { X86::PMULHWrr,        X86::PMULHWrm,      TB_ALIGN_16 },
     { X86::PMULLDrr,        X86::PMULLDrm,      TB_ALIGN_16 },
     { X86::PMULLWrr,        X86::PMULLWrm,      TB_ALIGN_16 },
     { X86::PMULUDQrr,       X86::PMULUDQrm,     TB_ALIGN_16 },
     { X86::PORrr,           X86::PORrm,         TB_ALIGN_16 },
     { X86::PSADBWrr,        X86::PSADBWrm,      TB_ALIGN_16 },
     { X86::PSHUFBrr,        X86::PSHUFBrm,      TB_ALIGN_16 },
     { X86::PSIGNBrr,        X86::PSIGNBrm,      TB_ALIGN_16 },
     { X86::PSIGNWrr,        X86::PSIGNWrm,      TB_ALIGN_16 },
     { X86::PSIGNDrr,        X86::PSIGNDrm,      TB_ALIGN_16 },
     { X86::PSLLDrr,         X86::PSLLDrm,       TB_ALIGN_16 },
     { X86::PSLLQrr,         X86::PSLLQrm,       TB_ALIGN_16 },
     { X86::PSLLWrr,         X86::PSLLWrm,       TB_ALIGN_16 },
     { X86::PSRADrr,         X86::PSRADrm,       TB_ALIGN_16 },
     { X86::PSRAWrr,         X86::PSRAWrm,       TB_ALIGN_16 },
     { X86::PSRLDrr,         X86::PSRLDrm,       TB_ALIGN_16 },
     { X86::PSRLQrr,         X86::PSRLQrm,       TB_ALIGN_16 },
     { X86::PSRLWrr,         X86::PSRLWrm,       TB_ALIGN_16 },
     { X86::PSUBBrr,         X86::PSUBBrm,       TB_ALIGN_16 },
     { X86::PSUBDrr,         X86::PSUBDrm,       TB_ALIGN_16 },
     { X86::PSUBSBrr,        X86::PSUBSBrm,      TB_ALIGN_16 },
     { X86::PSUBSWrr,        X86::PSUBSWrm,      TB_ALIGN_16 },
     { X86::PSUBWrr,         X86::PSUBWrm,       TB_ALIGN_16 },
     { X86::PUNPCKHBWrr,     X86::PUNPCKHBWrm,   TB_ALIGN_16 },
     { X86::PUNPCKHDQrr,     X86::PUNPCKHDQrm,   TB_ALIGN_16 },
     { X86::PUNPCKHQDQrr,    X86::PUNPCKHQDQrm,  TB_ALIGN_16 },
     { X86::PUNPCKHWDrr,     X86::PUNPCKHWDrm,   TB_ALIGN_16 },
     { X86::PUNPCKLBWrr,     X86::PUNPCKLBWrm,   TB_ALIGN_16 },
     { X86::PUNPCKLDQrr,     X86::PUNPCKLDQrm,   TB_ALIGN_16 },
     { X86::PUNPCKLQDQrr,    X86::PUNPCKLQDQrm,  TB_ALIGN_16 },
     { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm,   TB_ALIGN_16 },
     { X86::PXORrr,          X86::PXORrm,        TB_ALIGN_16 },
     { X86::SBB32rr,         X86::SBB32rm,       0 },
     { X86::SBB64rr,         X86::SBB64rm,       0 },
     { X86::SHUFPDrri,       X86::SHUFPDrmi,     TB_ALIGN_16 },
     { X86::SHUFPSrri,       X86::SHUFPSrmi,     TB_ALIGN_16 },
     { X86::SUB16rr,         X86::SUB16rm,       0 },
     { X86::SUB32rr,         X86::SUB32rm,       0 },
     { X86::SUB64rr,         X86::SUB64rm,       0 },
     { X86::SUB8rr,          X86::SUB8rm,        0 },
     { X86::SUBPDrr,         X86::SUBPDrm,       TB_ALIGN_16 },
     { X86::SUBPSrr,         X86::SUBPSrm,       TB_ALIGN_16 },
     { X86::SUBSDrr,         X86::SUBSDrm,       0 },
     { X86::SUBSSrr,         X86::SUBSSrm,       0 },
     // FIXME: TEST*rr -> swapped operand of TEST*mr.
     { X86::UNPCKHPDrr,      X86::UNPCKHPDrm,    TB_ALIGN_16 },
     { X86::UNPCKHPSrr,      X86::UNPCKHPSrm,    TB_ALIGN_16 },
     { X86::UNPCKLPDrr,      X86::UNPCKLPDrm,    TB_ALIGN_16 },
     { X86::UNPCKLPSrr,      X86::UNPCKLPSrm,    TB_ALIGN_16 },
     { X86::XOR16rr,         X86::XOR16rm,       0 },
     { X86::XOR32rr,         X86::XOR32rm,       0 },
     { X86::XOR64rr,         X86::XOR64rm,       0 },
     { X86::XOR8rr,          X86::XOR8rm,        0 },
     { X86::XORPDrr,         X86::XORPDrm,       TB_ALIGN_16 },
     { X86::XORPSrr,         X86::XORPSrm,       TB_ALIGN_16 },
     // AVX 128-bit versions of foldable instructions
     { X86::VCVTSD2SSrr,       X86::VCVTSD2SSrm,        0 },
     { X86::Int_VCVTSD2SSrr,   X86::Int_VCVTSD2SSrm,    0 },
     { X86::VCVTSI2SD64rr,     X86::VCVTSI2SD64rm,      0 },
     { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm,  0 },
     { X86::VCVTSI2SDrr,       X86::VCVTSI2SDrm,        0 },
     { X86::Int_VCVTSI2SDrr,   X86::Int_VCVTSI2SDrm,    0 },
     { X86::VCVTSI2SS64rr,     X86::VCVTSI2SS64rm,      0 },
     { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm,  0 },
     { X86::VCVTSI2SSrr,       X86::VCVTSI2SSrm,        0 },
     { X86::Int_VCVTSI2SSrr,   X86::Int_VCVTSI2SSrm,    0 },
     { X86::VCVTSS2SDrr,       X86::VCVTSS2SDrm,        0 },
     { X86::Int_VCVTSS2SDrr,   X86::Int_VCVTSS2SDrm,    0 },
     { X86::VRSQRTSSr,         X86::VRSQRTSSm,          0 },
     { X86::VSQRTSDr,          X86::VSQRTSDm,           0 },
     { X86::VSQRTSSr,          X86::VSQRTSSm,           0 },
     { X86::VADDPDrr,          X86::VADDPDrm,           0 },
     { X86::VADDPSrr,          X86::VADDPSrm,           0 },
     { X86::VADDSDrr,          X86::VADDSDrm,           0 },
     { X86::VADDSSrr,          X86::VADDSSrm,           0 },
     { X86::VADDSUBPDrr,       X86::VADDSUBPDrm,        0 },
     { X86::VADDSUBPSrr,       X86::VADDSUBPSrm,        0 },
     { X86::VANDNPDrr,         X86::VANDNPDrm,          0 },
     { X86::VANDNPSrr,         X86::VANDNPSrm,          0 },
     { X86::VANDPDrr,          X86::VANDPDrm,           0 },
     { X86::VANDPSrr,          X86::VANDPSrm,           0 },
     { X86::VBLENDPDrri,       X86::VBLENDPDrmi,        0 },
     { X86::VBLENDPSrri,       X86::VBLENDPSrmi,        0 },
     { X86::VBLENDVPDrr,       X86::VBLENDVPDrm,        0 },
     { X86::VBLENDVPSrr,       X86::VBLENDVPSrm,        0 },
     { X86::VCMPPDrri,         X86::VCMPPDrmi,          0 },
     { X86::VCMPPSrri,         X86::VCMPPSrmi,          0 },
     { X86::VCMPSDrr,          X86::VCMPSDrm,           0 },
     { X86::VCMPSSrr,          X86::VCMPSSrm,           0 },
     { X86::VDIVPDrr,          X86::VDIVPDrm,           0 },
     { X86::VDIVPSrr,          X86::VDIVPSrm,           0 },
     { X86::VDIVSDrr,          X86::VDIVSDrm,           0 },
     { X86::VDIVSSrr,          X86::VDIVSSrm,           0 },
     { X86::VFsANDNPDrr,       X86::VFsANDNPDrm,        TB_ALIGN_16 },
     { X86::VFsANDNPSrr,       X86::VFsANDNPSrm,        TB_ALIGN_16 },
     { X86::VFsANDPDrr,        X86::VFsANDPDrm,         TB_ALIGN_16 },
     { X86::VFsANDPSrr,        X86::VFsANDPSrm,         TB_ALIGN_16 },
     { X86::VFsORPDrr,         X86::VFsORPDrm,          TB_ALIGN_16 },
     { X86::VFsORPSrr,         X86::VFsORPSrm,          TB_ALIGN_16 },
     { X86::VFsXORPDrr,        X86::VFsXORPDrm,         TB_ALIGN_16 },
     { X86::VFsXORPSrr,        X86::VFsXORPSrm,         TB_ALIGN_16 },
     { X86::VHADDPDrr,         X86::VHADDPDrm,          0 },
     { X86::VHADDPSrr,         X86::VHADDPSrm,          0 },
     { X86::VHSUBPDrr,         X86::VHSUBPDrm,          0 },
     { X86::VHSUBPSrr,         X86::VHSUBPSrm,          0 },
     { X86::Int_VCMPSDrr,      X86::Int_VCMPSDrm,       0 },
     { X86::Int_VCMPSSrr,      X86::Int_VCMPSSrm,       0 },
     { X86::VMAXPDrr,          X86::VMAXPDrm,           0 },
     { X86::VMAXPSrr,          X86::VMAXPSrm,           0 },
     { X86::VMAXSDrr,          X86::VMAXSDrm,           0 },
     { X86::VMAXSSrr,          X86::VMAXSSrm,           0 },
     { X86::VMINPDrr,          X86::VMINPDrm,           0 },
     { X86::VMINPSrr,          X86::VMINPSrm,           0 },
     { X86::VMINSDrr,          X86::VMINSDrm,           0 },
     { X86::VMINSSrr,          X86::VMINSSrm,           0 },
     { X86::VMPSADBWrri,       X86::VMPSADBWrmi,        0 },
     { X86::VMULPDrr,          X86::VMULPDrm,           0 },
     { X86::VMULPSrr,          X86::VMULPSrm,           0 },
     { X86::VMULSDrr,          X86::VMULSDrm,           0 },
     { X86::VMULSSrr,          X86::VMULSSrm,           0 },
     { X86::VORPDrr,           X86::VORPDrm,            0 },
     { X86::VORPSrr,           X86::VORPSrm,            0 },
     { X86::VPACKSSDWrr,       X86::VPACKSSDWrm,        0 },
     { X86::VPACKSSWBrr,       X86::VPACKSSWBrm,        0 },
     { X86::VPACKUSDWrr,       X86::VPACKUSDWrm,        0 },
     { X86::VPACKUSWBrr,       X86::VPACKUSWBrm,        0 },
     { X86::VPADDBrr,          X86::VPADDBrm,           0 },
     { X86::VPADDDrr,          X86::VPADDDrm,           0 },
     { X86::VPADDQrr,          X86::VPADDQrm,           0 },
     { X86::VPADDSBrr,         X86::VPADDSBrm,          0 },
     { X86::VPADDSWrr,         X86::VPADDSWrm,          0 },
     { X86::VPADDUSBrr,        X86::VPADDUSBrm,         0 },
     { X86::VPADDUSWrr,        X86::VPADDUSWrm,         0 },
     { X86::VPADDWrr,          X86::VPADDWrm,           0 },
     { X86::VPALIGNR128rr,     X86::VPALIGNR128rm,      0 },
     { X86::VPANDNrr,          X86::VPANDNrm,           0 },
     { X86::VPANDrr,           X86::VPANDrm,            0 },
     { X86::VPAVGBrr,          X86::VPAVGBrm,           0 },
     { X86::VPAVGWrr,          X86::VPAVGWrm,           0 },
     { X86::VPBLENDWrri,       X86::VPBLENDWrmi,        0 },
     { X86::VPCMPEQBrr,        X86::VPCMPEQBrm,         0 },
     { X86::VPCMPEQDrr,        X86::VPCMPEQDrm,         0 },
     { X86::VPCMPEQQrr,        X86::VPCMPEQQrm,         0 },
     { X86::VPCMPEQWrr,        X86::VPCMPEQWrm,         0 },
     { X86::VPCMPGTBrr,        X86::VPCMPGTBrm,         0 },
     { X86::VPCMPGTDrr,        X86::VPCMPGTDrm,         0 },
     { X86::VPCMPGTQrr,        X86::VPCMPGTQrm,         0 },
     { X86::VPCMPGTWrr,        X86::VPCMPGTWrm,         0 },
     { X86::VPHADDDrr,         X86::VPHADDDrm,          0 },
     { X86::VPHADDSWrr128,     X86::VPHADDSWrm128,      0 },
     { X86::VPHADDWrr,         X86::VPHADDWrm,          0 },
     { X86::VPHSUBDrr,         X86::VPHSUBDrm,          0 },
     { X86::VPHSUBSWrr128,     X86::VPHSUBSWrm128,      0 },
     { X86::VPHSUBWrr,         X86::VPHSUBWrm,          0 },
     { X86::VPERMILPDrr,       X86::VPERMILPDrm,        0 },
     { X86::VPERMILPSrr,       X86::VPERMILPSrm,        0 },
     { X86::VPINSRWrri,        X86::VPINSRWrmi,         0 },
     { X86::VPMADDUBSWrr128,   X86::VPMADDUBSWrm128,    0 },
     { X86::VPMADDWDrr,        X86::VPMADDWDrm,         0 },
     { X86::VPMAXSWrr,         X86::VPMAXSWrm,          0 },
     { X86::VPMAXUBrr,         X86::VPMAXUBrm,          0 },
     { X86::VPMINSWrr,         X86::VPMINSWrm,          0 },
     { X86::VPMINUBrr,         X86::VPMINUBrm,          0 },
     { X86::VPMINSBrr,         X86::VPMINSBrm,          0 },
     { X86::VPMINSDrr,         X86::VPMINSDrm,          0 },
     { X86::VPMINUDrr,         X86::VPMINUDrm,          0 },
     { X86::VPMINUWrr,         X86::VPMINUWrm,          0 },
     { X86::VPMAXSBrr,         X86::VPMAXSBrm,          0 },
     { X86::VPMAXSDrr,         X86::VPMAXSDrm,          0 },
     { X86::VPMAXUDrr,         X86::VPMAXUDrm,          0 },
     { X86::VPMAXUWrr,         X86::VPMAXUWrm,          0 },
     { X86::VPMULDQrr,         X86::VPMULDQrm,          0 },
     { X86::VPMULHRSWrr128,    X86::VPMULHRSWrm128,     0 },
     { X86::VPMULHUWrr,        X86::VPMULHUWrm,         0 },
     { X86::VPMULHWrr,         X86::VPMULHWrm,          0 },
     { X86::VPMULLDrr,         X86::VPMULLDrm,          0 },
     { X86::VPMULLWrr,         X86::VPMULLWrm,          0 },
     { X86::VPMULUDQrr,        X86::VPMULUDQrm,         0 },
     { X86::VPORrr,            X86::VPORrm,             0 },
     { X86::VPSADBWrr,         X86::VPSADBWrm,          0 },
     { X86::VPSHUFBrr,         X86::VPSHUFBrm,          0 },
     { X86::VPSIGNBrr,         X86::VPSIGNBrm,          0 },
     { X86::VPSIGNWrr,         X86::VPSIGNWrm,          0 },
     { X86::VPSIGNDrr,         X86::VPSIGNDrm,          0 },
     { X86::VPSLLDrr,          X86::VPSLLDrm,           0 },
     { X86::VPSLLQrr,          X86::VPSLLQrm,           0 },
     { X86::VPSLLWrr,          X86::VPSLLWrm,           0 },
     { X86::VPSRADrr,          X86::VPSRADrm,           0 },
     { X86::VPSRAWrr,          X86::VPSRAWrm,           0 },
     { X86::VPSRLDrr,          X86::VPSRLDrm,           0 },
     { X86::VPSRLQrr,          X86::VPSRLQrm,           0 },
     { X86::VPSRLWrr,          X86::VPSRLWrm,           0 },
     { X86::VPSUBBrr,          X86::VPSUBBrm,           0 },
     { X86::VPSUBDrr,          X86::VPSUBDrm,           0 },
     { X86::VPSUBSBrr,         X86::VPSUBSBrm,          0 },
     { X86::VPSUBSWrr,         X86::VPSUBSWrm,          0 },
     { X86::VPSUBWrr,          X86::VPSUBWrm,           0 },
     { X86::VPUNPCKHBWrr,      X86::VPUNPCKHBWrm,       0 },
     { X86::VPUNPCKHDQrr,      X86::VPUNPCKHDQrm,       0 },
     { X86::VPUNPCKHQDQrr,     X86::VPUNPCKHQDQrm,      0 },
     { X86::VPUNPCKHWDrr,      X86::VPUNPCKHWDrm,       0 },
     { X86::VPUNPCKLBWrr,      X86::VPUNPCKLBWrm,       0 },
     { X86::VPUNPCKLDQrr,      X86::VPUNPCKLDQrm,       0 },
     { X86::VPUNPCKLQDQrr,     X86::VPUNPCKLQDQrm,      0 },
     { X86::VPUNPCKLWDrr,      X86::VPUNPCKLWDrm,       0 },
     { X86::VPXORrr,           X86::VPXORrm,            0 },
     { X86::VSHUFPDrri,        X86::VSHUFPDrmi,         0 },
     { X86::VSHUFPSrri,        X86::VSHUFPSrmi,         0 },
     { X86::VSUBPDrr,          X86::VSUBPDrm,           0 },
     { X86::VSUBPSrr,          X86::VSUBPSrm,           0 },
     { X86::VSUBSDrr,          X86::VSUBSDrm,           0 },
     { X86::VSUBSSrr,          X86::VSUBSSrm,           0 },
     { X86::VUNPCKHPDrr,       X86::VUNPCKHPDrm,        0 },
     { X86::VUNPCKHPSrr,       X86::VUNPCKHPSrm,        0 },
     { X86::VUNPCKLPDrr,       X86::VUNPCKLPDrm,        0 },
     { X86::VUNPCKLPSrr,       X86::VUNPCKLPSrm,        0 },
     { X86::VXORPDrr,          X86::VXORPDrm,           0 },
     { X86::VXORPSrr,          X86::VXORPSrm,           0 },
     // AVX 256-bit foldable instructions
     { X86::VADDPDYrr,         X86::VADDPDYrm,          0 },
     { X86::VADDPSYrr,         X86::VADDPSYrm,          0 },
     { X86::VADDSUBPDYrr,      X86::VADDSUBPDYrm,       0 },
     { X86::VADDSUBPSYrr,      X86::VADDSUBPSYrm,       0 },
     { X86::VANDNPDYrr,        X86::VANDNPDYrm,         0 },
     { X86::VANDNPSYrr,        X86::VANDNPSYrm,         0 },
     { X86::VANDPDYrr,         X86::VANDPDYrm,          0 },
     { X86::VANDPSYrr,         X86::VANDPSYrm,          0 },
     { X86::VBLENDPDYrri,      X86::VBLENDPDYrmi,       0 },
     { X86::VBLENDPSYrri,      X86::VBLENDPSYrmi,       0 },
     { X86::VBLENDVPDYrr,      X86::VBLENDVPDYrm,       0 },
     { X86::VBLENDVPSYrr,      X86::VBLENDVPSYrm,       0 },
     { X86::VCMPPDYrri,        X86::VCMPPDYrmi,         0 },
     { X86::VCMPPSYrri,        X86::VCMPPSYrmi,         0 },
     { X86::VDIVPDYrr,         X86::VDIVPDYrm,          0 },
     { X86::VDIVPSYrr,         X86::VDIVPSYrm,          0 },
     { X86::VHADDPDYrr,        X86::VHADDPDYrm,         0 },
     { X86::VHADDPSYrr,        X86::VHADDPSYrm,         0 },
     { X86::VHSUBPDYrr,        X86::VHSUBPDYrm,         0 },
     { X86::VHSUBPSYrr,        X86::VHSUBPSYrm,         0 },
     { X86::VINSERTF128rr,     X86::VINSERTF128rm,      0 },
     { X86::VMAXPDYrr,         X86::VMAXPDYrm,          0 },
     { X86::VMAXPSYrr,         X86::VMAXPSYrm,          0 },
     { X86::VMINPDYrr,         X86::VMINPDYrm,          0 },
     { X86::VMINPSYrr,         X86::VMINPSYrm,          0 },
     { X86::VMULPDYrr,         X86::VMULPDYrm,          0 },
     { X86::VMULPSYrr,         X86::VMULPSYrm,          0 },
     { X86::VORPDYrr,          X86::VORPDYrm,           0 },
     { X86::VORPSYrr,          X86::VORPSYrm,           0 },
     { X86::VPERM2F128rr,      X86::VPERM2F128rm,       0 },
     { X86::VPERMILPDYrr,      X86::VPERMILPDYrm,       0 },
     { X86::VPERMILPSYrr,      X86::VPERMILPSYrm,       0 },
     { X86::VSHUFPDYrri,       X86::VSHUFPDYrmi,        0 },
     { X86::VSHUFPSYrri,       X86::VSHUFPSYrmi,        0 },
     { X86::VSUBPDYrr,         X86::VSUBPDYrm,          0 },
     { X86::VSUBPSYrr,         X86::VSUBPSYrm,          0 },
     { X86::VUNPCKHPDYrr,      X86::VUNPCKHPDYrm,       0 },
     { X86::VUNPCKHPSYrr,      X86::VUNPCKHPSYrm,       0 },
     { X86::VUNPCKLPDYrr,      X86::VUNPCKLPDYrm,       0 },
     { X86::VUNPCKLPSYrr,      X86::VUNPCKLPSYrm,       0 },
     { X86::VXORPDYrr,         X86::VXORPDYrm,          0 },
     { X86::VXORPSYrr,         X86::VXORPSYrm,          0 },
     // AVX2 foldable instructions
     { X86::VINSERTI128rr,     X86::VINSERTI128rm,      0 },
     { X86::VPACKSSDWYrr,      X86::VPACKSSDWYrm,       0 },
     { X86::VPACKSSWBYrr,      X86::VPACKSSWBYrm,       0 },
     { X86::VPACKUSDWYrr,      X86::VPACKUSDWYrm,       0 },
     { X86::VPACKUSWBYrr,      X86::VPACKUSWBYrm,       0 },
     { X86::VPADDBYrr,         X86::VPADDBYrm,          0 },
     { X86::VPADDDYrr,         X86::VPADDDYrm,          0 },
     { X86::VPADDQYrr,         X86::VPADDQYrm,          0 },
     { X86::VPADDSBYrr,        X86::VPADDSBYrm,         0 },
     { X86::VPADDSWYrr,        X86::VPADDSWYrm,         0 },
     { X86::VPADDUSBYrr,       X86::VPADDUSBYrm,        0 },
     { X86::VPADDUSWYrr,       X86::VPADDUSWYrm,        0 },
     { X86::VPADDWYrr,         X86::VPADDWYrm,          0 },
     { X86::VPALIGNR256rr,     X86::VPALIGNR256rm,      0 },
     { X86::VPANDNYrr,         X86::VPANDNYrm,          0 },
     { X86::VPANDYrr,          X86::VPANDYrm,           0 },
     { X86::VPAVGBYrr,         X86::VPAVGBYrm,          0 },
     { X86::VPAVGWYrr,         X86::VPAVGWYrm,          0 },
     { X86::VPBLENDDrri,       X86::VPBLENDDrmi,        0 },
     { X86::VPBLENDDYrri,      X86::VPBLENDDYrmi,       0 },
     { X86::VPBLENDWYrri,      X86::VPBLENDWYrmi,       0 },
     { X86::VPCMPEQBYrr,       X86::VPCMPEQBYrm,        0 },
     { X86::VPCMPEQDYrr,       X86::VPCMPEQDYrm,        0 },
     { X86::VPCMPEQQYrr,       X86::VPCMPEQQYrm,        0 },
     { X86::VPCMPEQWYrr,       X86::VPCMPEQWYrm,        0 },
     { X86::VPCMPGTBYrr,       X86::VPCMPGTBYrm,        0 },
     { X86::VPCMPGTDYrr,       X86::VPCMPGTDYrm,        0 },
     { X86::VPCMPGTQYrr,       X86::VPCMPGTQYrm,        0 },
     { X86::VPCMPGTWYrr,       X86::VPCMPGTWYrm,        0 },
     { X86::VPERM2I128rr,      X86::VPERM2I128rm,       0 },
     { X86::VPERMDYrr,         X86::VPERMDYrm,          0 },
     { X86::VPERMPDYri,        X86::VPERMPDYmi,         0 },
     { X86::VPERMPSYrr,        X86::VPERMPSYrm,         0 },
     { X86::VPERMQYri,         X86::VPERMQYmi,          0 },
     { X86::VPHADDDYrr,        X86::VPHADDDYrm,         0 },
     { X86::VPHADDSWrr256,     X86::VPHADDSWrm256,      0 },
     { X86::VPHADDWYrr,        X86::VPHADDWYrm,         0 },
     { X86::VPHSUBDYrr,        X86::VPHSUBDYrm,         0 },
     { X86::VPHSUBSWrr256,     X86::VPHSUBSWrm256,      0 },
     { X86::VPHSUBWYrr,        X86::VPHSUBWYrm,         0 },
     { X86::VPMADDUBSWrr256,   X86::VPMADDUBSWrm256,    0 },
     { X86::VPMADDWDYrr,       X86::VPMADDWDYrm,        0 },
     { X86::VPMAXSWYrr,        X86::VPMAXSWYrm,         0 },
     { X86::VPMAXUBYrr,        X86::VPMAXUBYrm,         0 },
     { X86::VPMINSWYrr,        X86::VPMINSWYrm,         0 },
     { X86::VPMINUBYrr,        X86::VPMINUBYrm,         0 },
     { X86::VPMINSBYrr,        X86::VPMINSBYrm,         0 },
     { X86::VPMINSDYrr,        X86::VPMINSDYrm,         0 },
     { X86::VPMINUDYrr,        X86::VPMINUDYrm,         0 },
     { X86::VPMINUWYrr,        X86::VPMINUWYrm,         0 },
     { X86::VPMAXSBYrr,        X86::VPMAXSBYrm,         0 },
     { X86::VPMAXSDYrr,        X86::VPMAXSDYrm,         0 },
     { X86::VPMAXUDYrr,        X86::VPMAXUDYrm,         0 },
     { X86::VPMAXUWYrr,        X86::VPMAXUWYrm,         0 },
     { X86::VMPSADBWYrri,      X86::VMPSADBWYrmi,       0 },
     { X86::VPMULDQYrr,        X86::VPMULDQYrm,         0 },
     { X86::VPMULHRSWrr256,    X86::VPMULHRSWrm256,     0 },
     { X86::VPMULHUWYrr,       X86::VPMULHUWYrm,        0 },
     { X86::VPMULHWYrr,        X86::VPMULHWYrm,         0 },
     { X86::VPMULLDYrr,        X86::VPMULLDYrm,         0 },
     { X86::VPMULLWYrr,        X86::VPMULLWYrm,         0 },
     { X86::VPMULUDQYrr,       X86::VPMULUDQYrm,        0 },
     { X86::VPORYrr,           X86::VPORYrm,            0 },
     { X86::VPSADBWYrr,        X86::VPSADBWYrm,         0 },
     { X86::VPSHUFBYrr,        X86::VPSHUFBYrm,         0 },
     { X86::VPSIGNBYrr,        X86::VPSIGNBYrm,         0 },
     { X86::VPSIGNWYrr,        X86::VPSIGNWYrm,         0 },
     { X86::VPSIGNDYrr,        X86::VPSIGNDYrm,         0 },
     { X86::VPSLLDYrr,         X86::VPSLLDYrm,          0 },
     { X86::VPSLLQYrr,         X86::VPSLLQYrm,          0 },
     { X86::VPSLLWYrr,         X86::VPSLLWYrm,          0 },
     { X86::VPSLLVDrr,         X86::VPSLLVDrm,          0 },
     { X86::VPSLLVDYrr,        X86::VPSLLVDYrm,         0 },
     { X86::VPSLLVQrr,         X86::VPSLLVQrm,          0 },
     { X86::VPSLLVQYrr,        X86::VPSLLVQYrm,         0 },
     { X86::VPSRADYrr,         X86::VPSRADYrm,          0 },
     { X86::VPSRAWYrr,         X86::VPSRAWYrm,          0 },
     { X86::VPSRAVDrr,         X86::VPSRAVDrm,          0 },
     { X86::VPSRAVDYrr,        X86::VPSRAVDYrm,         0 },
     { X86::VPSRLDYrr,         X86::VPSRLDYrm,          0 },
     { X86::VPSRLQYrr,         X86::VPSRLQYrm,          0 },
     { X86::VPSRLWYrr,         X86::VPSRLWYrm,          0 },
     { X86::VPSRLVDrr,         X86::VPSRLVDrm,          0 },
     { X86::VPSRLVDYrr,        X86::VPSRLVDYrm,         0 },
     { X86::VPSRLVQrr,         X86::VPSRLVQrm,          0 },
     { X86::VPSRLVQYrr,        X86::VPSRLVQYrm,         0 },
     { X86::VPSUBBYrr,         X86::VPSUBBYrm,          0 },
     { X86::VPSUBDYrr,         X86::VPSUBDYrm,          0 },
     { X86::VPSUBSBYrr,        X86::VPSUBSBYrm,         0 },
     { X86::VPSUBSWYrr,        X86::VPSUBSWYrm,         0 },
     { X86::VPSUBWYrr,         X86::VPSUBWYrm,          0 },
     { X86::VPUNPCKHBWYrr,     X86::VPUNPCKHBWYrm,      0 },
     { X86::VPUNPCKHDQYrr,     X86::VPUNPCKHDQYrm,      0 },
     { X86::VPUNPCKHQDQYrr,    X86::VPUNPCKHQDQYrm,     0 },
     { X86::VPUNPCKHWDYrr,     X86::VPUNPCKHWDYrm,      0 },
     { X86::VPUNPCKLBWYrr,     X86::VPUNPCKLBWYrm,      0 },
     { X86::VPUNPCKLDQYrr,     X86::VPUNPCKLDQYrm,      0 },
     { X86::VPUNPCKLQDQYrr,    X86::VPUNPCKLQDQYrm,     0 },
     { X86::VPUNPCKLWDYrr,     X86::VPUNPCKLWDYrm,      0 },
     { X86::VPXORYrr,          X86::VPXORYrm,           0 },
     // FIXME: add AVX 256-bit foldable instructions
 
     // FMA4 foldable patterns
     { X86::VFMADDSS4rr,       X86::VFMADDSS4mr,        0           },
     { X86::VFMADDSD4rr,       X86::VFMADDSD4mr,        0           },
     { X86::VFMADDPS4rr,       X86::VFMADDPS4mr,        TB_ALIGN_16 },
     { X86::VFMADDPD4rr,       X86::VFMADDPD4mr,        TB_ALIGN_16 },
     { X86::VFMADDPS4rrY,      X86::VFMADDPS4mrY,       TB_ALIGN_32 },
     { X86::VFMADDPD4rrY,      X86::VFMADDPD4mrY,       TB_ALIGN_32 },
     { X86::VFNMADDSS4rr,      X86::VFNMADDSS4mr,       0           },
     { X86::VFNMADDSD4rr,      X86::VFNMADDSD4mr,       0           },
     { X86::VFNMADDPS4rr,      X86::VFNMADDPS4mr,       TB_ALIGN_16 },
     { X86::VFNMADDPD4rr,      X86::VFNMADDPD4mr,       TB_ALIGN_16 },
     { X86::VFNMADDPS4rrY,     X86::VFNMADDPS4mrY,      TB_ALIGN_32 },
     { X86::VFNMADDPD4rrY,     X86::VFNMADDPD4mrY,      TB_ALIGN_32 },
     { X86::VFMSUBSS4rr,       X86::VFMSUBSS4mr,        0           },
     { X86::VFMSUBSD4rr,       X86::VFMSUBSD4mr,        0           },
     { X86::VFMSUBPS4rr,       X86::VFMSUBPS4mr,        TB_ALIGN_16 },
     { X86::VFMSUBPD4rr,       X86::VFMSUBPD4mr,        TB_ALIGN_16 },
     { X86::VFMSUBPS4rrY,      X86::VFMSUBPS4mrY,       TB_ALIGN_32 },
     { X86::VFMSUBPD4rrY,      X86::VFMSUBPD4mrY,       TB_ALIGN_32 },
     { X86::VFNMSUBSS4rr,      X86::VFNMSUBSS4mr,       0           },
     { X86::VFNMSUBSD4rr,      X86::VFNMSUBSD4mr,       0           },
     { X86::VFNMSUBPS4rr,      X86::VFNMSUBPS4mr,       TB_ALIGN_16 },
     { X86::VFNMSUBPD4rr,      X86::VFNMSUBPD4mr,       TB_ALIGN_16 },
     { X86::VFNMSUBPS4rrY,     X86::VFNMSUBPS4mrY,      TB_ALIGN_32 },
     { X86::VFNMSUBPD4rrY,     X86::VFNMSUBPD4mrY,      TB_ALIGN_32 },
     { X86::VFMADDSUBPS4rr,    X86::VFMADDSUBPS4mr,     TB_ALIGN_16 },
     { X86::VFMADDSUBPD4rr,    X86::VFMADDSUBPD4mr,     TB_ALIGN_16 },
     { X86::VFMADDSUBPS4rrY,   X86::VFMADDSUBPS4mrY,    TB_ALIGN_32 },
     { X86::VFMADDSUBPD4rrY,   X86::VFMADDSUBPD4mrY,    TB_ALIGN_32 },
     { X86::VFMSUBADDPS4rr,    X86::VFMSUBADDPS4mr,     TB_ALIGN_16 },
     { X86::VFMSUBADDPD4rr,    X86::VFMSUBADDPD4mr,     TB_ALIGN_16 },
     { X86::VFMSUBADDPS4rrY,   X86::VFMSUBADDPS4mrY,    TB_ALIGN_32 },
     { X86::VFMSUBADDPD4rrY,   X86::VFMSUBADDPD4mrY,    TB_ALIGN_32 },
 
     // BMI/BMI2 foldable instructions
     { X86::ANDN32rr,          X86::ANDN32rm,            0 },
     { X86::ANDN64rr,          X86::ANDN64rm,            0 },
     { X86::MULX32rr,          X86::MULX32rm,            0 },
     { X86::MULX64rr,          X86::MULX64rm,            0 },
     { X86::PDEP32rr,          X86::PDEP32rm,            0 },
     { X86::PDEP64rr,          X86::PDEP64rm,            0 },
     { X86::PEXT32rr,          X86::PEXT32rm,            0 },
     { X86::PEXT64rr,          X86::PEXT64rm,            0 },
 
     // AVX-512 foldable instructions
     { X86::VADDPSZrr,         X86::VADDPSZrm,           0 },
     { X86::VADDPDZrr,         X86::VADDPDZrm,           0 },
     { X86::VSUBPSZrr,         X86::VSUBPSZrm,           0 },
     { X86::VSUBPDZrr,         X86::VSUBPDZrm,           0 },
     { X86::VMULPSZrr,         X86::VMULPSZrm,           0 },
     { X86::VMULPDZrr,         X86::VMULPDZrm,           0 },
     { X86::VDIVPSZrr,         X86::VDIVPSZrm,           0 },
     { X86::VDIVPDZrr,         X86::VDIVPDZrm,           0 },
     { X86::VMINPSZrr,         X86::VMINPSZrm,           0 },
     { X86::VMINPDZrr,         X86::VMINPDZrm,           0 },
     { X86::VMAXPSZrr,         X86::VMAXPSZrm,           0 },
     { X86::VMAXPDZrr,         X86::VMAXPDZrm,           0 },
     { X86::VPADDDZrr,         X86::VPADDDZrm,           0 },
     { X86::VPADDQZrr,         X86::VPADDQZrm,           0 },
     { X86::VPERMPDZri,        X86::VPERMPDZmi,          0 },
     { X86::VPERMPSZrr,        X86::VPERMPSZrm,          0 },
     { X86::VPMAXSDZrr,        X86::VPMAXSDZrm,          0 },
     { X86::VPMAXSQZrr,        X86::VPMAXSQZrm,          0 },
     { X86::VPMAXUDZrr,        X86::VPMAXUDZrm,          0 },
     { X86::VPMAXUQZrr,        X86::VPMAXUQZrm,          0 },
     { X86::VPMINSDZrr,        X86::VPMINSDZrm,          0 },
     { X86::VPMINSQZrr,        X86::VPMINSQZrm,          0 },
     { X86::VPMINUDZrr,        X86::VPMINUDZrm,          0 },
     { X86::VPMINUQZrr,        X86::VPMINUQZrm,          0 },
     { X86::VPMULDQZrr,        X86::VPMULDQZrm,          0 },
     { X86::VPSLLVDZrr,        X86::VPSLLVDZrm,          0 },
     { X86::VPSLLVQZrr,        X86::VPSLLVQZrm,          0 },
     { X86::VPSRAVDZrr,        X86::VPSRAVDZrm,          0 },
     { X86::VPSRLVDZrr,        X86::VPSRLVDZrm,          0 },
     { X86::VPSRLVQZrr,        X86::VPSRLVQZrm,          0 },
     { X86::VPSUBDZrr,         X86::VPSUBDZrm,           0 },
     { X86::VPSUBQZrr,         X86::VPSUBQZrm,           0 },
     { X86::VSHUFPDZrri,       X86::VSHUFPDZrmi,         0 },
     { X86::VSHUFPSZrri,       X86::VSHUFPSZrmi,         0 },
     { X86::VALIGNQrri,        X86::VALIGNQrmi,          0 },
     { X86::VALIGNDrri,        X86::VALIGNDrmi,          0 },
     { X86::VPMULUDQZrr,       X86::VPMULUDQZrm,         0 },
     { X86::VBROADCASTSSZrkz,  X86::VBROADCASTSSZmkz,    TB_NO_REVERSE },
     { X86::VBROADCASTSDZrkz,  X86::VBROADCASTSDZmkz,    TB_NO_REVERSE },
 
     // AVX-512{F,VL} foldable broadcast instructions
     { X86::VBROADCASTSSZ256rkz,  X86::VBROADCASTSSZ256mkz,      TB_NO_REVERSE },
     { X86::VBROADCASTSDZ256rkz,  X86::VBROADCASTSDZ256mkz,      TB_NO_REVERSE },
     { X86::VBROADCASTSSZ128rkz,  X86::VBROADCASTSSZ128mkz,      TB_NO_REVERSE },
 
     // AVX-512{F,VL} foldable instructions
     { X86::VADDPDZ128rr,      X86::VADDPDZ128rm,        0 },
     { X86::VADDPDZ256rr,      X86::VADDPDZ256rm,        0 },
     { X86::VADDPSZ128rr,      X86::VADDPSZ128rm,        0 },
     { X86::VADDPSZ256rr,      X86::VADDPSZ256rm,        0 },
 
     // AES foldable instructions
     { X86::AESDECLASTrr,      X86::AESDECLASTrm,        TB_ALIGN_16 },
     { X86::AESDECrr,          X86::AESDECrm,            TB_ALIGN_16 },
     { X86::AESENCLASTrr,      X86::AESENCLASTrm,        TB_ALIGN_16 },
     { X86::AESENCrr,          X86::AESENCrm,            TB_ALIGN_16 },
     { X86::VAESDECLASTrr,     X86::VAESDECLASTrm,       TB_ALIGN_16 },
     { X86::VAESDECrr,         X86::VAESDECrm,           TB_ALIGN_16 },
     { X86::VAESENCLASTrr,     X86::VAESENCLASTrm,       TB_ALIGN_16 },
     { X86::VAESENCrr,         X86::VAESENCrm,           TB_ALIGN_16 },
 
     // SHA foldable instructions
     { X86::SHA1MSG1rr,        X86::SHA1MSG1rm,          TB_ALIGN_16 },
     { X86::SHA1MSG2rr,        X86::SHA1MSG2rm,          TB_ALIGN_16 },
     { X86::SHA1NEXTErr,       X86::SHA1NEXTErm,         TB_ALIGN_16 },
     { X86::SHA1RNDS4rri,      X86::SHA1RNDS4rmi,        TB_ALIGN_16 },
     { X86::SHA256MSG1rr,      X86::SHA256MSG1rm,        TB_ALIGN_16 },
     { X86::SHA256MSG2rr,      X86::SHA256MSG2rm,        TB_ALIGN_16 },
     { X86::SHA256RNDS2rr,     X86::SHA256RNDS2rm,       TB_ALIGN_16 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
     unsigned RegOp = OpTbl2[i].RegOp;
     unsigned MemOp = OpTbl2[i].MemOp;
     unsigned Flags = OpTbl2[i].Flags;
     AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
                   RegOp, MemOp,
                   // Index 2, folded load
                   Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
   }
 
   static const X86OpTblEntry OpTbl3[] = {
     // FMA foldable instructions
     { X86::VFMADDSSr231r,         X86::VFMADDSSr231m,         TB_ALIGN_NONE },
     { X86::VFMADDSDr231r,         X86::VFMADDSDr231m,         TB_ALIGN_NONE },
     { X86::VFMADDSSr132r,         X86::VFMADDSSr132m,         TB_ALIGN_NONE },
     { X86::VFMADDSDr132r,         X86::VFMADDSDr132m,         TB_ALIGN_NONE },
     { X86::VFMADDSSr213r,         X86::VFMADDSSr213m,         TB_ALIGN_NONE },
     { X86::VFMADDSDr213r,         X86::VFMADDSDr213m,         TB_ALIGN_NONE },
 
     { X86::VFMADDPSr231r,         X86::VFMADDPSr231m,         TB_ALIGN_NONE },
     { X86::VFMADDPDr231r,         X86::VFMADDPDr231m,         TB_ALIGN_NONE },
     { X86::VFMADDPSr132r,         X86::VFMADDPSr132m,         TB_ALIGN_NONE },
     { X86::VFMADDPDr132r,         X86::VFMADDPDr132m,         TB_ALIGN_NONE },
     { X86::VFMADDPSr213r,         X86::VFMADDPSr213m,         TB_ALIGN_NONE },
     { X86::VFMADDPDr213r,         X86::VFMADDPDr213m,         TB_ALIGN_NONE },
     { X86::VFMADDPSr231rY,        X86::VFMADDPSr231mY,        TB_ALIGN_NONE },
     { X86::VFMADDPDr231rY,        X86::VFMADDPDr231mY,        TB_ALIGN_NONE },
     { X86::VFMADDPSr132rY,        X86::VFMADDPSr132mY,        TB_ALIGN_NONE },
     { X86::VFMADDPDr132rY,        X86::VFMADDPDr132mY,        TB_ALIGN_NONE },
     { X86::VFMADDPSr213rY,        X86::VFMADDPSr213mY,        TB_ALIGN_NONE },
     { X86::VFMADDPDr213rY,        X86::VFMADDPDr213mY,        TB_ALIGN_NONE },
 
     { X86::VFNMADDSSr231r,        X86::VFNMADDSSr231m,        TB_ALIGN_NONE },
     { X86::VFNMADDSDr231r,        X86::VFNMADDSDr231m,        TB_ALIGN_NONE },
     { X86::VFNMADDSSr132r,        X86::VFNMADDSSr132m,        TB_ALIGN_NONE },
     { X86::VFNMADDSDr132r,        X86::VFNMADDSDr132m,        TB_ALIGN_NONE },
     { X86::VFNMADDSSr213r,        X86::VFNMADDSSr213m,        TB_ALIGN_NONE },
     { X86::VFNMADDSDr213r,        X86::VFNMADDSDr213m,        TB_ALIGN_NONE },
 
     { X86::VFNMADDPSr231r,        X86::VFNMADDPSr231m,        TB_ALIGN_NONE },
     { X86::VFNMADDPDr231r,        X86::VFNMADDPDr231m,        TB_ALIGN_NONE },
     { X86::VFNMADDPSr132r,        X86::VFNMADDPSr132m,        TB_ALIGN_NONE },
     { X86::VFNMADDPDr132r,        X86::VFNMADDPDr132m,        TB_ALIGN_NONE },
     { X86::VFNMADDPSr213r,        X86::VFNMADDPSr213m,        TB_ALIGN_NONE },
     { X86::VFNMADDPDr213r,        X86::VFNMADDPDr213m,        TB_ALIGN_NONE },
     { X86::VFNMADDPSr231rY,       X86::VFNMADDPSr231mY,       TB_ALIGN_NONE },
     { X86::VFNMADDPDr231rY,       X86::VFNMADDPDr231mY,       TB_ALIGN_NONE },
     { X86::VFNMADDPSr132rY,       X86::VFNMADDPSr132mY,       TB_ALIGN_NONE },
     { X86::VFNMADDPDr132rY,       X86::VFNMADDPDr132mY,       TB_ALIGN_NONE },
     { X86::VFNMADDPSr213rY,       X86::VFNMADDPSr213mY,       TB_ALIGN_NONE },
     { X86::VFNMADDPDr213rY,       X86::VFNMADDPDr213mY,       TB_ALIGN_NONE },
 
     { X86::VFMSUBSSr231r,         X86::VFMSUBSSr231m,         TB_ALIGN_NONE },
     { X86::VFMSUBSDr231r,         X86::VFMSUBSDr231m,         TB_ALIGN_NONE },
     { X86::VFMSUBSSr132r,         X86::VFMSUBSSr132m,         TB_ALIGN_NONE },
     { X86::VFMSUBSDr132r,         X86::VFMSUBSDr132m,         TB_ALIGN_NONE },
     { X86::VFMSUBSSr213r,         X86::VFMSUBSSr213m,         TB_ALIGN_NONE },
     { X86::VFMSUBSDr213r,         X86::VFMSUBSDr213m,         TB_ALIGN_NONE },
 
     { X86::VFMSUBPSr231r,         X86::VFMSUBPSr231m,         TB_ALIGN_NONE },
     { X86::VFMSUBPDr231r,         X86::VFMSUBPDr231m,         TB_ALIGN_NONE },
     { X86::VFMSUBPSr132r,         X86::VFMSUBPSr132m,         TB_ALIGN_NONE },
     { X86::VFMSUBPDr132r,         X86::VFMSUBPDr132m,         TB_ALIGN_NONE },
     { X86::VFMSUBPSr213r,         X86::VFMSUBPSr213m,         TB_ALIGN_NONE },
     { X86::VFMSUBPDr213r,         X86::VFMSUBPDr213m,         TB_ALIGN_NONE },
     { X86::VFMSUBPSr231rY,        X86::VFMSUBPSr231mY,        TB_ALIGN_NONE },
     { X86::VFMSUBPDr231rY,        X86::VFMSUBPDr231mY,        TB_ALIGN_NONE },
     { X86::VFMSUBPSr132rY,        X86::VFMSUBPSr132mY,        TB_ALIGN_NONE },
     { X86::VFMSUBPDr132rY,        X86::VFMSUBPDr132mY,        TB_ALIGN_NONE },
     { X86::VFMSUBPSr213rY,        X86::VFMSUBPSr213mY,        TB_ALIGN_NONE },
     { X86::VFMSUBPDr213rY,        X86::VFMSUBPDr213mY,        TB_ALIGN_NONE },
 
     { X86::VFNMSUBSSr231r,        X86::VFNMSUBSSr231m,        TB_ALIGN_NONE },
     { X86::VFNMSUBSDr231r,        X86::VFNMSUBSDr231m,        TB_ALIGN_NONE },
     { X86::VFNMSUBSSr132r,        X86::VFNMSUBSSr132m,        TB_ALIGN_NONE },
     { X86::VFNMSUBSDr132r,        X86::VFNMSUBSDr132m,        TB_ALIGN_NONE },
     { X86::VFNMSUBSSr213r,        X86::VFNMSUBSSr213m,        TB_ALIGN_NONE },
     { X86::VFNMSUBSDr213r,        X86::VFNMSUBSDr213m,        TB_ALIGN_NONE },
 
     { X86::VFNMSUBPSr231r,        X86::VFNMSUBPSr231m,        TB_ALIGN_NONE },
     { X86::VFNMSUBPDr231r,        X86::VFNMSUBPDr231m,        TB_ALIGN_NONE },
     { X86::VFNMSUBPSr132r,        X86::VFNMSUBPSr132m,        TB_ALIGN_NONE },
     { X86::VFNMSUBPDr132r,        X86::VFNMSUBPDr132m,        TB_ALIGN_NONE },
     { X86::VFNMSUBPSr213r,        X86::VFNMSUBPSr213m,        TB_ALIGN_NONE },
     { X86::VFNMSUBPDr213r,        X86::VFNMSUBPDr213m,        TB_ALIGN_NONE },
     { X86::VFNMSUBPSr231rY,       X86::VFNMSUBPSr231mY,       TB_ALIGN_NONE },
     { X86::VFNMSUBPDr231rY,       X86::VFNMSUBPDr231mY,       TB_ALIGN_NONE },
     { X86::VFNMSUBPSr132rY,       X86::VFNMSUBPSr132mY,       TB_ALIGN_NONE },
     { X86::VFNMSUBPDr132rY,       X86::VFNMSUBPDr132mY,       TB_ALIGN_NONE },
     { X86::VFNMSUBPSr213rY,       X86::VFNMSUBPSr213mY,       TB_ALIGN_NONE },
     { X86::VFNMSUBPDr213rY,       X86::VFNMSUBPDr213mY,       TB_ALIGN_NONE },
 
     { X86::VFMADDSUBPSr231r,      X86::VFMADDSUBPSr231m,      TB_ALIGN_NONE },
     { X86::VFMADDSUBPDr231r,      X86::VFMADDSUBPDr231m,      TB_ALIGN_NONE },
     { X86::VFMADDSUBPSr132r,      X86::VFMADDSUBPSr132m,      TB_ALIGN_NONE },
     { X86::VFMADDSUBPDr132r,      X86::VFMADDSUBPDr132m,      TB_ALIGN_NONE },
     { X86::VFMADDSUBPSr213r,      X86::VFMADDSUBPSr213m,      TB_ALIGN_NONE },
     { X86::VFMADDSUBPDr213r,      X86::VFMADDSUBPDr213m,      TB_ALIGN_NONE },
     { X86::VFMADDSUBPSr231rY,     X86::VFMADDSUBPSr231mY,     TB_ALIGN_NONE },
     { X86::VFMADDSUBPDr231rY,     X86::VFMADDSUBPDr231mY,     TB_ALIGN_NONE },
     { X86::VFMADDSUBPSr132rY,     X86::VFMADDSUBPSr132mY,     TB_ALIGN_NONE },
     { X86::VFMADDSUBPDr132rY,     X86::VFMADDSUBPDr132mY,     TB_ALIGN_NONE },
     { X86::VFMADDSUBPSr213rY,     X86::VFMADDSUBPSr213mY,     TB_ALIGN_NONE },
     { X86::VFMADDSUBPDr213rY,     X86::VFMADDSUBPDr213mY,     TB_ALIGN_NONE },
 
     { X86::VFMSUBADDPSr231r,      X86::VFMSUBADDPSr231m,      TB_ALIGN_NONE },
     { X86::VFMSUBADDPDr231r,      X86::VFMSUBADDPDr231m,      TB_ALIGN_NONE },
     { X86::VFMSUBADDPSr132r,      X86::VFMSUBADDPSr132m,      TB_ALIGN_NONE },
     { X86::VFMSUBADDPDr132r,      X86::VFMSUBADDPDr132m,      TB_ALIGN_NONE },
     { X86::VFMSUBADDPSr213r,      X86::VFMSUBADDPSr213m,      TB_ALIGN_NONE },
     { X86::VFMSUBADDPDr213r,      X86::VFMSUBADDPDr213m,      TB_ALIGN_NONE },
     { X86::VFMSUBADDPSr231rY,     X86::VFMSUBADDPSr231mY,     TB_ALIGN_NONE },
     { X86::VFMSUBADDPDr231rY,     X86::VFMSUBADDPDr231mY,     TB_ALIGN_NONE },
     { X86::VFMSUBADDPSr132rY,     X86::VFMSUBADDPSr132mY,     TB_ALIGN_NONE },
     { X86::VFMSUBADDPDr132rY,     X86::VFMSUBADDPDr132mY,     TB_ALIGN_NONE },
     { X86::VFMSUBADDPSr213rY,     X86::VFMSUBADDPSr213mY,     TB_ALIGN_NONE },
     { X86::VFMSUBADDPDr213rY,     X86::VFMSUBADDPDr213mY,     TB_ALIGN_NONE },
 
     // FMA4 foldable patterns
     { X86::VFMADDSS4rr,           X86::VFMADDSS4rm,           0           },
     { X86::VFMADDSD4rr,           X86::VFMADDSD4rm,           0           },
     { X86::VFMADDPS4rr,           X86::VFMADDPS4rm,           TB_ALIGN_16 },
     { X86::VFMADDPD4rr,           X86::VFMADDPD4rm,           TB_ALIGN_16 },
     { X86::VFMADDPS4rrY,          X86::VFMADDPS4rmY,          TB_ALIGN_32 },
     { X86::VFMADDPD4rrY,          X86::VFMADDPD4rmY,          TB_ALIGN_32 },
     { X86::VFNMADDSS4rr,          X86::VFNMADDSS4rm,          0           },
     { X86::VFNMADDSD4rr,          X86::VFNMADDSD4rm,          0           },
     { X86::VFNMADDPS4rr,          X86::VFNMADDPS4rm,          TB_ALIGN_16 },
     { X86::VFNMADDPD4rr,          X86::VFNMADDPD4rm,          TB_ALIGN_16 },
     { X86::VFNMADDPS4rrY,         X86::VFNMADDPS4rmY,         TB_ALIGN_32 },
     { X86::VFNMADDPD4rrY,         X86::VFNMADDPD4rmY,         TB_ALIGN_32 },
     { X86::VFMSUBSS4rr,           X86::VFMSUBSS4rm,           0           },
     { X86::VFMSUBSD4rr,           X86::VFMSUBSD4rm,           0           },
     { X86::VFMSUBPS4rr,           X86::VFMSUBPS4rm,           TB_ALIGN_16 },
     { X86::VFMSUBPD4rr,           X86::VFMSUBPD4rm,           TB_ALIGN_16 },
     { X86::VFMSUBPS4rrY,          X86::VFMSUBPS4rmY,          TB_ALIGN_32 },
     { X86::VFMSUBPD4rrY,          X86::VFMSUBPD4rmY,          TB_ALIGN_32 },
     { X86::VFNMSUBSS4rr,          X86::VFNMSUBSS4rm,          0           },
     { X86::VFNMSUBSD4rr,          X86::VFNMSUBSD4rm,          0           },
     { X86::VFNMSUBPS4rr,          X86::VFNMSUBPS4rm,          TB_ALIGN_16 },
     { X86::VFNMSUBPD4rr,          X86::VFNMSUBPD4rm,          TB_ALIGN_16 },
     { X86::VFNMSUBPS4rrY,         X86::VFNMSUBPS4rmY,         TB_ALIGN_32 },
     { X86::VFNMSUBPD4rrY,         X86::VFNMSUBPD4rmY,         TB_ALIGN_32 },
     { X86::VFMADDSUBPS4rr,        X86::VFMADDSUBPS4rm,        TB_ALIGN_16 },
     { X86::VFMADDSUBPD4rr,        X86::VFMADDSUBPD4rm,        TB_ALIGN_16 },
     { X86::VFMADDSUBPS4rrY,       X86::VFMADDSUBPS4rmY,       TB_ALIGN_32 },
     { X86::VFMADDSUBPD4rrY,       X86::VFMADDSUBPD4rmY,       TB_ALIGN_32 },
     { X86::VFMSUBADDPS4rr,        X86::VFMSUBADDPS4rm,        TB_ALIGN_16 },
     { X86::VFMSUBADDPD4rr,        X86::VFMSUBADDPD4rm,        TB_ALIGN_16 },
     { X86::VFMSUBADDPS4rrY,       X86::VFMSUBADDPS4rmY,       TB_ALIGN_32 },
     { X86::VFMSUBADDPD4rrY,       X86::VFMSUBADDPD4rmY,       TB_ALIGN_32 },
     // AVX-512 VPERMI instructions with 3 source operands.
     { X86::VPERMI2Drr,            X86::VPERMI2Drm,            0 },
     { X86::VPERMI2Qrr,            X86::VPERMI2Qrm,            0 },
     { X86::VPERMI2PSrr,           X86::VPERMI2PSrm,           0 },
     { X86::VPERMI2PDrr,           X86::VPERMI2PDrm,           0 },
     { X86::VBLENDMPDZrr,          X86::VBLENDMPDZrm,          0 },
     { X86::VBLENDMPSZrr,          X86::VBLENDMPSZrm,          0 },
     { X86::VPBLENDMDZrr,          X86::VPBLENDMDZrm,          0 },
     { X86::VPBLENDMQZrr,          X86::VPBLENDMQZrm,          0 },
     { X86::VBROADCASTSSZrk,       X86::VBROADCASTSSZmk,       TB_NO_REVERSE },
     { X86::VBROADCASTSDZrk,       X86::VBROADCASTSDZmk,       TB_NO_REVERSE },
     { X86::VBROADCASTSSZ256rk,    X86::VBROADCASTSSZ256mk,    TB_NO_REVERSE },
     { X86::VBROADCASTSDZ256rk,    X86::VBROADCASTSDZ256mk,    TB_NO_REVERSE },
     { X86::VBROADCASTSSZ128rk,    X86::VBROADCASTSSZ128mk,    TB_NO_REVERSE },
      // AVX-512 arithmetic instructions
     { X86::VADDPSZrrkz,           X86::VADDPSZrmkz,           0 },
     { X86::VADDPDZrrkz,           X86::VADDPDZrmkz,           0 },
     { X86::VSUBPSZrrkz,           X86::VSUBPSZrmkz,           0 },
     { X86::VSUBPDZrrkz,           X86::VSUBPDZrmkz,           0 },
     { X86::VMULPSZrrkz,           X86::VMULPSZrmkz,           0 },
     { X86::VMULPDZrrkz,           X86::VMULPDZrmkz,           0 },
     { X86::VDIVPSZrrkz,           X86::VDIVPSZrmkz,           0 },
     { X86::VDIVPDZrrkz,           X86::VDIVPDZrmkz,           0 },
     { X86::VMINPSZrrkz,           X86::VMINPSZrmkz,           0 },
     { X86::VMINPDZrrkz,           X86::VMINPDZrmkz,           0 },
     { X86::VMAXPSZrrkz,           X86::VMAXPSZrmkz,           0 },
     { X86::VMAXPDZrrkz,           X86::VMAXPDZrmkz,           0 },
     // AVX-512{F,VL} arithmetic instructions 256-bit
     { X86::VADDPSZ256rrkz,        X86::VADDPSZ256rmkz,        0 },
     { X86::VADDPDZ256rrkz,        X86::VADDPDZ256rmkz,        0 },
     { X86::VSUBPSZ256rrkz,        X86::VSUBPSZ256rmkz,        0 },
     { X86::VSUBPDZ256rrkz,        X86::VSUBPDZ256rmkz,        0 },
     { X86::VMULPSZ256rrkz,        X86::VMULPSZ256rmkz,        0 },
     { X86::VMULPDZ256rrkz,        X86::VMULPDZ256rmkz,        0 },
     { X86::VDIVPSZ256rrkz,        X86::VDIVPSZ256rmkz,        0 },
     { X86::VDIVPDZ256rrkz,        X86::VDIVPDZ256rmkz,        0 },
     { X86::VMINPSZ256rrkz,        X86::VMINPSZ256rmkz,        0 },
     { X86::VMINPDZ256rrkz,        X86::VMINPDZ256rmkz,        0 },
     { X86::VMAXPSZ256rrkz,        X86::VMAXPSZ256rmkz,        0 },
     { X86::VMAXPDZ256rrkz,        X86::VMAXPDZ256rmkz,        0 },
     // AVX-512{F,VL} arithmetic instructions 128-bit
     { X86::VADDPSZ128rrkz,        X86::VADDPSZ128rmkz,        0 },
     { X86::VADDPDZ128rrkz,        X86::VADDPDZ128rmkz,        0 },
     { X86::VSUBPSZ128rrkz,        X86::VSUBPSZ128rmkz,        0 },
     { X86::VSUBPDZ128rrkz,        X86::VSUBPDZ128rmkz,        0 },
     { X86::VMULPSZ128rrkz,        X86::VMULPSZ128rmkz,        0 },
     { X86::VMULPDZ128rrkz,        X86::VMULPDZ128rmkz,        0 },
     { X86::VDIVPSZ128rrkz,        X86::VDIVPSZ128rmkz,        0 },
     { X86::VDIVPDZ128rrkz,        X86::VDIVPDZ128rmkz,        0 },
     { X86::VMINPSZ128rrkz,        X86::VMINPSZ128rmkz,        0 },
     { X86::VMINPDZ128rrkz,        X86::VMINPDZ128rmkz,        0 },
     { X86::VMAXPSZ128rrkz,        X86::VMAXPSZ128rmkz,        0 },
     { X86::VMAXPDZ128rrkz,        X86::VMAXPDZ128rmkz,        0 }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
     unsigned RegOp = OpTbl3[i].RegOp;
     unsigned MemOp = OpTbl3[i].MemOp;
     unsigned Flags = OpTbl3[i].Flags;
     AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
                   RegOp, MemOp,
                   // Index 3, folded load
                   Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
   }
 
   static const X86OpTblEntry OpTbl4[] = {
      // AVX-512 foldable instructions
     { X86::VADDPSZrrk,         X86::VADDPSZrmk,           0 },
     { X86::VADDPDZrrk,         X86::VADDPDZrmk,           0 },
     { X86::VSUBPSZrrk,         X86::VSUBPSZrmk,           0 },
     { X86::VSUBPDZrrk,         X86::VSUBPDZrmk,           0 },
     { X86::VMULPSZrrk,         X86::VMULPSZrmk,           0 },
     { X86::VMULPDZrrk,         X86::VMULPDZrmk,           0 },
     { X86::VDIVPSZrrk,         X86::VDIVPSZrmk,           0 },
     { X86::VDIVPDZrrk,         X86::VDIVPDZrmk,           0 },
     { X86::VMINPSZrrk,         X86::VMINPSZrmk,           0 },
     { X86::VMINPDZrrk,         X86::VMINPDZrmk,           0 },
     { X86::VMAXPSZrrk,         X86::VMAXPSZrmk,           0 },
     { X86::VMAXPDZrrk,         X86::VMAXPDZrmk,           0 },
     // AVX-512{F,VL} foldable instructions 256-bit
     { X86::VADDPSZ256rrk,      X86::VADDPSZ256rmk,        0 },
     { X86::VADDPDZ256rrk,      X86::VADDPDZ256rmk,        0 },
     { X86::VSUBPSZ256rrk,      X86::VSUBPSZ256rmk,        0 },
     { X86::VSUBPDZ256rrk,      X86::VSUBPDZ256rmk,        0 },
     { X86::VMULPSZ256rrk,      X86::VMULPSZ256rmk,        0 },
     { X86::VMULPDZ256rrk,      X86::VMULPDZ256rmk,        0 },
     { X86::VDIVPSZ256rrk,      X86::VDIVPSZ256rmk,        0 },
     { X86::VDIVPDZ256rrk,      X86::VDIVPDZ256rmk,        0 },
     { X86::VMINPSZ256rrk,      X86::VMINPSZ256rmk,        0 },
     { X86::VMINPDZ256rrk,      X86::VMINPDZ256rmk,        0 },
     { X86::VMAXPSZ256rrk,      X86::VMAXPSZ256rmk,        0 },
     { X86::VMAXPDZ256rrk,      X86::VMAXPDZ256rmk,        0 },
     // AVX-512{F,VL} foldable instructions 128-bit
     { X86::VADDPSZ128rrk,      X86::VADDPSZ128rmk,        0 },
     { X86::VADDPDZ128rrk,      X86::VADDPDZ128rmk,        0 },
     { X86::VSUBPSZ128rrk,      X86::VSUBPSZ128rmk,        0 },
     { X86::VSUBPDZ128rrk,      X86::VSUBPDZ128rmk,        0 },
     { X86::VMULPSZ128rrk,      X86::VMULPSZ128rmk,        0 },
     { X86::VMULPDZ128rrk,      X86::VMULPDZ128rmk,        0 },
     { X86::VDIVPSZ128rrk,      X86::VDIVPSZ128rmk,        0 },
     { X86::VDIVPDZ128rrk,      X86::VDIVPDZ128rmk,        0 },
     { X86::VMINPSZ128rrk,      X86::VMINPSZ128rmk,        0 },
     { X86::VMINPDZ128rrk,      X86::VMINPDZ128rmk,        0 },
     { X86::VMAXPSZ128rrk,      X86::VMAXPSZ128rmk,        0 },
     { X86::VMAXPDZ128rrk,      X86::VMAXPDZ128rmk,        0 }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl4); i != e; ++i) {
     unsigned RegOp = OpTbl4[i].RegOp;
     unsigned MemOp = OpTbl4[i].MemOp;
     unsigned Flags = OpTbl4[i].Flags;
     AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
                   RegOp, MemOp,
                   // Index 4, folded load
                   Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
   }
 }
 
 /// AddTableEntry - Record one RegOp <-> MemOp pairing in the folding tables.
 /// The forward (register -> memory) mapping is stored in R2MTable unless
 /// Flags contains TB_NO_FORWARD; the reverse (memory -> register) mapping is
 /// stored in M2RTable unless Flags contains TB_NO_REVERSE. Each stored entry
 /// keeps the full Flags value alongside the partner opcode.
 void
 X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
                             MemOp2RegOpTableType &M2RTable,
                             unsigned RegOp, unsigned MemOp, unsigned Flags) {
   const bool RecordForward = !(Flags & TB_NO_FORWARD);
   const bool RecordReverse = !(Flags & TB_NO_REVERSE);
 
   if (RecordForward) {
     // A register opcode may only be folded one way per table.
     assert(!R2MTable.count(RegOp) && "Duplicate entry!");
     R2MTable[RegOp] = std::make_pair(MemOp, Flags);
   }
 
   if (RecordReverse) {
     // Likewise a memory opcode may only unfold to a single register form.
     assert(!M2RTable.count(MemOp) &&
          "Duplicated entries in unfolding maps?");
     M2RTable[MemOp] = std::make_pair(RegOp, Flags);
   }
 }
 
 /// isCoalescableExtInstr - Recognize the register-to-register MOVSX/MOVZX
 /// forms listed below. On success, SrcReg and DstReg receive operands 1 and 0
 /// of MI and SubIdx receives the sub-register index matching the width of
 /// the source (sub_8bit, sub_16bit or sub_32bit). The output parameters are
 /// left untouched when false is returned.
 bool
 X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                     unsigned &SrcReg, unsigned &DstReg,
                                     unsigned &SubIdx) const {
   unsigned SourceSubIdx;
   switch (MI.getOpcode()) {
   default:
     return false;
 
   case X86::MOVSX16rr8:
   case X86::MOVZX16rr8:
   case X86::MOVSX32rr8:
   case X86::MOVZX32rr8:
   case X86::MOVSX64rr8:
     // It's not always legal to reference the low 8-bit of the larger
     // register in 32-bit mode.
     if (!Subtarget.is64Bit())
       return false;
     SourceSubIdx = X86::sub_8bit;
     break;
 
   case X86::MOVSX32rr16:
   case X86::MOVZX32rr16:
   case X86::MOVSX64rr16:
     SourceSubIdx = X86::sub_16bit;
     break;
 
   case X86::MOVSX64rr32:
     SourceSubIdx = X86::sub_32bit;
     break;
   }
 
   // Be conservative: give up when either operand already names a
   // sub-register.
   if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
     return false;
 
   SrcReg = MI.getOperand(1).getReg();
   DstReg = MI.getOperand(0).getReg();
   SubIdx = SourceSubIdx;
   return true;
 }
 
+/// getSPAdjust - Return the stack-pointer adjustment attributed to MI.
+///
+///  * Call-frame setup/destroy pseudos: operand 0 rounded up to the stack
+///    alignment, minus operand 1. The result is returned as-is for the setup
+///    opcode and negated for the destroy opcode.
+///  * Calls: the negated operand 1 of the next call-frame-destroy pseudo in
+///    the same block, or 0 if another call or the end of the block is reached
+///    first.
+///  * The 32-bit PUSH forms listed below: 4.
+///  * Anything else: 0.
+int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const {
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+
+  if (MI->getOpcode() == getCallFrameSetupOpcode() ||
+      MI->getOpcode() == getCallFrameDestroyOpcode()) {
+    unsigned StackAlign = TFI->getStackAlignment();
+    // Round the frame size (operand 0) up to a multiple of the alignment.
+    int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign *
+                 StackAlign;
+
+    SPAdj -= MI->getOperand(1).getImm();
+
+    if (MI->getOpcode() == getCallFrameSetupOpcode())
+      return SPAdj;
+    else
+      return -SPAdj;
+  }
+
+  // To know whether a call adjusts the stack, we need information
+  // that is bound to the following ADJCALLSTACKUP pseudo.
+  // Look for the next ADJCALLSTACKUP that follows the call.
+  if (MI->isCall()) {
+    const MachineBasicBlock* MBB = MI->getParent();
+    auto I = ++MachineBasicBlock::const_iterator(MI);
+    for (auto E = MBB->end(); I != E; ++I) {
+      if (I->getOpcode() == getCallFrameDestroyOpcode() ||
+          I->isCall())
+        break;
+    }
+
+    // If we could not find a frame destroy opcode, then it has already
+    // been simplified, so we don't care. The scan may also have run off the
+    // end of the block, in which case I must not be dereferenced.
+    if (I == MBB->end() || I->getOpcode() != getCallFrameDestroyOpcode())
+      return 0;
+
+    return -(I->getOperand(1).getImm());
+  }
+
+  // Currently handle only PUSHes we can reasonably expect to see
+  // in call sequences
+  switch (MI->getOpcode()) {
+  default:
+    return 0;
+  case X86::PUSH32i8:
+  case X86::PUSH32r:
+  case X86::PUSH32rmm:
+  case X86::PUSH32rmr:
+  case X86::PUSHi32:
+    return 4;
+  }
+}
+
 /// isFrameOperand - Return true and the FrameIndex if the specified
 /// operand and follow operands form a reference to the stack frame.
 bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
                                   int &FrameIndex) const {
   if (MI->getOperand(Op+X86::AddrBaseReg).isFI() &&
       MI->getOperand(Op+X86::AddrScaleAmt).isImm() &&
       MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
       MI->getOperand(Op+X86::AddrDisp).isImm() &&
       MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 &&
       MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 &&
       MI->getOperand(Op+X86::AddrDisp).getImm() == 0) {
     FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex();
     return true;
   }
   return false;
 }
 
 /// isFrameLoadOpcode - Return true if Opcode is one of the load opcodes that
 /// the stack-slot queries in this file accept.
 static bool isFrameLoadOpcode(int Opcode) {
   switch (Opcode) {
   default:
     break;
   // General purpose and x87 loads.
   case X86::MOV8rm:
   case X86::MOV16rm:
   case X86::MOV32rm:
   case X86::MOV64rm:
   case X86::LD_Fp64m:
   // SSE loads.
   case X86::MOVSSrm:
   case X86::MOVSDrm:
   case X86::MOVAPSrm:
   case X86::MOVAPDrm:
   case X86::MOVDQArm:
   // VEX-encoded 128-bit loads.
   case X86::VMOVSSrm:
   case X86::VMOVSDrm:
   case X86::VMOVAPSrm:
   case X86::VMOVAPDrm:
   case X86::VMOVDQArm:
   // VEX-encoded 256-bit loads.
   case X86::VMOVUPSYrm:
   case X86::VMOVAPSYrm:
   case X86::VMOVUPDYrm:
   case X86::VMOVAPDYrm:
   case X86::VMOVDQUYrm:
   case X86::VMOVDQAYrm:
   // MMX loads.
   case X86::MMX_MOVD64rm:
   case X86::MMX_MOVQ64rm:
   // AVX-512 loads.
   case X86::VMOVAPSZrm:
   case X86::VMOVUPSZrm:
     return true;
   }
   return false;
 }
 
 /// isFrameStoreOpcode - Return true if Opcode is one of the store opcodes
 /// that the stack-slot queries in this file accept.
 static bool isFrameStoreOpcode(int Opcode) {
   switch (Opcode) {
   // General purpose and x87 stores.
   case X86::MOV8mr:
   case X86::MOV16mr:
   case X86::MOV32mr:
   case X86::MOV64mr:
   case X86::ST_FpP64m:
   // SSE stores.
   case X86::MOVSSmr:
   case X86::MOVSDmr:
   case X86::MOVAPSmr:
   case X86::MOVAPDmr:
   case X86::MOVDQAmr:
   // VEX-encoded 128-bit stores.
   case X86::VMOVSSmr:
   case X86::VMOVSDmr:
   case X86::VMOVAPSmr:
   case X86::VMOVAPDmr:
   case X86::VMOVDQAmr:
   // VEX-encoded 256-bit stores.
   case X86::VMOVUPSYmr:
   case X86::VMOVAPSYmr:
   case X86::VMOVUPDYmr:
   case X86::VMOVAPDYmr:
   case X86::VMOVDQUYmr:
   case X86::VMOVDQAYmr:
   // AVX-512 stores.
   case X86::VMOVUPSZmr:
   case X86::VMOVAPSZmr:
   // MMX stores.
   case X86::MMX_MOVD64mr:
   case X86::MMX_MOVQ64mr:
   case X86::MMX_MOVNTQmr:
     return true;
   default:
     return false;
   }
 }
 
 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
                                            int &FrameIndex) const {
   if (isFrameLoadOpcode(MI->getOpcode()))
     if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
       return MI->getOperand(0).getReg();
   return 0;
 }
 
 unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
                                                  int &FrameIndex) const {
   if (isFrameLoadOpcode(MI->getOpcode())) {
     unsigned Reg;
     if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
       return Reg;
     // Check for post-frame index elimination operations
     const MachineMemOperand *Dummy;
     return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
   }
   return 0;
 }
 
 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
                                           int &FrameIndex) const {
   if (isFrameStoreOpcode(MI->getOpcode()))
     if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
         isFrameOperand(MI, 0, FrameIndex))
       return MI->getOperand(X86::AddrNumOperands).getReg();
   return 0;
 }
 
 unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
                                                 int &FrameIndex) const {
   if (isFrameStoreOpcode(MI->getOpcode())) {
     unsigned Reg;
     if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
       return Reg;
     // Check for post-frame index elimination operations
     const MachineMemOperand *Dummy;
     return hasStoreToStackSlot(MI, Dummy, FrameIndex);
   }
   return 0;
 }
 
 /// regIsPICBase - Return true if register is PIC base (i.e.g defined by
 /// X86::MOVPC32r.
 static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
   // Don't waste compile time scanning use-def chains of physregs.
   if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
     return false;
   bool isPICBase = false;
   for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
          E = MRI.def_instr_end(); I != E; ++I) {
     MachineInstr *DefMI = &*I;
     if (DefMI->getOpcode() != X86::MOVPC32r)
       return false;
     assert(!isPICBase && "More than one PIC base?");
     isPICBase = true;
   }
   return isPICBase;
 }
 
 /// isReallyTriviallyReMaterializable - Decide whether MI can be trivially
 /// rematerialized.
 ///
 /// For the load opcodes listed below, the load must be invariant (per
 /// MI->isInvariantLoad(AA)), use no index register, and have a base that is
 /// either absent, RIP, or a PIC base register (see regIsPICBase). For
 /// LEA32r/LEA64r, the address must use no index register and a non-register
 /// displacement, with a base that is not a register, absent, or a PIC base
 /// register. Every other opcode is reported as rematerializable.
 bool
 X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                                 AliasAnalysis *AA) const {
   switch (MI->getOpcode()) {
   default: break;
   case X86::MOV8rm:
   case X86::MOV16rm:
   case X86::MOV32rm:
   case X86::MOV64rm:
   case X86::LD_Fp64m:
   case X86::MOVSSrm:
   case X86::MOVSDrm:
   case X86::MOVAPSrm:
   case X86::MOVUPSrm:
   case X86::MOVAPDrm:
   case X86::MOVDQArm:
   case X86::MOVDQUrm:
   case X86::VMOVSSrm:
   case X86::VMOVSDrm:
   case X86::VMOVAPSrm:
   case X86::VMOVUPSrm:
   case X86::VMOVAPDrm:
   case X86::VMOVDQArm:
   case X86::VMOVDQUrm:
   case X86::VMOVAPSYrm:
   case X86::VMOVUPSYrm:
   case X86::VMOVAPDYrm:
   case X86::VMOVDQAYrm:
   case X86::VMOVDQUYrm:
   case X86::MMX_MOVD64rm:
   case X86::MMX_MOVQ64rm:
   case X86::FsVMOVAPSrm:
   case X86::FsVMOVAPDrm:
   case X86::FsMOVAPSrm:
   case X86::FsMOVAPDrm: {
     // Loads from constant pools are trivially rematerializable.
     // The address operands start at operand 1 (operand 0 is the result).
     if (MI->getOperand(1+X86::AddrBaseReg).isReg() &&
         MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
         MI->getOperand(1+X86::AddrIndexReg).isReg() &&
         MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
         MI->isInvariantLoad(AA)) {
       unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
       // No base register, or RIP-relative: nothing else to verify.
       if (BaseReg == 0 || BaseReg == X86::RIP)
         return true;
       // Allow re-materialization of PIC load.
       // Loads with a global displacement are rejected unless
       // ReMatPICStubLoad is set.
       if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal())
         return false;
       const MachineFunction &MF = *MI->getParent()->getParent();
       const MachineRegisterInfo &MRI = MF.getRegInfo();
       return regIsPICBase(BaseReg, MRI);
     }
     return false;
   }
 
   case X86::LEA32r:
   case X86::LEA64r: {
     if (MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
         MI->getOperand(1+X86::AddrIndexReg).isReg() &&
         MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
         !MI->getOperand(1+X86::AddrDisp).isReg()) {
       // lea fi#, lea GV, etc. are all rematerializable.
       if (!MI->getOperand(1+X86::AddrBaseReg).isReg())
         return true;
       unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
       if (BaseReg == 0)
         return true;
       // Allow re-materialization of lea PICBase + x.
       const MachineFunction &MF = *MI->getParent()->getParent();
       const MachineRegisterInfo &MRI = MF.getRegInfo();
       return regIsPICBase(BaseReg, MRI);
     }
     return false;
   }
   }
 
   // All other instructions marked M_REMATERIALIZABLE are always trivially
   // rematerializable.
   return true;
 }
 
 /// isSafeToClobberEFLAGS - Return true if an instruction inserted before I
 /// in MBB may clobber EFLAGS.
 ///
 /// The check first scans forward from I (at most 4 non-debug instructions):
 /// a use of EFLAGS means "not safe", a def or register-mask clobber means
 /// "safe". If the end of the block is reached, the answer depends on whether
 /// any successor has EFLAGS live-in. Otherwise it scans backward from I (at
 /// most 4 steps) looking for a dead def, a kill, or the start of the block.
 /// If neither scan is conclusive the answer is conservatively false.
 bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I) const {
   MachineBasicBlock::iterator E = MBB.end();
 
   // For compile time consideration, if we are not able to determine the
   // safety after visiting 4 instructions in each direction, we will assume
   // it's not safe.
   MachineBasicBlock::iterator Iter = I;
   for (unsigned i = 0; Iter != E && i < 4; ++i) {
     bool SeenDef = false;
     for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
       MachineOperand &MO = Iter->getOperand(j);
       // A register mask that clobbers EFLAGS counts as a definition.
       if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
         SeenDef = true;
       if (!MO.isReg())
         continue;
       if (MO.getReg() == X86::EFLAGS) {
         // Any read of EFLAGS means the current value is still needed.
         if (MO.isUse())
           return false;
         SeenDef = true;
       }
     }
 
     if (SeenDef)
       // This instruction defines EFLAGS, no need to look any further.
       return true;
     ++Iter;
     // Skip over DBG_VALUE.
     while (Iter != E && Iter->isDebugValue())
       ++Iter;
   }
 
   // It is safe to clobber EFLAGS at the end of a block if no successor has it
   // live in.
   if (Iter == E) {
     for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
            SE = MBB.succ_end(); SI != SE; ++SI)
       if ((*SI)->isLiveIn(X86::EFLAGS))
         return false;
     return true;
   }
 
   // The forward scan was inconclusive; look backward from the insertion
   // point instead.
   MachineBasicBlock::iterator B = MBB.begin();
   Iter = I;
   for (unsigned i = 0; i < 4; ++i) {
     // If we make it to the beginning of the block, it's safe to clobber
     // EFLAGS iff EFLAGS is not live-in.
     if (Iter == B)
       return !MBB.isLiveIn(X86::EFLAGS);
 
     --Iter;
     // Skip over DBG_VALUE.
     while (Iter != B && Iter->isDebugValue())
       --Iter;
 
     bool SawKill = false;
     for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
       MachineOperand &MO = Iter->getOperand(j);
       // A register mask may clobber EFLAGS, but we should still look for a
       // live EFLAGS def.
       if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
         SawKill = true;
       if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
         // An explicit def decides the question: safe only if it is dead.
         if (MO.isDef()) return MO.isDead();
         if (MO.isKill()) SawKill = true;
       }
     }
 
     if (SawKill)
       // This instruction kills EFLAGS and doesn't redefine it, so
       // there's no need to look further.
       return true;
   }
 
   // Conservative answer.
   return false;
 }
 
 /// reMaterialize - Insert a copy of Orig before I in MBB and rewrite its
 /// result register (operand 0 of Orig) to DestReg:SubIdx.
 ///
 /// MOV32r0 is implemented with a xor which clobbers condition code, so when
 /// EFLAGS cannot be clobbered at the insertion point it is re-materialized
 /// as a MOV32ri of 0 instead of being cloned.
 void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I,
                                  unsigned DestReg, unsigned SubIdx,
                                  const MachineInstr *Orig,
                                  const TargetRegisterInfo &TRI) const {
   const bool MustPreserveFlags =
     Orig->getOpcode() == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I);
 
   if (!MustPreserveFlags) {
     // Common case: a straight clone of the original instruction.
     MachineInstr *Clone = MBB.getParent()->CloneMachineInstr(Orig);
     MBB.insert(I, Clone);
   } else {
     // Flag-preserving zeroing idiom.
     DebugLoc DL = Orig->getDebugLoc();
     BuildMI(MBB, I, DL, get(X86::MOV32ri))
       .addOperand(Orig->getOperand(0))
       .addImm(0);
   }
 
   // Whichever instruction was inserted now sits immediately before I.
   MachineInstr *Inserted = std::prev(I);
   const unsigned OrigDest = Orig->getOperand(0).getReg();
   Inserted->substituteRegister(OrigDest, DestReg, SubIdx, TRI);
 }
 
 /// hasLiveCondCodeDef - Return true if some operand of MI is a register def
 /// of EFLAGS that is not marked dead.
 static bool hasLiveCondCodeDef(MachineInstr *MI) {
   const unsigned NumOps = MI->getNumOperands();
   for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
     const MachineOperand &Op = MI->getOperand(OpIdx);
     // Only register definitions are of interest.
     if (!Op.isReg() || !Op.isDef())
       continue;
     if (Op.getReg() != X86::EFLAGS)
       continue;
     if (!Op.isDead())
       return true;
   }
   return false;
 }
 
 /// getTruncatedShiftCount - Return the immediate shift amount held in
 /// operand ShiftAmtOperandIdx of MI, masked down to the bits that are
 /// significant for the instruction's operand size.
 inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
                                               unsigned ShiftAmtOperandIdx) {
   // The shift count is six bits with the REX.W prefix and five bits without.
   const bool HasRexW = (MI->getDesc().TSFlags & X86II::REX_W) != 0;
   const unsigned SignificantBits = HasRexW ? 63 : 31;
   const unsigned RawCount = MI->getOperand(ShiftAmtOperandIdx).getImm();
   return RawCount & SignificantBits;
 }
 
 /// isTruncatedShiftCountForLEA - Return true if the given (already
 /// truncated) shift count can be expressed as the scale of a LEA
 /// instruction, i.e. it is 1, 2 or 3.
 inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
   // A left shift can become a LEA when the shift amount fits the SIB scale
   // field. That field is two bits wide, so only amounts below 4 are
   // encodable; a shift by zero is excluded as well.
   const bool IsNonZero = ShAmt != 0;
   const bool FitsScaleField = ShAmt <= 3;
   return IsNonZero && FitsScaleField;
 }
 
 /// classifyLEAReg - Work out which register, and with which flags, should
 /// be fed to an LEA of opcode Opc in place of the source operand Src of MI.
 ///
 /// \param AllowSP   selects between the GR32/GR64 register classes and their
 ///                  _NOSP variants.
 /// \param NewSrc    receives the register to use in the LEA.
 /// \param isKill    receives the kill flag to put on that register.
 /// \param isUndef   receives the undef flag to put on that register.
 /// \param ImplicitOp for LEA64_32r with a physical source register, receives
 ///                  a copy of Src marked implicit; untouched otherwise.
 /// \returns false if the register could not be constrained to the required
 ///          class or its liveness could not be determined; true otherwise.
 bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
                                   unsigned Opc, bool AllowSP,
                                   unsigned &NewSrc, bool &isKill, bool &isUndef,
                                   MachineOperand &ImplicitOp) const {
   MachineFunction &MF = *MI->getParent()->getParent();
   const TargetRegisterClass *RC;
   // Every opcode other than LEA32r (including LEA64_32r) gets a 64-bit class.
   if (AllowSP) {
     RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
   } else {
     RC = Opc != X86::LEA32r ?
       &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
   }
   unsigned SrcReg = Src.getReg();
 
   // For both LEA64 and LEA32 the register already has essentially the right
   // type (32-bit or 64-bit) we may just need to forbid SP.
   if (Opc != X86::LEA64_32r) {
     NewSrc = SrcReg;
     isKill = Src.isKill();
     isUndef = Src.isUndef();
 
     // Physical registers are passed through without a class check.
     if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
         !MF.getRegInfo().constrainRegClass(NewSrc, RC))
       return false;
 
     return true;
   }
 
   // This is for an LEA64_32r and incoming registers are 32-bit. One way or
   // another we need to add 64-bit registers to the final MI.
   if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
     // Keep the original 32-bit register around as an implicit operand.
     ImplicitOp = Src;
     ImplicitOp.setImplicit();
 
     // Use the 64-bit super-register as the actual LEA input and query its
     // liveness at MI to pick the right flags.
     NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64);
     MachineBasicBlock::LivenessQueryResult LQR =
       MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
 
     switch (LQR) {
     case MachineBasicBlock::LQR_Unknown:
       // We can't give sane liveness flags to the instruction, abandon LEA
       // formation.
       return false;
     case MachineBasicBlock::LQR_Live:
       isKill = MI->killsRegister(SrcReg);
       isUndef = false;
       break;
     default:
       // The physreg itself is dead, so we have to use it as an <undef>.
       isKill = false;
       isUndef = true;
       break;
     }
   } else {
     // Virtual register of the wrong class, we have to create a temporary 64-bit
     // vreg to feed into the LEA.
     NewSrc = MF.getRegInfo().createVirtualRegister(RC);
     // Copy Src into the low 32 bits (sub_32bit) of the new register; the
     // COPY is inserted before MI.
     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
             get(TargetOpcode::COPY))
       .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
         .addOperand(Src);
 
     // Which is obviously going to be dead after we're done with it.
     isKill = true;
     isUndef = false;
   }
 
   // We've set all the parameters without issue.
   return true;
 }
 
 /// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
 /// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting
 /// to a 32-bit superregister and then truncating back down to a 16-bit
 /// subregister.
 ///
 /// \param MIOpc opcode of the 16-bit instruction being converted; must be one
 ///              of the cases handled in the switch below.
 /// \param MFI   iterator to the basic block containing the instruction.
 /// \param MBBI  iterator to the instruction; new instructions are inserted
 ///              before it.
 /// \param LV    optional LiveVariables analysis to keep up to date.
 /// \returns the final COPY that extracts the 16-bit result into the original
 ///          destination register.
 MachineInstr *
 X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
                                            MachineFunction::iterator &MFI,
                                            MachineBasicBlock::iterator &MBBI,
                                            LiveVariables *LV) const {
   MachineInstr *MI = MBBI;
   unsigned Dest = MI->getOperand(0).getReg();
   unsigned Src = MI->getOperand(1).getReg();
   bool isDead = MI->getOperand(0).isDead();
   bool isKill = MI->getOperand(1).isKill();
 
   MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
   // The LEA always produces a 32-bit result; its input width depends on the
   // subtarget.
   unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
   unsigned Opc, leaInReg;
   if (Subtarget.is64Bit()) {
     Opc = X86::LEA64_32r;
     leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
   } else {
     Opc = X86::LEA32r;
     leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
   }
 
   // Build and insert into an implicit UNDEF value. This is OK because
   // we'll be shifting and then extracting the lower 16-bits.
   // This has the potential to cause partial register stall. e.g.
   //   movw    (%rbp,%rcx,2), %dx
   //   leal    -65(%rdx), %esi
   // But testing has shown this *does* help performance in 64-bit mode (at
   // least on modern x86 machines).
   BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
   MachineInstr *InsMI =
     BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
     .addReg(leaInReg, RegState::Define, X86::sub_16bit)
     .addReg(Src, getKillRegState(isKill));
 
   // Start the LEA; its address operands are filled in per opcode below.
   MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
                                     get(Opc), leaOutReg);
   switch (MIOpc) {
   default: llvm_unreachable("Unreachable!");
   case X86::SHL16ri: {
     // Shift by an immediate: no base, scale of 1 << ShAmt, leaInReg as index.
     unsigned ShAmt = MI->getOperand(2).getImm();
     MIB.addReg(0).addImm(1 << ShAmt)
        .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
     break;
   }
   case X86::INC16r:
     addRegOffset(MIB, leaInReg, true, 1);
     break;
   case X86::DEC16r:
     addRegOffset(MIB, leaInReg, true, -1);
     break;
   case X86::ADD16ri:
   case X86::ADD16ri8:
   case X86::ADD16ri_DB:
   case X86::ADD16ri8_DB:
     addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
     break;
   case X86::ADD16rr:
   case X86::ADD16rr_DB: {
     unsigned Src2 = MI->getOperand(2).getReg();
     bool isKill2 = MI->getOperand(2).isKill();
     unsigned leaInReg2 = 0;
     MachineInstr *InsMI2 = nullptr;
     if (Src == Src2) {
       // ADD16rr %reg1028<kill>, %reg1028
       // just a single insert_subreg.
       addRegReg(MIB, leaInReg, true, leaInReg, false);
     } else {
       if (Subtarget.is64Bit())
         leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
       else
         leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
       // Build and insert into an implicit UNDEF value. This is OK because
       // we'll be shifting and then extracting the lower 16-bits.
       // These are inserted before the LEA itself (&*MIB), not before MBBI.
       BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2);
       InsMI2 =
         BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY))
         .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
         .addReg(Src2, getKillRegState(isKill2));
       addRegReg(MIB, leaInReg, true, leaInReg2, true);
     }
     // The kill of Src2 now happens at the COPY that reads it.
     if (LV && isKill2 && InsMI2)
       LV->replaceKillInstruction(Src2, MI, InsMI2);
     break;
   }
   }
 
   MachineInstr *NewMI = MIB;
   // Extract the low 16 bits of the LEA result into the original destination.
   MachineInstr *ExtMI =
     BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
     .addReg(Dest, RegState::Define | getDeadRegState(isDead))
     .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
 
   if (LV) {
     // Update live variables
     LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
     LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
     if (isKill)
       LV->replaceKillInstruction(Src, MI, InsMI);
     if (isDead)
       LV->replaceKillInstruction(Dest, MI, ExtMI);
   }
 
   return ExtMI;
 }
 
 /// convertToThreeAddress - This method must be implemented by targets that
 /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
 /// may be able to convert a two-address instruction into a true
 /// three-address instruction on demand.  This allows the X86 target (for
 /// example) to convert ADD and SHL instructions into LEA instructions if they
 /// would require register copies due to two-addressness.
 ///
 /// The opcodes handled are SHL-by-immediate, INC, DEC, ADD reg+reg and
 /// ADD reg+imm (including the _DB variants of ADD) in 16-, 32- and 64-bit
 /// widths; every other opcode is rejected.  Because DisableLEA16 is hard-coded
 /// to true, the 16-bit forms are forwarded to convertToThreeAddressWithLEA on
 /// 64-bit subtargets and rejected otherwise, which leaves the LEA16r code
 /// after each of those early returns unreachable for now.
 ///
 /// \param MFI  The basic block that receives the new instruction.
 /// \param MBBI Iterator to the two-address instruction being converted; the
 ///             new instruction is inserted immediately before it.
 /// \param LV   Optional live-variable information.  When non-null, kill/dead
 ///             records that referred to the old instruction are redirected to
 ///             the new one.
 ///
 /// This method returns a null pointer if the transformation cannot be
 /// performed, otherwise it returns the new instruction.  This function itself
 /// does not erase the original instruction.
 ///
 MachineInstr *
 X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                                     MachineBasicBlock::iterator &MBBI,
                                     LiveVariables *LV) const {
   MachineInstr *MI = MBBI;
 
   // The following opcodes also sets the condition code register(s). Only
   // convert them to equivalent lea if the condition code register def's
   // are dead!
   if (hasLiveCondCodeDef(MI))
     return nullptr;
 
   MachineFunction &MF = *MI->getParent()->getParent();
   // All instructions input are two-addr instructions.  Get the known operands.
   const MachineOperand &Dest = MI->getOperand(0);
   const MachineOperand &Src = MI->getOperand(1);
 
   MachineInstr *NewMI = nullptr;
   // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
   // we have better subtarget support, enable the 16-bit LEA generation here.
   // 16-bit LEA is also slow on Core2.
   bool DisableLEA16 = true;
   bool is64Bit = Subtarget.is64Bit();
 
   unsigned MIOpc = MI->getOpcode();
   switch (MIOpc) {
   default: return nullptr;
   case X86::SHL64ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     // Only shift amounts accepted by isTruncatedShiftCountForLEA can be
     // folded into the LEA scale factor.
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
     if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
 
     // LEA can't handle RSP.
     if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
         !MF.getRegInfo().constrainRegClass(Src.getReg(),
                                            &X86::GR64_NOSPRegClass))
       return nullptr;
 
     // Address operands (assuming the usual X86 base, scale, index,
     // displacement, segment order): no base, scale = 1 << ShAmt,
     // index = Src, zero displacement, no segment.
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
       .addOperand(Dest)
       .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
     break;
   }
   case X86::SHL32ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
     if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
 
     // 64-bit subtargets use LEA64_32r for 32-bit results; others use LEA32r.
     unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
     // LEA can't handle ESP.
     bool isKill, isUndef;
     unsigned SrcReg;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
                         SrcReg, isKill, isUndef, ImplicitOp))
       return nullptr;
 
     MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
       .addOperand(Dest)
       .addReg(0).addImm(1 << ShAmt)
       .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
       .addImm(0).addReg(0);
     // classifyLEAReg may hand back an extra implicit operand; it is attached
     // only when it names a real register.
     if (ImplicitOp.getReg() != 0)
       MIB.addOperand(ImplicitOp);
     NewMI = MIB;
 
     break;
   }
   case X86::SHL16ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
     if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
 
     // With DisableLEA16 set, 16-bit ops are delegated to the wider-LEA helper
     // on 64-bit subtargets and left unconverted otherwise.
     if (DisableLEA16)
       return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr;
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
       .addOperand(Dest)
       .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
     break;
   }
   case X86::INC64r:
   case X86::INC32r: {
     assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
     // INC becomes an LEA of Src with a displacement of +1.
     unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
       : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
     bool isKill, isUndef;
     unsigned SrcReg;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
                         SrcReg, isKill, isUndef, ImplicitOp))
       return nullptr;
 
     MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
         .addOperand(Dest)
         .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
     if (ImplicitOp.getReg() != 0)
       MIB.addOperand(ImplicitOp);
 
     NewMI = addOffset(MIB, 1);
     break;
   }
   case X86::INC16r:
     if (DisableLEA16)
       return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
                      : nullptr;
     assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
     NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                       .addOperand(Dest).addOperand(Src), 1);
     break;
   case X86::DEC64r:
   case X86::DEC32r: {
     assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
     // DEC becomes an LEA of Src with a displacement of -1.
     unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
       : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
 
     bool isKill, isUndef;
     unsigned SrcReg;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
                         SrcReg, isKill, isUndef, ImplicitOp))
       return nullptr;
 
     MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
         .addOperand(Dest)
         .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
     if (ImplicitOp.getReg() != 0)
       MIB.addOperand(ImplicitOp);
 
     NewMI = addOffset(MIB, -1);
 
     break;
   }
   case X86::DEC16r:
     if (DisableLEA16)
       return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
                      : nullptr;
     assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
     NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                       .addOperand(Dest).addOperand(Src), -1);
     break;
   case X86::ADD64rr:
   case X86::ADD64rr_DB:
   case X86::ADD32rr:
   case X86::ADD32rr_DB: {
     assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
     unsigned Opc;
     if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
       Opc = X86::LEA64r;
     else
       Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
     // The first source is classified with AllowSP=true, the second with
     // AllowSP=false.
     bool isKill, isUndef;
     unsigned SrcReg;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
                         SrcReg, isKill, isUndef, ImplicitOp))
       return nullptr;
 
     const MachineOperand &Src2 = MI->getOperand(2);
     bool isKill2, isUndef2;
     unsigned SrcReg2;
     MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
                         SrcReg2, isKill2, isUndef2, ImplicitOp2))
       return nullptr;
 
     MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
       .addOperand(Dest);
     if (ImplicitOp.getReg() != 0)
       MIB.addOperand(ImplicitOp);
     if (ImplicitOp2.getReg() != 0)
       MIB.addOperand(ImplicitOp2);
 
     NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
 
     // Preserve undefness of the operands.
     NewMI->getOperand(1).setIsUndef(isUndef);
     NewMI->getOperand(3).setIsUndef(isUndef2);
 
     // The kill of the second source is moved here; the first source and the
     // destination are handled by the common code after the switch.
     if (LV && Src2.isKill())
       LV->replaceKillInstruction(SrcReg2, MI, NewMI);
     break;
   }
   case X86::ADD16rr:
   case X86::ADD16rr_DB: {
     if (DisableLEA16)
       return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
                      : nullptr;
     assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
     unsigned Src2 = MI->getOperand(2).getReg();
     bool isKill2 = MI->getOperand(2).isKill();
     NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                       .addOperand(Dest),
                       Src.getReg(), Src.isKill(), Src2, isKill2);
 
     // Preserve undefness of the operands.
     bool isUndef = MI->getOperand(1).isUndef();
     bool isUndef2 = MI->getOperand(2).isUndef();
     NewMI->getOperand(1).setIsUndef(isUndef);
     NewMI->getOperand(3).setIsUndef(isUndef2);
 
     if (LV && isKill2)
       LV->replaceKillInstruction(Src2, MI, NewMI);
     break;
   }
   case X86::ADD64ri32:
   case X86::ADD64ri8:
   case X86::ADD64ri32_DB:
   case X86::ADD64ri8_DB:
     assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
     // 64-bit add-immediate: the source operand is reused as-is and the
     // immediate becomes the LEA displacement.
     NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
                       .addOperand(Dest).addOperand(Src),
                       MI->getOperand(2).getImm());
     break;
   case X86::ADD32ri:
   case X86::ADD32ri8:
   case X86::ADD32ri_DB:
   case X86::ADD32ri8_DB: {
     assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
     unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
     bool isKill, isUndef;
     unsigned SrcReg;
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
                         SrcReg, isKill, isUndef, ImplicitOp))
       return nullptr;
 
     MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
         .addOperand(Dest)
         .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
     if (ImplicitOp.getReg() != 0)
       MIB.addOperand(ImplicitOp);
 
     NewMI = addOffset(MIB, MI->getOperand(2).getImm());
     break;
   }
   case X86::ADD16ri:
   case X86::ADD16ri8:
   case X86::ADD16ri_DB:
   case X86::ADD16ri8_DB:
     if (DisableLEA16)
       return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
                      : nullptr;
     assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
     NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                       .addOperand(Dest).addOperand(Src),
                       MI->getOperand(2).getImm());
     break;
   }
 
   if (!NewMI) return nullptr;
 
   // Common tail for every case that built NewMI directly: move the kill of the
   // first source and the dead-def of the destination from MI to NewMI.
   if (LV) {  // Update live variables
     if (Src.isKill())
       LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
     if (Dest.isDead())
       LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
   }
 
   MFI->insert(MBBI, NewMI);          // Insert the new inst
   return NewMI;
 }
 
 /// commuteInstruction - We have a few instructions that must be hacked on to
 /// commute them.
 ///
 /// Three families need more than an operand swap:
 ///  - SHLD/SHRD with an immediate: the opcode is switched to the opposite
 ///    direction and the immediate becomes (operand size - immediate).
 ///  - Immediate blends: the immediate's selector bits are inverted within the
 ///    per-opcode mask.
 ///  - Register-register CMOVcc: the opcode is switched to the inverse
 ///    condition.
 /// After that fix-up (or directly, for any other opcode) the operand swap is
 /// left to TargetInstrInfo::commuteInstruction.
 ///
 /// \param MI    The instruction to commute.
 /// \param NewMI If true, the special cases above first clone MI and apply the
 ///              fix-up to the clone, then call the generic implementation with
 ///              NewMI == false so that (presumably) it modifies that clone in
 ///              place rather than cloning again.
 /// \returns Whatever TargetInstrInfo::commuteInstruction returns.
 ///
 MachineInstr *
 X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
   switch (MI->getOpcode()) {
   case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
   case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
   case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
   case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
   case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
   case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
     // Pick the opposite-direction opcode and the operand width in bits.
     unsigned Opc;
     unsigned Size;
     switch (MI->getOpcode()) {
     default: llvm_unreachable("Unreachable!");
     case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
     case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
     case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
     case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
     case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
     case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
     }
     // Read the shift amount before MI is possibly replaced by a clone.
     unsigned Amt = MI->getOperand(3).getImm();
     if (NewMI) {
       // The caller asked for a new instruction: edit a clone and tell the
       // generic code not to clone again.
       MachineFunction &MF = *MI->getParent()->getParent();
       MI = MF.CloneMachineInstr(MI);
       NewMI = false;
     }
     MI->setDesc(get(Opc));
     MI->getOperand(3).setImm(Size-Amt);
     return TargetInstrInfo::commuteInstruction(MI, NewMI);
   }
   case X86::BLENDPDrri:
   case X86::BLENDPSrri:
   case X86::PBLENDWrri:
   case X86::VBLENDPDrri:
   case X86::VBLENDPSrri:
   case X86::VBLENDPDYrri:
   case X86::VBLENDPSYrri:
   case X86::VPBLENDDrri:
   case X86::VPBLENDWrri:
   case X86::VPBLENDDYrri:
   case X86::VPBLENDWYrri:{
     // Mask covers the immediate bits that are significant for each opcode.
     unsigned Mask;
     switch (MI->getOpcode()) {
     default: llvm_unreachable("Unreachable!");
     case X86::BLENDPDrri:    Mask = 0x03; break;
     case X86::BLENDPSrri:    Mask = 0x0F; break;
     case X86::PBLENDWrri:    Mask = 0xFF; break;
     case X86::VBLENDPDrri:   Mask = 0x03; break;
     case X86::VBLENDPSrri:   Mask = 0x0F; break;
     case X86::VBLENDPDYrri:  Mask = 0x0F; break;
     case X86::VBLENDPSYrri:  Mask = 0xFF; break;
     case X86::VPBLENDDrri:   Mask = 0x0F; break;
     case X86::VPBLENDWrri:   Mask = 0xFF; break;
     case X86::VPBLENDDYrri:  Mask = 0xFF; break;
     case X86::VPBLENDWYrri:  Mask = 0xFF; break;
     }
     // Only the least significant bits of Imm are used.
     unsigned Imm = MI->getOperand(3).getImm() & Mask;
     if (NewMI) {
       MachineFunction &MF = *MI->getParent()->getParent();
       MI = MF.CloneMachineInstr(MI);
       NewMI = false;
     }
     // Flip every significant selector bit so that, once the two sources are
     // swapped, each element is still taken from the same original source.
     MI->getOperand(3).setImm(Mask ^ Imm);
     return TargetInstrInfo::commuteInstruction(MI, NewMI);
   }
   case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
   case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
   case X86::CMOVE16rr:  case X86::CMOVE32rr:  case X86::CMOVE64rr:
   case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
   case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
   case X86::CMOVA16rr:  case X86::CMOVA32rr:  case X86::CMOVA64rr:
   case X86::CMOVL16rr:  case X86::CMOVL32rr:  case X86::CMOVL64rr:
   case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
   case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
   case X86::CMOVG16rr:  case X86::CMOVG32rr:  case X86::CMOVG64rr:
   case X86::CMOVS16rr:  case X86::CMOVS32rr:  case X86::CMOVS64rr:
   case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
   case X86::CMOVP16rr:  case X86::CMOVP32rr:  case X86::CMOVP64rr:
   case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
   case X86::CMOVO16rr:  case X86::CMOVO32rr:  case X86::CMOVO64rr:
   case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
     // Map each conditional move to the same-width move on the inverse
     // condition.
     unsigned Opc;
     switch (MI->getOpcode()) {
     default: llvm_unreachable("Unreachable!");
     case X86::CMOVB16rr:  Opc = X86::CMOVAE16rr; break;
     case X86::CMOVB32rr:  Opc = X86::CMOVAE32rr; break;
     case X86::CMOVB64rr:  Opc = X86::CMOVAE64rr; break;
     case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
     case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
     case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
     case X86::CMOVE16rr:  Opc = X86::CMOVNE16rr; break;
     case X86::CMOVE32rr:  Opc = X86::CMOVNE32rr; break;
     case X86::CMOVE64rr:  Opc = X86::CMOVNE64rr; break;
     case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
     case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
     case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
     case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
     case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
     case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
     case X86::CMOVA16rr:  Opc = X86::CMOVBE16rr; break;
     case X86::CMOVA32rr:  Opc = X86::CMOVBE32rr; break;
     case X86::CMOVA64rr:  Opc = X86::CMOVBE64rr; break;
     case X86::CMOVL16rr:  Opc = X86::CMOVGE16rr; break;
     case X86::CMOVL32rr:  Opc = X86::CMOVGE32rr; break;
     case X86::CMOVL64rr:  Opc = X86::CMOVGE64rr; break;
     case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
     case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
     case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
     case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
     case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
     case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
     case X86::CMOVG16rr:  Opc = X86::CMOVLE16rr; break;
     case X86::CMOVG32rr:  Opc = X86::CMOVLE32rr; break;
     case X86::CMOVG64rr:  Opc = X86::CMOVLE64rr; break;
     case X86::CMOVS16rr:  Opc = X86::CMOVNS16rr; break;
     case X86::CMOVS32rr:  Opc = X86::CMOVNS32rr; break;
     case X86::CMOVS64rr:  Opc = X86::CMOVNS64rr; break;
     case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
     case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
     case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
     case X86::CMOVP16rr:  Opc = X86::CMOVNP16rr; break;
     case X86::CMOVP32rr:  Opc = X86::CMOVNP32rr; break;
     case X86::CMOVP64rr:  Opc = X86::CMOVNP64rr; break;
     case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
     case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
     case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
     case X86::CMOVO16rr:  Opc = X86::CMOVNO16rr; break;
     case X86::CMOVO32rr:  Opc = X86::CMOVNO32rr; break;
     case X86::CMOVO64rr:  Opc = X86::CMOVNO64rr; break;
     case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
     case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
     case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
     }
     if (NewMI) {
       MachineFunction &MF = *MI->getParent()->getParent();
       MI = MF.CloneMachineInstr(MI);
       NewMI = false;
     }
     MI->setDesc(get(Opc));
     // Fallthrough intended.
     // The generic call in the default label below performs the operand swap
     // on the (possibly cloned) instruction.
   }
   default:
     return TargetInstrInfo::commuteInstruction(MI, NewMI);
   }
 }
 
 /// Report which two source operands of \p MI may be exchanged.
 ///
 /// Immediate blends commute operands 1 and 2; the "231" register forms of the
 /// FMA family commute operands 2 and 3.  Every other opcode is answered by
 /// the generic TargetInstrInfo implementation.
 bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                                          unsigned &SrcOpIdx2) const {
   // Index of the first commutable operand; the second one always follows it.
   unsigned FirstIdx;
 
   switch (MI->getOpcode()) {
   default:
     return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
 
   // Blends with an immediate selector.
   case X86::BLENDPDrri:
   case X86::BLENDPSrri:
   case X86::PBLENDWrri:
   case X86::VBLENDPDrri:
   case X86::VBLENDPSrri:
   case X86::VBLENDPDYrri:
   case X86::VBLENDPSYrri:
   case X86::VPBLENDDrri:
   case X86::VPBLENDDYrri:
   case X86::VPBLENDWrri:
   case X86::VPBLENDWYrri:
     FirstIdx = 1;
     break;
 
   // 128-bit and scalar FMA, 231 form.
   case X86::VFMADDPDr231r:
   case X86::VFMADDPSr231r:
   case X86::VFMADDSDr231r:
   case X86::VFMADDSSr231r:
   case X86::VFMSUBPDr231r:
   case X86::VFMSUBPSr231r:
   case X86::VFMSUBSDr231r:
   case X86::VFMSUBSSr231r:
   case X86::VFNMADDPDr231r:
   case X86::VFNMADDPSr231r:
   case X86::VFNMADDSDr231r:
   case X86::VFNMADDSSr231r:
   case X86::VFNMSUBPDr231r:
   case X86::VFNMSUBPSr231r:
   case X86::VFNMSUBSDr231r:
   case X86::VFNMSUBSSr231r:
   // 256-bit FMA, 231 form.
   case X86::VFMADDPDr231rY:
   case X86::VFMADDPSr231rY:
   case X86::VFMSUBPDr231rY:
   case X86::VFMSUBPSr231rY:
   case X86::VFNMADDPDr231rY:
   case X86::VFNMADDPSr231rY:
   case X86::VFNMSUBPDr231rY:
   case X86::VFNMSUBPSr231rY:
     FirstIdx = 2;
     break;
   }
 
   SrcOpIdx1 = FirstIdx;
   SrcOpIdx2 = FirstIdx + 1;
   return true;
 }
 
 /// Translate a conditional-jump opcode into the condition code it tests.
 /// Any opcode that is not one of the listed Jcc_1 forms yields COND_INVALID.
 static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
   switch (BrOpc) {
   // Equality.
   case X86::JE_1:
     return X86::COND_E;
   case X86::JNE_1:
     return X86::COND_NE;
   // Signed comparisons.
   case X86::JL_1:
     return X86::COND_L;
   case X86::JLE_1:
     return X86::COND_LE;
   case X86::JG_1:
     return X86::COND_G;
   case X86::JGE_1:
     return X86::COND_GE;
   // Unsigned comparisons.
   case X86::JB_1:
     return X86::COND_B;
   case X86::JBE_1:
     return X86::COND_BE;
   case X86::JA_1:
     return X86::COND_A;
   case X86::JAE_1:
     return X86::COND_AE;
   // Sign, parity and overflow flags.
   case X86::JS_1:
     return X86::COND_S;
   case X86::JNS_1:
     return X86::COND_NS;
   case X86::JP_1:
     return X86::COND_P;
   case X86::JNP_1:
     return X86::COND_NP;
   case X86::JO_1:
     return X86::COND_O;
   case X86::JNO_1:
     return X86::COND_NO;
   default:
     break;
   }
   return X86::COND_INVALID;
 }
 
 /// getCondFromSETOpc - Translate a SETcc opcode (register or memory form)
 /// into its condition code; anything else yields COND_INVALID.
 static X86::CondCode getCondFromSETOpc(unsigned Opc) {
   switch (Opc) {
   case X86::SETAr:
   case X86::SETAm:
     return X86::COND_A;
   case X86::SETAEr:
   case X86::SETAEm:
     return X86::COND_AE;
   case X86::SETBr:
   case X86::SETBm:
     return X86::COND_B;
   case X86::SETBEr:
   case X86::SETBEm:
     return X86::COND_BE;
   case X86::SETEr:
   case X86::SETEm:
     return X86::COND_E;
   case X86::SETGr:
   case X86::SETGm:
     return X86::COND_G;
   case X86::SETGEr:
   case X86::SETGEm:
     return X86::COND_GE;
   case X86::SETLr:
   case X86::SETLm:
     return X86::COND_L;
   case X86::SETLEr:
   case X86::SETLEm:
     return X86::COND_LE;
   case X86::SETNEr:
   case X86::SETNEm:
     return X86::COND_NE;
   case X86::SETNOr:
   case X86::SETNOm:
     return X86::COND_NO;
   case X86::SETNPr:
   case X86::SETNPm:
     return X86::COND_NP;
   case X86::SETNSr:
   case X86::SETNSm:
     return X86::COND_NS;
   case X86::SETOr:
   case X86::SETOm:
     return X86::COND_O;
   case X86::SETPr:
   case X86::SETPm:
     return X86::COND_P;
   case X86::SETSr:
   case X86::SETSm:
     return X86::COND_S;
   default:
     break;
   }
   return X86::COND_INVALID;
 }
 
 /// getCondFromCMovOpc - Translate a CMOVcc opcode into its condition code.
 /// Register (rr) and memory (rm) forms of all three operand widths are
 /// recognised; any other opcode yields COND_INVALID.
 X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
   switch (Opc) {
   case X86::CMOVA16rr:  case X86::CMOVA32rr:  case X86::CMOVA64rr:
   case X86::CMOVA16rm:  case X86::CMOVA32rm:  case X86::CMOVA64rm:
     return X86::COND_A;
 
   case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
   case X86::CMOVAE16rm: case X86::CMOVAE32rm: case X86::CMOVAE64rm:
     return X86::COND_AE;
 
   case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
   case X86::CMOVB16rm:  case X86::CMOVB32rm:  case X86::CMOVB64rm:
     return X86::COND_B;
 
   case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
   case X86::CMOVBE16rm: case X86::CMOVBE32rm: case X86::CMOVBE64rm:
     return X86::COND_BE;
 
   case X86::CMOVE16rr:  case X86::CMOVE32rr:  case X86::CMOVE64rr:
   case X86::CMOVE16rm:  case X86::CMOVE32rm:  case X86::CMOVE64rm:
     return X86::COND_E;
 
   case X86::CMOVG16rr:  case X86::CMOVG32rr:  case X86::CMOVG64rr:
   case X86::CMOVG16rm:  case X86::CMOVG32rm:  case X86::CMOVG64rm:
     return X86::COND_G;
 
   case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
   case X86::CMOVGE16rm: case X86::CMOVGE32rm: case X86::CMOVGE64rm:
     return X86::COND_GE;
 
   case X86::CMOVL16rr:  case X86::CMOVL32rr:  case X86::CMOVL64rr:
   case X86::CMOVL16rm:  case X86::CMOVL32rm:  case X86::CMOVL64rm:
     return X86::COND_L;
 
   case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
   case X86::CMOVLE16rm: case X86::CMOVLE32rm: case X86::CMOVLE64rm:
     return X86::COND_LE;
 
   case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
   case X86::CMOVNE16rm: case X86::CMOVNE32rm: case X86::CMOVNE64rm:
     return X86::COND_NE;
 
   case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr:
   case X86::CMOVNO16rm: case X86::CMOVNO32rm: case X86::CMOVNO64rm:
     return X86::COND_NO;
 
   case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
   case X86::CMOVNP16rm: case X86::CMOVNP32rm: case X86::CMOVNP64rm:
     return X86::COND_NP;
 
   case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
   case X86::CMOVNS16rm: case X86::CMOVNS32rm: case X86::CMOVNS64rm:
     return X86::COND_NS;
 
   case X86::CMOVO16rr:  case X86::CMOVO32rr:  case X86::CMOVO64rr:
   case X86::CMOVO16rm:  case X86::CMOVO32rm:  case X86::CMOVO64rm:
     return X86::COND_O;
 
   case X86::CMOVP16rr:  case X86::CMOVP32rr:  case X86::CMOVP64rr:
   case X86::CMOVP16rm:  case X86::CMOVP32rm:  case X86::CMOVP64rm:
     return X86::COND_P;
 
   case X86::CMOVS16rr:  case X86::CMOVS32rr:  case X86::CMOVS64rr:
   case X86::CMOVS16rm:  case X86::CMOVS32rm:  case X86::CMOVS64rm:
     return X86::COND_S;
 
   default:
     break;
   }
   return X86::COND_INVALID;
 }
 
 /// Return the conditional-jump opcode that branches on condition \p CC.
 /// Passing a condition outside the sixteen listed ones is a programming error.
 unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
   switch (CC) {
   // Equality.
   case X86::COND_E:
     return X86::JE_1;
   case X86::COND_NE:
     return X86::JNE_1;
   // Signed comparisons.
   case X86::COND_L:
     return X86::JL_1;
   case X86::COND_LE:
     return X86::JLE_1;
   case X86::COND_G:
     return X86::JG_1;
   case X86::COND_GE:
     return X86::JGE_1;
   // Unsigned comparisons.
   case X86::COND_B:
     return X86::JB_1;
   case X86::COND_BE:
     return X86::JBE_1;
   case X86::COND_A:
     return X86::JA_1;
   case X86::COND_AE:
     return X86::JAE_1;
   // Sign, parity and overflow flags.
   case X86::COND_S:
     return X86::JS_1;
   case X86::COND_NS:
     return X86::JNS_1;
   case X86::COND_P:
     return X86::JP_1;
   case X86::COND_NP:
     return X86::JNP_1;
   case X86::COND_O:
     return X86::JO_1;
   case X86::COND_NO:
     return X86::JNO_1;
   default:
     llvm_unreachable("Illegal condition code!");
   }
 }
 
 /// GetOppositeBranchCondition - Return the logical negation of the given
 /// condition code, e.g. COND_E becomes COND_NE and COND_L becomes COND_GE.
 /// Passing a condition outside the sixteen listed ones is a programming error.
 X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
   switch (CC) {
   // Equality.
   case X86::COND_E:
     return X86::COND_NE;
   case X86::COND_NE:
     return X86::COND_E;
   // Signed comparisons.
   case X86::COND_L:
     return X86::COND_GE;
   case X86::COND_GE:
     return X86::COND_L;
   case X86::COND_LE:
     return X86::COND_G;
   case X86::COND_G:
     return X86::COND_LE;
   // Unsigned comparisons.
   case X86::COND_B:
     return X86::COND_AE;
   case X86::COND_AE:
     return X86::COND_B;
   case X86::COND_BE:
     return X86::COND_A;
   case X86::COND_A:
     return X86::COND_BE;
   // Sign, parity and overflow flags.
   case X86::COND_S:
     return X86::COND_NS;
   case X86::COND_NS:
     return X86::COND_S;
   case X86::COND_P:
     return X86::COND_NP;
   case X86::COND_NP:
     return X86::COND_P;
   case X86::COND_O:
     return X86::COND_NO;
   case X86::COND_NO:
     return X86::COND_O;
   default:
     llvm_unreachable("Illegal condition code!");
   }
 }
 
 /// getSwappedCondition - Given a condition evaluated on flags produced by
 /// MI(a,b), return the condition that gives the same answer once the flags
 /// are produced by MI(b,a) instead.  Conditions that have no entry here
 /// yield COND_INVALID.
 static X86::CondCode getSwappedCondition(X86::CondCode CC) {
   switch (CC) {
   // Equality tests do not depend on operand order.
   case X86::COND_E:
   case X86::COND_NE:
     return CC;
   // Signed comparisons mirror.
   case X86::COND_L:
     return X86::COND_G;
   case X86::COND_G:
     return X86::COND_L;
   case X86::COND_LE:
     return X86::COND_GE;
   case X86::COND_GE:
     return X86::COND_LE;
   // Unsigned comparisons mirror.
   case X86::COND_B:
     return X86::COND_A;
   case X86::COND_A:
     return X86::COND_B;
   case X86::COND_BE:
     return X86::COND_AE;
   case X86::COND_AE:
     return X86::COND_BE;
   default:
     break;
   }
   return X86::COND_INVALID;
 }
 
 /// getSETFromCond - Look up the SETcc opcode for condition \p CC.  The memory
 /// form is returned when \p HasMemoryOperand is true, the register form
 /// otherwise.
 unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
   // One row per condition code, indexed by CC: { register form, memory form }.
   static const uint16_t SetOpcodes[16][2] = {
     { X86::SETAr,  X86::SETAm  },
     { X86::SETAEr, X86::SETAEm },
     { X86::SETBr,  X86::SETBm  },
     { X86::SETBEr, X86::SETBEm },
     { X86::SETEr,  X86::SETEm  },
     { X86::SETGr,  X86::SETGm  },
     { X86::SETGEr, X86::SETGEm },
     { X86::SETLr,  X86::SETLm  },
     { X86::SETLEr, X86::SETLEm },
     { X86::SETNEr, X86::SETNEm },
     { X86::SETNOr, X86::SETNOm },
     { X86::SETNPr, X86::SETNPm },
     { X86::SETNSr, X86::SETNSm },
     { X86::SETOr,  X86::SETOm  },
     { X86::SETPr,  X86::SETPm  },
     { X86::SETSr,  X86::SETSm  }
   };
 
   assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
 
   const uint16_t *Row = SetOpcodes[CC];
   if (HasMemoryOperand)
     return Row[1];
   return Row[0];
 }
 
 /// getCMovFromCond - Look up the CMOVcc opcode for condition \p CC.
 /// \p RegBytes is the register size in bytes (2, 4 or 8) and
 /// \p HasMemoryOperand selects the rm form instead of the rr form.
 unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
                               bool HasMemoryOperand) {
   // Rows 0-15 hold the rr forms indexed by condition code, rows 16-31 the rm
   // forms in the same order.  Columns are the 16-, 32- and 64-bit opcodes.
   static const uint16_t CMovOpcodes[32][3] = {
     { X86::CMOVA16rr,  X86::CMOVA32rr,  X86::CMOVA64rr  },
     { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
     { X86::CMOVB16rr,  X86::CMOVB32rr,  X86::CMOVB64rr  },
     { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
     { X86::CMOVE16rr,  X86::CMOVE32rr,  X86::CMOVE64rr  },
     { X86::CMOVG16rr,  X86::CMOVG32rr,  X86::CMOVG64rr  },
     { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
     { X86::CMOVL16rr,  X86::CMOVL32rr,  X86::CMOVL64rr  },
     { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
     { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
     { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
     { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
     { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
     { X86::CMOVO16rr,  X86::CMOVO32rr,  X86::CMOVO64rr  },
     { X86::CMOVP16rr,  X86::CMOVP32rr,  X86::CMOVP64rr  },
     { X86::CMOVS16rr,  X86::CMOVS32rr,  X86::CMOVS64rr  },
     { X86::CMOVA16rm,  X86::CMOVA32rm,  X86::CMOVA64rm  },
     { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
     { X86::CMOVB16rm,  X86::CMOVB32rm,  X86::CMOVB64rm  },
     { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
     { X86::CMOVE16rm,  X86::CMOVE32rm,  X86::CMOVE64rm  },
     { X86::CMOVG16rm,  X86::CMOVG32rm,  X86::CMOVG64rm  },
     { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
     { X86::CMOVL16rm,  X86::CMOVL32rm,  X86::CMOVL64rm  },
     { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
     { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
     { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
     { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
     { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
     { X86::CMOVO16rm,  X86::CMOVO32rm,  X86::CMOVO64rm  },
     { X86::CMOVP16rm,  X86::CMOVP32rm,  X86::CMOVP64rm  },
     { X86::CMOVS16rm,  X86::CMOVS32rm,  X86::CMOVS64rm  }
   };
 
   assert(CC < 16 && "Can only handle standard cond codes");
 
   // The memory forms live in the second half of the table.
   unsigned Row = CC;
   if (HasMemoryOperand)
     Row = 16+CC;
 
   // Translate the register size into a column.
   unsigned Column;
   switch (RegBytes) {
   case 2:
     Column = 0;
     break;
   case 4:
     Column = 1;
     break;
   case 8:
     Column = 2;
     break;
   default:
     llvm_unreachable("Illegal register size!");
   }
   return CMovOpcodes[Row][Column];
 }
 
 /// Return true if \p MI is a terminator that is a non-barrier branch, is not
 /// predicable, or is predicable but not currently predicated.
 /// Non-terminators always yield false.
 bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
   if (!MI->isTerminator())
     return false;
 
   // Conditional branch is a special case: a branch that is not a barrier
   // counts as unpredicated without consulting the predication queries.
   const bool IsNonBarrierBranch = MI->isBranch() && !MI->isBarrier();
 
   return IsNonBarrierBranch || !MI->isPredicable() || !isPredicated(MI);
 }
 
 /// AnalyzeBranch - Examine the terminators at the end of MBB, working from
 /// the bottom up, and describe them through TBB, FBB and Cond.
 ///
 /// Returns false when every terminator that was visited was understood, and
 /// true as soon as something this analysis does not handle is found: a
 /// terminator that is not a branch, a branch whose opcode has no condition
 /// code, or a second conditional branch that does not fit one of the
 /// recognized two-branch idioms.
 ///
 /// Outputs are only written for the branches actually seen:
 ///  - an unconditional JMP_1 stores its destination in TBB;
 ///  - the first conditional branch (from the bottom) moves the current TBB
 ///    into FBB, stores its own destination in TBB and pushes its condition
 ///    code onto Cond as an immediate operand;
 ///  - a further conditional branch to the same TBB may replace Cond[0] with
 ///    COND_NP_OR_E or COND_NE_OR_P.
 ///
 /// When AllowModify is true the block may be edited during the walk:
 /// instructions following a JMP_1 are erased, a JMP_1 to the layout
 /// successor is erased, and a "jCC L1; jmp L2" pair whose L1 is the layout
 /// successor is rewritten with the opposite condition.
 bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                  MachineBasicBlock *&TBB,
                                  MachineBasicBlock *&FBB,
                                  SmallVectorImpl<MachineOperand> &Cond,
                                  bool AllowModify) const {
   // Start from the bottom of the block and work up, examining the
   // terminator instructions.
   MachineBasicBlock::iterator I = MBB.end();
   // Position of the unconditional JMP_1 seen so far; MBB.end() means "none".
   MachineBasicBlock::iterator UnCondBrIter = MBB.end();
   while (I != MBB.begin()) {
     --I;
     // Debug values are skipped without affecting the analysis.
     if (I->isDebugValue())
       continue;
 
     // Working from the bottom, when we see a non-terminator instruction, we're
     // done.
     if (!isUnpredicatedTerminator(I))
       break;
 
     // A terminator that isn't a branch can't easily be handled by this
     // analysis.
     if (!I->isBranch())
       return true;
 
     // Handle unconditional branches.
     if (I->getOpcode() == X86::JMP_1) {
       UnCondBrIter = I;
 
       // Read-only mode: just record the destination and keep scanning upward.
       if (!AllowModify) {
         TBB = I->getOperand(0).getMBB();
         continue;
       }
 
       // If the block has any instructions after a JMP, delete them.
       while (std::next(I) != MBB.end())
         std::next(I)->eraseFromParent();
 
       // Everything recorded from the erased instructions is now stale.
       Cond.clear();
       FBB = nullptr;
 
       // Delete the JMP if it's equivalent to a fall-through.
       if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
         TBB = nullptr;
         I->eraseFromParent();
         // I was erased; restart the scan from the end of the block.
         I = MBB.end();
         UnCondBrIter = MBB.end();
         continue;
       }
 
       // TBB is used to indicate the unconditional destination.
       TBB = I->getOperand(0).getMBB();
       continue;
     }
 
     // Handle conditional branches.
     X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
     if (BranchCode == X86::COND_INVALID)
       return true;  // Can't handle indirect branch.
 
     // Working from the bottom, handle the first conditional branch.
     if (Cond.empty()) {
       MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
       if (AllowModify && UnCondBrIter != MBB.end() &&
           MBB.isLayoutSuccessor(TargetBB)) {
         // If we can modify the code and it ends in something like:
         //
         //     jCC L1
         //     jmp L2
         //   L1:
         //     ...
         //   L2:
         //
         // Then we can change this to:
         //
         //     jnCC L2
         //   L1:
         //     ...
         //   L2:
         //
         // Which is a bit more efficient.
         // We conditionally jump to the fall-through block.
         BranchCode = GetOppositeBranchCondition(BranchCode);
         unsigned JNCC = GetCondBranchFromCond(BranchCode);
         MachineBasicBlock::iterator OldInst = I;
 
         // Insert the replacement pair in front of the old JMP: the inverted
         // conditional branch to the old JMP's destination, followed by a JMP
         // to the layout successor. The restarted analysis below is expected
         // to erase that JMP through the fall-through check.
         BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
           .addMBB(UnCondBrIter->getOperand(0).getMBB());
         BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
           .addMBB(TargetBB);
 
         OldInst->eraseFromParent();
         UnCondBrIter->eraseFromParent();
 
         // Restart the analysis.
         UnCondBrIter = MBB.end();
         I = MBB.end();
         continue;
       }
 
       // Whatever destination was recorded so far becomes the false target;
       // this branch supplies the true target and the condition.
       FBB = TBB;
       TBB = I->getOperand(0).getMBB();
       Cond.push_back(MachineOperand::CreateImm(BranchCode));
       continue;
     }
 
     // Handle subsequent conditional branches. Only handle the case where all
     // conditional branches branch to the same destination and their condition
     // opcodes fit one of the special multi-branch idioms.
     assert(Cond.size() == 1);
     assert(TBB);
 
     // Only handle the case where all conditional branches branch to the same
     // destination.
     if (TBB != I->getOperand(0).getMBB())
       return true;
 
     // If the conditions are the same, we can leave them alone.
     X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
     if (OldBranchCode == BranchCode)
       continue;
 
     // If they differ, see if they fit one of the known patterns. Theoretically,
     // we could handle more patterns here, but we shouldn't expect to see them
     // if instruction selection has done a reasonable job.
     if ((OldBranchCode == X86::COND_NP &&
          BranchCode == X86::COND_E) ||
         (OldBranchCode == X86::COND_E &&
          BranchCode == X86::COND_NP))
       BranchCode = X86::COND_NP_OR_E;
     else if ((OldBranchCode == X86::COND_P &&
               BranchCode == X86::COND_NE) ||
              (OldBranchCode == X86::COND_NE &&
               BranchCode == X86::COND_P))
       BranchCode = X86::COND_NE_OR_P;
     else
       return true;
 
     // Update the MachineOperand.
     Cond[0].setImm(BranchCode);
   }
 
   return false;
 }
 
 /// RemoveBranch - Erase the JMP_1 and conditional-branch instructions found
 /// at the end of MBB, skipping debug values, and return how many were erased.
 /// The scan stops at the first instruction that is neither.
 unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
   unsigned NumRemoved = 0;
   MachineBasicBlock::iterator Pos = MBB.end();
 
   while (Pos != MBB.begin()) {
     --Pos;
     if (Pos->isDebugValue())
       continue;
 
     // Only JMP_1 and opcodes with a known condition code are removed.
     const unsigned Opcode = Pos->getOpcode();
     const bool IsRemovableBranch =
         Opcode == X86::JMP_1 ||
         getCondFromBranchOpc(Opcode) != X86::COND_INVALID;
     if (!IsRemovableBranch)
       break;
 
     // The iterator is dead after the erase, so rescan from the block end.
     Pos->eraseFromParent();
     Pos = MBB.end();
     ++NumRemoved;
   }
 
   return NumRemoved;
 }
 
 /// InsertBranch - Append to MBB the branch sequence described by TBB, FBB
 /// and Cond, and return the number of instructions that were added.
 ///
 /// With an empty Cond a single JMP_1 to TBB is emitted. Otherwise Cond[0]
 /// holds an X86::CondCode: the two composite codes are emitted as two
 /// conditional branches to TBB, any other code as one. A JMP_1 to FBB follows
 /// when FBB is non-null.
 unsigned
 X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                            MachineBasicBlock *FBB,
                            const SmallVectorImpl<MachineOperand> &Cond,
                            DebugLoc DL) const {
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
   assert((Cond.size() == 1 || Cond.size() == 0) &&
          "X86 branch conditions have one component!");
 
   // No condition at all: plain unconditional jump.
   if (Cond.empty()) {
     assert(!FBB && "Unconditional branch with multiple successors!");
     BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
     return 1;
   }
 
   unsigned NumInserted = 0;
   const X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
   if (CC == X86::COND_NP_OR_E) {
     // NP_OR_E is synthesized as JNP followed by JE, both to TBB.
     BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
     BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB);
     NumInserted += 2;
   } else if (CC == X86::COND_NE_OR_P) {
     // NE_OR_P is synthesized as JNE followed by JP, both to TBB.
     BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
     BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
     NumInserted += 2;
   } else {
     const unsigned BranchOpc = GetCondBranchFromCond(CC);
     BuildMI(&MBB, DL, get(BranchOpc)).addMBB(TBB);
     NumInserted += 1;
   }
 
   // Two-way conditional branch: add the jump to the false destination.
   if (FBB) {
     BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
     NumInserted += 1;
   }
   return NumInserted;
 }
 
 bool X86InstrInfo::
 canInsertSelect(const MachineBasicBlock &MBB,
                 const SmallVectorImpl<MachineOperand> &Cond,
                 unsigned TrueReg, unsigned FalseReg,
                 int &CondCycles, int &TrueCycles, int &FalseCycles) const {
   // Not all subtargets have cmov instructions.
   if (!Subtarget.hasCMov())
     return false;
   if (Cond.size() != 1)
     return false;
   // We cannot do the composite conditions, at least not in SSA form.
   if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
     return false;
 
   // Check register classes.
   const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   const TargetRegisterClass *RC =
     RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
   if (!RC)
     return false;
 
   // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
   if (X86::GR16RegClass.hasSubClassEq(RC) ||
       X86::GR32RegClass.hasSubClassEq(RC) ||
       X86::GR64RegClass.hasSubClassEq(RC)) {
     // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
     // Bridge. Probably Ivy Bridge as well.
     CondCycles = 2;
     TrueCycles = 2;
     FalseCycles = 2;
     return true;
   }
 
   // Can't do vectors.
   return false;
 }
 
 /// insertSelect - Emit, before I, a register-register cmov that defines
 /// DstReg. The opcode comes from the condition code in Cond[0] and the size
 /// of DstReg's register class; FalseReg is added as the first source operand
 /// and TrueReg as the second.
 void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I, DebugLoc DL,
                                 unsigned DstReg,
                                 const SmallVectorImpl<MachineOperand> &Cond,
                                 unsigned TrueReg, unsigned FalseReg) const {
   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   assert(Cond.size() == 1 && "Invalid Cond array");
 
   const X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
   const unsigned RegBytes = MRI.getRegClass(DstReg)->getSize();
   const unsigned CMovOpc =
       getCMovFromCond(CC, RegBytes, /*HasMemoryOperand=*/false);
 
   BuildMI(MBB, I, DL, get(CMovOpc), DstReg)
     .addReg(FalseReg)
     .addReg(TrueReg);
 }
 
 /// isHReg - Return true when Reg is a physical h register, i.e. a member of
 /// the GR8_ABCD_H register class.
 static bool isHReg(unsigned Reg) {
   const bool InHClass = X86::GR8_ABCD_HRegClass.contains(Reg);
   return InHClass;
 }
 
 /// CopyToFromAsymmetricReg - Select a move opcode for a copy between a
 /// general purpose register and a vector, MMX or scalar-FP register.
 ///
 /// Handled register pairs:
 ///   SrcReg(VR128X) -> DestReg(GR64)
 ///   SrcReg(VR64)   -> DestReg(GR64)
 ///   SrcReg(GR64)   -> DestReg(VR128X)
 ///   SrcReg(GR64)   -> DestReg(VR64)
 ///   SrcReg(FR32X)  -> DestReg(GR32)
 ///   SrcReg(GR32)   -> DestReg(FR32X)
 ///
 /// For the VR128X and FR32X pairs the AVX-512 form is preferred over the AVX
 /// form, which is preferred over the SSE form, according to Subtarget.
 ///
 /// Returns the selected opcode, or 0 when the pair is not listed above.
 static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
                                         const X86Subtarget &Subtarget) {
   const bool HasAVX = Subtarget.hasAVX();
   const bool HasAVX512 = Subtarget.hasAVX512();
 
   if (X86::GR64RegClass.contains(DestReg)) {
     if (X86::VR128XRegClass.contains(SrcReg))
       // Copy from a VR128 register to a GR64 register.
       return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr :
                                                X86::MOVPQIto64rr);
     if (X86::VR64RegClass.contains(SrcReg))
       // Copy from a VR64 register to a GR64 register. VR64 holds MMX
       // registers, so the MMX move must be used; the SSE MOVSDto64rr form
       // takes an XMM-based FR64 source and cannot name an MMX register.
       return X86::MMX_MOVD64from64rr;
   } else if (X86::GR64RegClass.contains(SrcReg)) {
     // Copy from a GR64 register to a VR128 register.
     if (X86::VR128XRegClass.contains(DestReg))
       return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr :
                                                X86::MOV64toPQIrr);
     // Copy from a GR64 register to a VR64 register. As above, the MMX move
     // is required; MOV64toSDrr defines an XMM-based FR64 register.
     if (X86::VR64RegClass.contains(DestReg))
       return X86::MMX_MOVD64to64rr;
   }
 
   // SrcReg(FR32) -> DestReg(GR32)
   // SrcReg(GR32) -> DestReg(FR32)
 
   if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg))
     // Copy from a FR32 register to a GR32 register.
     return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr);
 
   if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
     // Copy from a GR32 register to a FR32 register.
     return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr);
 
   // No asymmetric copy is known for this pair.
   return 0;
 }
 
 /// MaskRegClassContains - Return true when Reg belongs to any of the VK8,
 /// VK16, VK32, VK64 or VK1 register classes.
 inline static bool MaskRegClassContains(unsigned Reg) {
   if (X86::VK8RegClass.contains(Reg))
     return true;
   if (X86::VK16RegClass.contains(Reg))
     return true;
   if (X86::VK32RegClass.contains(Reg))
     return true;
   if (X86::VK64RegClass.contains(Reg))
     return true;
   return X86::VK1RegClass.contains(Reg);
 }
 /// copyPhysRegOpcode_AVX512 - Choose the copy opcode for a register pair on
 /// an AVX-512 subtarget. DestReg and SrcReg are taken by reference and may be
 /// rewritten:
 ///  - both in VR128X, VR256X or VR512: each is replaced by its 512-bit
 ///    super-register and VMOVAPSZrr is returned;
 ///  - both mask registers: KMOVWkk;
 ///  - mask destination, GR8/GR16/GR32 source: the source is replaced by its
 ///    i32 sub/super register and KMOVWkr is returned;
 ///  - GR8/GR16/GR32 destination, mask source: the destination is replaced by
 ///    its i32 sub/super register and KMOVWrk is returned.
 /// Returns 0, leaving both registers unchanged, for any other pair.
 static
 unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
   const bool BothVector = X86::VR128XRegClass.contains(DestReg, SrcReg) ||
                           X86::VR256XRegClass.contains(DestReg, SrcReg) ||
                           X86::VR512RegClass.contains(DestReg, SrcReg);
   if (BothVector) {
     DestReg = get512BitSuperRegister(DestReg);
     SrcReg = get512BitSuperRegister(SrcReg);
     return X86::VMOVAPSZrr;
   }
 
   const bool DestIsMask = MaskRegClassContains(DestReg);
   const bool SrcIsMask = MaskRegClassContains(SrcReg);
   if (DestIsMask && SrcIsMask)
     return X86::KMOVWkk;
 
   if (DestIsMask) {
     const bool SrcIsGPR = X86::GR32RegClass.contains(SrcReg) ||
                           X86::GR16RegClass.contains(SrcReg) ||
                           X86::GR8RegClass.contains(SrcReg);
     if (SrcIsGPR) {
       SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32);
       return X86::KMOVWkr;
     }
   }
 
   if (SrcIsMask) {
     const bool DestIsGPR = X86::GR32RegClass.contains(DestReg) ||
                            X86::GR16RegClass.contains(DestReg) ||
                            X86::GR8RegClass.contains(DestReg);
     if (DestIsGPR) {
       DestReg = getX86SubSuperRegister(DestReg, MVT::i32);
       return X86::KMOVWrk;
     }
   }
 
   return 0;
 }
 
 /// copyPhysReg - Insert before MI the instruction(s) that copy SrcReg into
 /// DestReg.
 ///
 /// An opcode is first looked up from the register class that contains both
 /// registers, then from CopyToFromAsymmetricReg. If neither yields one,
 /// copies between EFLAGS and a GR64/GR32 register are emitted as a push/pop
 /// pair. KillSrc is forwarded as the kill state of the source operand.
 /// Reaches llvm_unreachable when no copy sequence is known for the pair.
 void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator MI, DebugLoc DL,
                                unsigned DestReg, unsigned SrcReg,
                                bool KillSrc) const {
   // First deal with the normal symmetric copies.
   bool HasAVX = Subtarget.hasAVX();
   bool HasAVX512 = Subtarget.hasAVX512();
   unsigned Opc = 0;
   if (X86::GR64RegClass.contains(DestReg, SrcReg))
     Opc = X86::MOV64rr;
   else if (X86::GR32RegClass.contains(DestReg, SrcReg))
     Opc = X86::MOV32rr;
   else if (X86::GR16RegClass.contains(DestReg, SrcReg))
     Opc = X86::MOV16rr;
   else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
     // Copying to or from a physical H register on x86-64 requires a NOREX
     // move.  Otherwise use a normal move.
     if ((isHReg(DestReg) || isHReg(SrcReg)) &&
         Subtarget.is64Bit()) {
       Opc = X86::MOV8rr_NOREX;
       // Both operands must be encodable without an REX prefix.
       assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
              "8-bit H register can not be copied outside GR8_NOREX");
     } else
       Opc = X86::MOV8rr;
   }
   else if (X86::VR64RegClass.contains(DestReg, SrcReg))
     Opc = X86::MMX_MOVQ64rr;
   // With AVX-512 the helper decides; it takes DestReg/SrcReg by reference and
   // may replace them. Because this is an else-if chain, the VR128/VR256 arms
   // below are only considered when AVX-512 is not available.
   else if (HasAVX512)
     Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg);
   else if (X86::VR128RegClass.contains(DestReg, SrcReg))
     Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
   else if (X86::VR256RegClass.contains(DestReg, SrcReg))
     Opc = X86::VMOVAPSYrr;
   // No symmetric opcode: try the GPR <-> vector/FP copies.
   if (!Opc)
     Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
 
   // A single move instruction is enough.
   if (Opc) {
     BuildMI(MBB, MI, DL, get(Opc), DestReg)
       .addReg(SrcReg, getKillRegState(KillSrc));
     return;
   }
 
   // Moving EFLAGS to / from another register requires a push and a pop.
   // Notice that we have to adjust the stack if we don't want to clobber the
   // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
   if (SrcReg == X86::EFLAGS) {
     // EFLAGS -> GPR: push the flags, pop them into the destination.
     if (X86::GR64RegClass.contains(DestReg)) {
       BuildMI(MBB, MI, DL, get(X86::PUSHF64));
       BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
       return;
     }
     if (X86::GR32RegClass.contains(DestReg)) {
       BuildMI(MBB, MI, DL, get(X86::PUSHF32));
       BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
       return;
     }
   }
   if (DestReg == X86::EFLAGS) {
     // GPR -> EFLAGS: push the source, pop it into the flags.
     if (X86::GR64RegClass.contains(SrcReg)) {
       BuildMI(MBB, MI, DL, get(X86::PUSH64r))
         .addReg(SrcReg, getKillRegState(KillSrc));
       BuildMI(MBB, MI, DL, get(X86::POPF64));
       return;
     }
     if (X86::GR32RegClass.contains(SrcReg)) {
       BuildMI(MBB, MI, DL, get(X86::PUSH32r))
         .addReg(SrcReg, getKillRegState(KillSrc));
       BuildMI(MBB, MI, DL, get(X86::POPF32));
       return;
     }
   }
 
   // Nothing matched: report the pair in debug output and abort.
   DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
                << " to " << RI.getName(DestReg) << '\n');
   llvm_unreachable("Cannot emit physreg copy instruction");
 }
 
 /// getLoadStoreRegOpcode - Pick the opcode used to load (load == true) or
 /// store (load == false) a register of class RC from or to memory.
 ///
 /// \param Reg            only consulted for 1-byte classes, to detect h
 ///                       registers.
 /// \param RC             register class; its size in bytes selects the case.
 /// \param isStackAligned selects the aligned rather than the unaligned move
 ///                       for 16, 32 and 64 byte classes.
 /// \param STI            subtarget queried for AVX, AVX-512 and 64-bit mode.
 /// \param load           true for the load opcode, false for the store.
 ///
 /// Reaches llvm_unreachable for a size or class that is not listed.
 static unsigned getLoadStoreRegOpcode(unsigned Reg,
                                       const TargetRegisterClass *RC,
                                       bool isStackAligned,
                                       const X86Subtarget &STI,
                                       bool load) {
   // AVX-512 specific classes are handled first and bypass the size switch.
   if (STI.hasAVX512()) {
     if (X86::VK8RegClass.hasSubClassEq(RC)  ||
       X86::VK16RegClass.hasSubClassEq(RC))
       return load ? X86::KMOVWkm : X86::KMOVWmk;
     if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
       return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
     if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
       return load ? X86::VMOVSDZrm : X86::VMOVSDZmr;
     // NOTE(review): this returns the unaligned form without consulting
     // isStackAligned, so with AVX-512 the aligned choice in `case 64` below
     // is never reached for VR512 classes. Confirm this is intended.
     if (X86::VR512RegClass.hasSubClassEq(RC))
       return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
   }
 
   bool HasAVX = STI.hasAVX();
   switch (RC->getSize()) {
   default:
     llvm_unreachable("Unknown spill size");
   case 1:
     assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
     if (STI.is64Bit())
       // Copying to or from a physical H register on x86-64 requires a NOREX
       // move.  Otherwise use a normal move.
       if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
         return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
     return load ? X86::MOV8rm : X86::MOV8mr;
   case 2:
     assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
     return load ? X86::MOV16rm : X86::MOV16mr;
   case 4:
     // 4 bytes: general purpose, scalar single (SSE or AVX), or x87 RFP32.
     if (X86::GR32RegClass.hasSubClassEq(RC))
       return load ? X86::MOV32rm : X86::MOV32mr;
     if (X86::FR32RegClass.hasSubClassEq(RC))
       return load ?
         (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
         (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
     if (X86::RFP32RegClass.hasSubClassEq(RC))
       return load ? X86::LD_Fp32m : X86::ST_Fp32m;
     llvm_unreachable("Unknown 4-byte regclass");
   case 8:
     // 8 bytes: general purpose, scalar double (SSE or AVX), MMX, or RFP64.
     if (X86::GR64RegClass.hasSubClassEq(RC))
       return load ? X86::MOV64rm : X86::MOV64mr;
     if (X86::FR64RegClass.hasSubClassEq(RC))
       return load ?
         (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
         (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
     if (X86::VR64RegClass.hasSubClassEq(RC))
       return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
     if (X86::RFP64RegClass.hasSubClassEq(RC))
       return load ? X86::LD_Fp64m : X86::ST_Fp64m;
     llvm_unreachable("Unknown 8-byte regclass");
   case 10:
     assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
     return load ? X86::LD_Fp80m : X86::ST_FpP80m;
   case 16: {
     assert((X86::VR128RegClass.hasSubClassEq(RC) ||
             X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass");
     // If stack is realigned we can use aligned stores.
     if (isStackAligned)
       return load ?
         (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) :
         (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr);
     else
       return load ?
         (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) :
         (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
   }
   case 32:
     assert((X86::VR256RegClass.hasSubClassEq(RC) ||
             X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass");
     // If stack is realigned we can use aligned stores.
     if (isStackAligned)
       return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
     else
       return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
   case 64:
     assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
     // Aligned form when the slot is known to be aligned, unaligned otherwise.
     if (isStackAligned)
       return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
     else
       return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
   }
 }
 
 /// getStoreRegOpcode - Return the opcode that stores SrcReg, of class RC, to
 /// memory. Forwards to getLoadStoreRegOpcode with the load flag cleared.
 static unsigned getStoreRegOpcode(unsigned SrcReg,
                                   const TargetRegisterClass *RC,
                                   bool isStackAligned,
                                   const X86Subtarget &STI) {
   const bool IsLoad = false;
   return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, IsLoad);
 }
 
 
 /// getLoadRegOpcode - Return the opcode that loads DestReg, of class RC, from
 /// memory. Forwards to getLoadStoreRegOpcode with the load flag set.
 static unsigned getLoadRegOpcode(unsigned DestReg,
                                  const TargetRegisterClass *RC,
                                  bool isStackAligned,
                                  const X86Subtarget &STI) {
   const bool IsLoad = true;
   return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, IsLoad);
 }
 
 /// storeRegToStackSlot - Insert before MI a store of SrcReg (class RC) into
 /// the stack slot FrameIdx. The slot is treated as aligned when the target's
 /// stack alignment is at least max(size of RC, 16) or the stack can be
 /// realigned for this function. isKill becomes the kill state of SrcReg. TRI
 /// is not used here.
 void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator MI,
                                        unsigned SrcReg, bool isKill, int FrameIdx,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
   const MachineFunction &MF = *MBB.getParent();
   assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
          "Stack slot too small for store");
 
   const unsigned RequiredAlign = std::max<uint32_t>(RC->getSize(), 16);
   const unsigned StackAlign = MF.getTarget()
                                   .getSubtargetImpl()
                                   ->getFrameLowering()
                                   ->getStackAlignment();
   // canRealignStack is only asked when the static alignment is too small.
   const bool SlotIsAligned =
       StackAlign >= RequiredAlign || RI.canRealignStack(MF);
 
   const unsigned StoreOpc =
       getStoreRegOpcode(SrcReg, RC, SlotIsAligned, Subtarget);
   DebugLoc DL = MBB.findDebugLoc(MI);
   MachineInstrBuilder Store = BuildMI(MBB, MI, DL, get(StoreOpc));
   addFrameReference(Store, FrameIdx)
     .addReg(SrcReg, getKillRegState(isKill));
 }
 
 /// storeRegToAddr - Build, without inserting it into a block, a store of
 /// SrcReg (class RC) to the address given by the operands in Addr, and append
 /// it to NewMIs. The store is treated as aligned only when at least one
 /// memory operand is supplied and the first one has an alignment of at least
 /// max(size of RC, 16). The range [MMOBegin, MMOEnd) is attached as the
 /// instruction's memory references.
 void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
                                   bool isKill,
                                   SmallVectorImpl<MachineOperand> &Addr,
                                   const TargetRegisterClass *RC,
                                   MachineInstr::mmo_iterator MMOBegin,
                                   MachineInstr::mmo_iterator MMOEnd,
                                   SmallVectorImpl<MachineInstr*> &NewMIs) const {
   const unsigned RequiredAlign = std::max<uint32_t>(RC->getSize(), 16);
   bool AddrIsAligned = false;
   if (MMOBegin != MMOEnd)
     AddrIsAligned = (*MMOBegin)->getAlignment() >= RequiredAlign;
 
   const unsigned StoreOpc =
       getStoreRegOpcode(SrcReg, RC, AddrIsAligned, Subtarget);
   DebugLoc DL;
   MachineInstrBuilder MIB = BuildMI(MF, DL, get(StoreOpc));
 
   // Address operands come first, then the value being stored.
   for (const MachineOperand &AddrOp : Addr)
     MIB.addOperand(AddrOp);
   MIB.addReg(SrcReg, getKillRegState(isKill));
 
   MIB->setMemRefs(MMOBegin, MMOEnd);
   NewMIs.push_back(MIB);
 }
 
 
 /// loadRegFromStackSlot - Insert before MI a load of DestReg (class RC) from
 /// the stack slot FrameIdx. The slot is treated as aligned when the target's
 /// stack alignment is at least max(size of RC, 16) or the stack can be
 /// realigned for this function. TRI is not used here.
 void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MI,
                                         unsigned DestReg, int FrameIdx,
                                         const TargetRegisterClass *RC,
                                         const TargetRegisterInfo *TRI) const {
   const MachineFunction &MF = *MBB.getParent();
 
   const unsigned RequiredAlign = std::max<uint32_t>(RC->getSize(), 16);
   const unsigned StackAlign = MF.getTarget()
                                   .getSubtargetImpl()
                                   ->getFrameLowering()
                                   ->getStackAlignment();
   // canRealignStack is only asked when the static alignment is too small.
   const bool SlotIsAligned =
       StackAlign >= RequiredAlign || RI.canRealignStack(MF);
 
   const unsigned LoadOpc =
       getLoadRegOpcode(DestReg, RC, SlotIsAligned, Subtarget);
   DebugLoc DL = MBB.findDebugLoc(MI);
   MachineInstrBuilder Load = BuildMI(MBB, MI, DL, get(LoadOpc), DestReg);
   addFrameReference(Load, FrameIdx);
 }
 
 /// loadRegFromAddr - Build, without inserting it into a block, a load of
 /// DestReg (class RC) from the address given by the operands in Addr, and
 /// append it to NewMIs. The load is treated as aligned only when at least one
 /// memory operand is supplied and the first one has an alignment of at least
 /// max(size of RC, 16). The range [MMOBegin, MMOEnd) is attached as the
 /// instruction's memory references.
 void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
                                  SmallVectorImpl<MachineOperand> &Addr,
                                  const TargetRegisterClass *RC,
                                  MachineInstr::mmo_iterator MMOBegin,
                                  MachineInstr::mmo_iterator MMOEnd,
                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
   const unsigned RequiredAlign = std::max<uint32_t>(RC->getSize(), 16);
   bool AddrIsAligned = false;
   if (MMOBegin != MMOEnd)
     AddrIsAligned = (*MMOBegin)->getAlignment() >= RequiredAlign;
 
   const unsigned LoadOpc =
       getLoadRegOpcode(DestReg, RC, AddrIsAligned, Subtarget);
   DebugLoc DL;
   MachineInstrBuilder MIB = BuildMI(MF, DL, get(LoadOpc), DestReg);
 
   for (const MachineOperand &AddrOp : Addr)
     MIB.addOperand(AddrOp);
 
   MIB->setMemRefs(MMOBegin, MMOEnd);
   NewMIs.push_back(MIB);
 }
 
 /// analyzeCompare - Decompose MI, when it is a recognized compare-like
 /// instruction, into its source register(s) and immediate.
 ///
 /// Recognized forms are CMP and SUB with register, immediate or (SUB only)
 /// memory second operands, and TEST of a register against itself. On success
 /// SrcReg and SrcReg2 receive the register operands (SrcReg2 is 0 when there
 /// is no second register), CmpValue receives the immediate (0 when there is
 /// none), CmpMask is set to ~0, and true is returned.
 ///
 /// For an unrecognized opcode false is returned and no output is written. A
 /// TEST of two different registers also returns false, but SrcReg has
 /// already been written by then.
 bool X86InstrInfo::
 analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2,
                int &CmpMask, int &CmpValue) const {
   switch (MI->getOpcode()) {
   default:
     return false;
 
   // CMP reg, imm: register is operand 0, immediate is operand 1.
   case X86::CMP64ri32: case X86::CMP64ri8:
   case X86::CMP32ri:   case X86::CMP32ri8:
   case X86::CMP16ri:   case X86::CMP16ri8:
   case X86::CMP8ri:
     SrcReg = MI->getOperand(0).getReg();
     SrcReg2 = 0;
     CmpValue = MI->getOperand(1).getImm();
     break;
 
   // A SUB can be used to perform comparison.
   // SUB reg, mem: only the register source (operand 1) is reported.
   case X86::SUB64rm: case X86::SUB32rm:
   case X86::SUB16rm: case X86::SUB8rm:
     SrcReg = MI->getOperand(1).getReg();
     SrcReg2 = 0;
     CmpValue = 0;
     break;
 
   // SUB reg, reg: sources are operands 1 and 2.
   case X86::SUB64rr: case X86::SUB32rr:
   case X86::SUB16rr: case X86::SUB8rr:
     SrcReg = MI->getOperand(1).getReg();
     SrcReg2 = MI->getOperand(2).getReg();
     CmpValue = 0;
     break;
 
   // SUB reg, imm: register is operand 1, immediate is operand 2.
   case X86::SUB64ri32: case X86::SUB64ri8:
   case X86::SUB32ri:   case X86::SUB32ri8:
   case X86::SUB16ri:   case X86::SUB16ri8:
   case X86::SUB8ri:
     SrcReg = MI->getOperand(1).getReg();
     SrcReg2 = 0;
     CmpValue = MI->getOperand(2).getImm();
     break;
 
   // CMP reg, reg: sources are operands 0 and 1.
   case X86::CMP64rr: case X86::CMP32rr:
   case X86::CMP16rr: case X86::CMP8rr:
     SrcReg = MI->getOperand(0).getReg();
     SrcReg2 = MI->getOperand(1).getReg();
     CmpValue = 0;
     break;
 
   // TEST reg, reg: only a register tested against itself is accepted, and it
   // is reported as a compare against zero.
   case X86::TEST8rr:  case X86::TEST16rr:
   case X86::TEST32rr: case X86::TEST64rr:
     SrcReg = MI->getOperand(0).getReg();
     if (MI->getOperand(1).getReg() != SrcReg)
       return false;
     SrcReg2 = 0;
     CmpValue = 0;
     break;
   }
 
   // Every recognized form reports the full mask.
   CmpMask = ~0;
   return true;
 }
 
 /// isRedundantFlagInstr - Check whether FlagI, an instruction whose only
 /// purpose is to update flags, is made redundant by the other instruction OI.
 ///
 /// A CMPrr is redundant against the SUBrr of the same width when OI's two
 /// register sources are SrcReg and SrcReg2 in either order. A CMPri is
 /// redundant against the SUBri of the same width and immediate encoding when
 /// OI's register source is SrcReg and its immediate equals ImmValue.
 ///
 /// SrcReg, SrcReg2: register operands for FlagI.
 /// ImmValue: immediate for FlagI if it takes an immediate.
 inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
                                         unsigned SrcReg2, int ImmValue,
                                         MachineInstr *OI) {
   // Map the compare opcode to the one SUB opcode that can subsume it.
   unsigned MatchingSub = 0;
   bool IsRegReg = false;
   const unsigned FlagOpc = FlagI->getOpcode();
   switch (FlagOpc) {
   default:
     return false;
   case X86::CMP64rr:   MatchingSub = X86::SUB64rr;   IsRegReg = true; break;
   case X86::CMP32rr:   MatchingSub = X86::SUB32rr;   IsRegReg = true; break;
   case X86::CMP16rr:   MatchingSub = X86::SUB16rr;   IsRegReg = true; break;
   case X86::CMP8rr:    MatchingSub = X86::SUB8rr;    IsRegReg = true; break;
   case X86::CMP64ri32: MatchingSub = X86::SUB64ri32; break;
   case X86::CMP64ri8:  MatchingSub = X86::SUB64ri8;  break;
   case X86::CMP32ri:   MatchingSub = X86::SUB32ri;   break;
   case X86::CMP32ri8:  MatchingSub = X86::SUB32ri8;  break;
   case X86::CMP16ri:   MatchingSub = X86::SUB16ri;   break;
   case X86::CMP16ri8:  MatchingSub = X86::SUB16ri8;  break;
   case X86::CMP8ri:    MatchingSub = X86::SUB8ri;    break;
   }
 
   const unsigned OtherOpc = OI->getOpcode();
   if (OtherOpc != MatchingSub)
     return false;
 
   if (IsRegReg) {
     // Register sources of the SUB may appear in either order.
     const unsigned SubLHS = OI->getOperand(1).getReg();
     const unsigned SubRHS = OI->getOperand(2).getReg();
     const bool SameOrder = SubLHS == SrcReg && SubRHS == SrcReg2;
     const bool Swapped = SubLHS == SrcReg2 && SubRHS == SrcReg;
     return SameOrder || Swapped;
   }
 
   // Immediate form: register and immediate must both match.
   return OI->getOperand(1).getReg() == SrcReg &&
          OI->getOperand(2).getImm() == ImmValue;
 }
 
 /// isDefConvertible - check whether the definition can be converted
 /// to remove a comparison against zero.
 ///
 /// Returns true for the opcodes listed below, subject to extra conditions on
 /// the immediate shift forms: their truncated shift count must be non-zero,
 /// and left shifts whose count qualifies for an LEA are rejected. Any opcode
 /// that is not listed yields false.
 inline static bool isDefConvertible(MachineInstr *MI) {
   switch (MI->getOpcode()) {
   default: return false;
 
   // The shift instructions only modify ZF if their shift count is non-zero.
   // N.B.: The processor truncates the shift count depending on the encoding.
   // (The second argument is presumably the index of the shift-count operand;
   // confirm against getTruncatedShiftCount.)
   case X86::SAR8ri:    case X86::SAR16ri:  case X86::SAR32ri:case X86::SAR64ri:
   case X86::SHR8ri:    case X86::SHR16ri:  case X86::SHR32ri:case X86::SHR64ri:
      return getTruncatedShiftCount(MI, 2) != 0;
 
   // Some left shift instructions can be turned into LEA instructions but only
   // if their flags aren't used. Avoid transforming such instructions.
   case X86::SHL8ri:    case X86::SHL16ri:  case X86::SHL32ri:case X86::SHL64ri:{
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
     if (isTruncatedShiftCountForLEA(ShAmt)) return false;
     return ShAmt != 0;
   }
 
   // Double shifts: same non-zero count requirement, count read with index 3.
   case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
   case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
      return getTruncatedShiftCount(MI, 3) != 0;
 
   // Everything below is accepted unconditionally.
   case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
   case X86::SUB32ri8:  case X86::SUB16ri:  case X86::SUB16ri8:
   case X86::SUB8ri:    case X86::SUB64rr:  case X86::SUB32rr:
   case X86::SUB16rr:   case X86::SUB8rr:   case X86::SUB64rm:
   case X86::SUB32rm:   case X86::SUB16rm:  case X86::SUB8rm:
   case X86::DEC64r:    case X86::DEC32r:   case X86::DEC16r: case X86::DEC8r:
   case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
   case X86::ADD32ri8:  case X86::ADD16ri:  case X86::ADD16ri8:
   case X86::ADD8ri:    case X86::ADD64rr:  case X86::ADD32rr:
   case X86::ADD16rr:   case X86::ADD8rr:   case X86::ADD64rm:
   case X86::ADD32rm:   case X86::ADD16rm:  case X86::ADD8rm:
   case X86::INC64r:    case X86::INC32r:   case X86::INC16r: case X86::INC8r:
   case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
   case X86::AND32ri8:  case X86::AND16ri:  case X86::AND16ri8:
   case X86::AND8ri:    case X86::AND64rr:  case X86::AND32rr:
   case X86::AND16rr:   case X86::AND8rr:   case X86::AND64rm:
   case X86::AND32rm:   case X86::AND16rm:  case X86::AND8rm:
   case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
   case X86::XOR32ri8:  case X86::XOR16ri:  case X86::XOR16ri8:
   case X86::XOR8ri:    case X86::XOR64rr:  case X86::XOR32rr:
   case X86::XOR16rr:   case X86::XOR8rr:   case X86::XOR64rm:
   case X86::XOR32rm:   case X86::XOR16rm:  case X86::XOR8rm:
   case X86::OR64ri32:  case X86::OR64ri8:  case X86::OR32ri:
   case X86::OR32ri8:   case X86::OR16ri:   case X86::OR16ri8:
   case X86::OR8ri:     case X86::OR64rr:   case X86::OR32rr:
   case X86::OR16rr:    case X86::OR8rr:    case X86::OR64rm:
   case X86::OR32rm:    case X86::OR16rm:   case X86::OR8rm:
   case X86::NEG8r:     case X86::NEG16r:   case X86::NEG32r: case X86::NEG64r:
   case X86::SAR8r1:    case X86::SAR16r1:  case X86::SAR32r1:case X86::SAR64r1:
   case X86::SHR8r1:    case X86::SHR16r1:  case X86::SHR32r1:case X86::SHR64r1:
   case X86::SHL8r1:    case X86::SHL16r1:  case X86::SHL32r1:case X86::SHL64r1:
   case X86::ADC32ri:   case X86::ADC32ri8:
   case X86::ADC32rr:   case X86::ADC64ri32:
   case X86::ADC64ri8:  case X86::ADC64rr:
   case X86::SBB32ri:   case X86::SBB32ri8:
   case X86::SBB32rr:   case X86::SBB64ri32:
   case X86::SBB64ri8:  case X86::SBB64rr:
   case X86::ANDN32rr:  case X86::ANDN32rm:
   case X86::ANDN64rr:  case X86::ANDN64rm:
   case X86::BEXTR32rr: case X86::BEXTR64rr:
   case X86::BEXTR32rm: case X86::BEXTR64rm:
   case X86::BLSI32rr:  case X86::BLSI32rm:
   case X86::BLSI64rr:  case X86::BLSI64rm:
   case X86::BLSMSK32rr:case X86::BLSMSK32rm:
   case X86::BLSMSK64rr:case X86::BLSMSK64rm:
   case X86::BLSR32rr:  case X86::BLSR32rm:
   case X86::BLSR64rr:  case X86::BLSR64rm:
   case X86::BZHI32rr:  case X86::BZHI32rm:
   case X86::BZHI64rr:  case X86::BZHI64rm:
   case X86::LZCNT16rr: case X86::LZCNT16rm:
   case X86::LZCNT32rr: case X86::LZCNT32rm:
   case X86::LZCNT64rr: case X86::LZCNT64rm:
   case X86::POPCNT16rr:case X86::POPCNT16rm:
   case X86::POPCNT32rr:case X86::POPCNT32rm:
   case X86::POPCNT64rr:case X86::POPCNT64rm:
   case X86::TZCNT16rr: case X86::TZCNT16rm:
   case X86::TZCNT32rr: case X86::TZCNT32rm:
   case X86::TZCNT64rr: case X86::TZCNT64rm:
     return true;
   }
 }
 
 /// isUseDefConvertible - check whether the use can be converted
 /// to remove a comparison against zero.
 ///
 /// Returns the condition code associated with MI's opcode: COND_B for the
 /// LZCNT and TZCNT forms, COND_E for the POPCNT forms, and COND_INVALID for
 /// every other opcode.
 static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
   X86::CondCode Result = X86::COND_INVALID;
 
   switch (MI->getOpcode()) {
   case X86::POPCNT16rr: case X86::POPCNT16rm:
   case X86::POPCNT32rr: case X86::POPCNT32rm:
   case X86::POPCNT64rr: case X86::POPCNT64rm:
     Result = X86::COND_E;
     break;
 
   case X86::LZCNT16rr: case X86::LZCNT16rm:
   case X86::LZCNT32rr: case X86::LZCNT32rm:
   case X86::LZCNT64rr: case X86::LZCNT64rm:
   case X86::TZCNT16rr: case X86::TZCNT16rm:
   case X86::TZCNT32rr: case X86::TZCNT32rm:
   case X86::TZCNT64rr: case X86::TZCNT64rm:
     Result = X86::COND_B;
     break;
 
   default:
     break;
   }
 
   return Result;
 }
 
 /// optimizeCompareInstr - Check if there exists an earlier instruction that
 /// operates on the same source operands and sets flags in the same way as
 /// Compare; remove Compare if possible.
 ///
 /// \param CmpInstr  The compare (or SUB whose result is unused) to remove.
 /// \param SrcReg    First source register of the compare.
 /// \param SrcReg2   Second source register, or 0 when comparing against an
 ///                  immediate.
 /// \param CmpMask   Not used by this implementation.
 /// \param CmpValue  Immediate the compare tests against.
 /// \param MRI       Register info used to find definitions and uses.
 /// \returns true if CmpInstr was erased. Note that a SUB may already have
 /// been rewritten into the matching CMP even when false is returned.
 bool X86InstrInfo::
 optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
                      int CmpMask, int CmpValue,
                      const MachineRegisterInfo *MRI) const {
   // Check whether we can replace SUB with CMP.
   unsigned NewOpcode = 0;
   switch (CmpInstr->getOpcode()) {
   default: break;
   case X86::SUB64ri32:
   case X86::SUB64ri8:
   case X86::SUB32ri:
   case X86::SUB32ri8:
   case X86::SUB16ri:
   case X86::SUB16ri8:
   case X86::SUB8ri:
   case X86::SUB64rm:
   case X86::SUB32rm:
   case X86::SUB16rm:
   case X86::SUB8rm:
   case X86::SUB64rr:
   case X86::SUB32rr:
   case X86::SUB16rr:
   case X86::SUB8rr: {
     if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
       return false;
     // There is no use of the destination register, we can replace SUB with CMP.
     switch (CmpInstr->getOpcode()) {
     default: llvm_unreachable("Unreachable!");
     case X86::SUB64rm:   NewOpcode = X86::CMP64rm;   break;
     case X86::SUB32rm:   NewOpcode = X86::CMP32rm;   break;
     case X86::SUB16rm:   NewOpcode = X86::CMP16rm;   break;
     case X86::SUB8rm:    NewOpcode = X86::CMP8rm;    break;
     case X86::SUB64rr:   NewOpcode = X86::CMP64rr;   break;
     case X86::SUB32rr:   NewOpcode = X86::CMP32rr;   break;
     case X86::SUB16rr:   NewOpcode = X86::CMP16rr;   break;
     case X86::SUB8rr:    NewOpcode = X86::CMP8rr;    break;
     case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
     case X86::SUB64ri8:  NewOpcode = X86::CMP64ri8;  break;
     case X86::SUB32ri:   NewOpcode = X86::CMP32ri;   break;
     case X86::SUB32ri8:  NewOpcode = X86::CMP32ri8;  break;
     case X86::SUB16ri:   NewOpcode = X86::CMP16ri;   break;
     case X86::SUB16ri8:  NewOpcode = X86::CMP16ri8;  break;
     case X86::SUB8ri:    NewOpcode = X86::CMP8ri;    break;
     }
     CmpInstr->setDesc(get(NewOpcode));
     CmpInstr->RemoveOperand(0);
     // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
     if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
         NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
       return false;
   }
   }
 
   // Get the unique definition of SrcReg.
   MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
   if (!MI) return false;
 
   // I points at CmpInstr, Def at the instruction defining SrcReg.
   MachineBasicBlock::iterator I = CmpInstr, Def = MI;
 
   // If we are comparing against zero, check whether we can use MI to update
   // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
   bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
   if (IsCmpZero && MI->getParent() != CmpInstr->getParent())
     return false;
 
   // If we have a use of the source register between the def and our compare
   // instruction we can eliminate the compare iff the use sets EFLAGS in the
   // right way.
   bool ShouldUpdateCC = false;
   X86::CondCode NewCC = X86::COND_INVALID;
   if (IsCmpZero && !isDefConvertible(MI)) {
     // Scan forward from the def until we hit the use we're looking for or the
     // compare instruction.
     for (MachineBasicBlock::iterator J = MI;; ++J) {
       // Do we have a convertible instruction?
       NewCC = isUseDefConvertible(J);
       if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
           J->getOperand(1).getReg() == SrcReg) {
         assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
         ShouldUpdateCC = true; // Update CC later on.
         // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
         // with the new def.
         MI = Def = J;
         break;
       }
 
       if (J == I)
         return false;
     }
   }
 
   // We are searching for an earlier instruction that can make CmpInstr
   // redundant and that instruction will be saved in Sub.
   MachineInstr *Sub = nullptr;
   const TargetRegisterInfo *TRI = &getRegisterInfo();
 
   // We iterate backward, starting from the instruction before CmpInstr and
   // stop when reaching the definition of a source register or done with the BB.
   // RI points to the instruction before CmpInstr.
   // If the definition is in this basic block, RE points to the definition;
   // otherwise, RE is the rend of the basic block.
   MachineBasicBlock::reverse_iterator
       RI = MachineBasicBlock::reverse_iterator(I),
       RE = CmpInstr->getParent() == MI->getParent() ?
            MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
            CmpInstr->getParent()->rend();
   MachineInstr *Movr0Inst = nullptr;
   for (; RI != RE; ++RI) {
     MachineInstr *Instr = &*RI;
     // Check whether CmpInstr can be made redundant by the current instruction.
     if (!IsCmpZero &&
         isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
       Sub = Instr;
       break;
     }
 
     if (Instr->modifiesRegister(X86::EFLAGS, TRI) ||
         Instr->readsRegister(X86::EFLAGS, TRI)) {
       // This instruction modifies or uses EFLAGS.
 
       // MOV32r0 etc. are implemented with xor which clobbers condition code.
       // They are safe to move up, if the definition to EFLAGS is dead and
       // earlier instructions do not read or write EFLAGS.
       if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 &&
           Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
         Movr0Inst = Instr;
         continue;
       }
 
       // We can't remove CmpInstr.
       return false;
     }
   }
 
   // Return false if no candidates exist.
   if (!IsCmpZero && !Sub)
     return false;
 
   // SrcReg2 != 0 implies !IsCmpZero, so Sub is non-null whenever it is
   // dereferenced here.
   bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
                     Sub->getOperand(2).getReg() == SrcReg);
 
   // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
   // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
   // If we are done with the basic block, we need to check whether EFLAGS is
   // live-out.
   bool IsSafe = false;
   SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
   MachineBasicBlock::iterator E = CmpInstr->getParent()->end();
   for (++I; I != E; ++I) {
     const MachineInstr &Instr = *I;
     bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
     bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
     // We should check the usage if this instruction uses and updates EFLAGS.
     if (!UseEFLAGS && ModifyEFLAGS) {
       // It is safe to remove CmpInstr if EFLAGS is updated again.
       IsSafe = true;
       break;
     }
     if (!UseEFLAGS && !ModifyEFLAGS)
       continue;
 
     // EFLAGS is used by this instruction.
     X86::CondCode OldCC = X86::COND_INVALID;
     bool OpcIsSET = false;
     if (IsCmpZero || IsSwapped) {
       // We decode the condition code from opcode.
       if (Instr.isBranch())
         OldCC = getCondFromBranchOpc(Instr.getOpcode());
       else {
         OldCC = getCondFromSETOpc(Instr.getOpcode());
         if (OldCC != X86::COND_INVALID)
           OpcIsSET = true;
         else
           OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
       }
       if (OldCC == X86::COND_INVALID) return false;
     }
 
     // Condition code this particular EFLAGS reader has to be rewritten to.
     // It is computed afresh for every reader: NewCC must keep the value
     // obtained from isUseDefConvertible, otherwise an inversion applied for
     // one COND_NE reader would leak into the readers that follow it.
     X86::CondCode ReplacementCC = X86::COND_INVALID;
     if (IsCmpZero) {
       switch (OldCC) {
       default: break;
       case X86::COND_A: case X86::COND_AE:
       case X86::COND_B: case X86::COND_BE:
       case X86::COND_G: case X86::COND_GE:
       case X86::COND_L: case X86::COND_LE:
       case X86::COND_O: case X86::COND_NO:
         // CF and OF are used, we can't perform this optimization.
         return false;
       }
 
       // If we're updating the condition code check if we have to reverse the
       // condition.
       if (ShouldUpdateCC)
         switch (OldCC) {
         default:
           return false;
         case X86::COND_E:
           ReplacementCC = NewCC;
           break;
         case X86::COND_NE:
           ReplacementCC = GetOppositeBranchCondition(NewCC);
           break;
         }
     } else if (IsSwapped) {
       // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
       // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
       // We swap the condition code and synthesize the new opcode.
       ReplacementCC = getSwappedCondition(OldCC);
       if (ReplacementCC == X86::COND_INVALID) return false;
     }
 
     if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) {
       // Synthesize the new opcode.
       bool HasMemoryOperand = Instr.hasOneMemOperand();
       unsigned NewOpc;
       if (Instr.isBranch())
         NewOpc = GetCondBranchFromCond(ReplacementCC);
       else if(OpcIsSET)
         NewOpc = getSETFromCond(ReplacementCC, HasMemoryOperand);
       else {
         unsigned DstReg = Instr.getOperand(0).getReg();
         NewOpc = getCMovFromCond(ReplacementCC,
                                  MRI->getRegClass(DstReg)->getSize(),
                                  HasMemoryOperand);
       }
 
       // Push the MachineInstr to OpsToUpdate.
       // If it is safe to remove CmpInstr, the condition code of these
       // instructions will be modified.
       OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
     }
     if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
       // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
       IsSafe = true;
       break;
     }
   }
 
   // If EFLAGS is not killed nor re-defined, we should check whether it is
   // live-out. If it is live-out, do not optimize.
   if ((IsCmpZero || IsSwapped) && !IsSafe) {
     MachineBasicBlock *MBB = CmpInstr->getParent();
     for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
              SE = MBB->succ_end(); SI != SE; ++SI)
       if ((*SI)->isLiveIn(X86::EFLAGS))
         return false;
   }
 
   // The instruction to be updated is either Sub or MI.
   Sub = IsCmpZero ? MI : Sub;
   // Move Movr0Inst to the appropriate place before Sub.
   if (Movr0Inst) {
     // Look backwards until we find a def that doesn't use the current EFLAGS.
     Def = Sub;
     MachineBasicBlock::reverse_iterator
       InsertI = MachineBasicBlock::reverse_iterator(++Def),
                 InsertE = Sub->getParent()->rend();
     for (; InsertI != InsertE; ++InsertI) {
       MachineInstr *Instr = &*InsertI;
       if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
           Instr->modifiesRegister(X86::EFLAGS, TRI)) {
         Sub->getParent()->remove(Movr0Inst);
         Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
                                    Movr0Inst);
         break;
       }
     }
     if (InsertI == InsertE)
       return false;
   }
 
   // Make sure Sub instruction defines EFLAGS and mark the def live.
   unsigned i = 0, e = Sub->getNumOperands();
   for (; i != e; ++i) {
     MachineOperand &MO = Sub->getOperand(i);
     if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
       MO.setIsDead(false);
       break;
     }
   }
   assert(i != e && "Unable to locate a def EFLAGS operand");
 
   CmpInstr->eraseFromParent();
 
   // Modify the condition code of instructions in OpsToUpdate.
   for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++)
     OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second));
   return true;
 }
 
 /// optimizeLoadInstr - Attempt to eliminate a load by folding it into the
 /// register operand of MI that consumes it. Folding is tried only when MI
 /// is not itself a load, the load is safe to move, and MI uses the loaded
 /// virtual register exactly once, as a full-register use.
 ///
 /// \param MI               Candidate user of the loaded value.
 /// \param MRI              Used to look up the defining load.
 /// \param FoldAsLoadDefReg Register defined by the candidate load; reset to 0
 ///                         when MI is a load or when folding succeeds.
 /// \param DefMI            Set to the instruction defining FoldAsLoadDefReg.
 /// \returns the folded instruction, or nullptr if nothing was folded.
 MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI,
                                               const MachineRegisterInfo *MRI,
                                               unsigned &FoldAsLoadDefReg,
                                               MachineInstr *&DefMI) const {
   // No load candidate recorded.
   if (!FoldAsLoadDefReg)
     return nullptr;
 
   // Be conservative: seeing another load invalidates the candidate.
   if (MI->mayLoad()) {
     FoldAsLoadDefReg = 0;
     return nullptr;
   }
 
   // The defining load must be movable down to MI.
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
   bool StoreSeen = false;
   if (!DefMI->isSafeToMove(this, nullptr, StoreSeen))
     return nullptr;
 
   // Locate the single operand of MI that reads the loaded register.
   bool HaveUse = false;
   unsigned UseIdx = 0;
   const unsigned NumDescOps = MI->getDesc().getNumOperands();
   for (unsigned Idx = 0; Idx != NumDescOps; ++Idx) {
     MachineOperand &Opnd = MI->getOperand(Idx);
     if (!Opnd.isReg() || Opnd.getReg() != FoldAsLoadDefReg)
       continue;
 
     // Sub-register uses, defs, and repeated uses cannot be folded.
     if (Opnd.getSubReg() || Opnd.isDef() || HaveUse)
       return nullptr;
 
     HaveUse = true;
     UseIdx = Idx;
   }
   if (!HaveUse)
     return nullptr;
 
   // Try to fold the load into that operand.
   SmallVector<unsigned, 8> FoldOps;
   FoldOps.push_back(UseIdx);
   MachineInstr *Folded = foldMemoryOperand(MI, FoldOps, DefMI);
   if (!Folded)
     return nullptr;
 
   FoldAsLoadDefReg = 0;
   return Folded;
 }
 
 /// Expand2AddrUndef - Turn a single-def pseudo instruction into the
 /// two-address instruction described by Desc, feeding it two undef reads of
 /// the register it defines. For example
 ///   %xmm4 = V_SET0
 /// becomes
 ///   %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
 ///
 /// Always returns true.
 static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
                              const MCInstrDesc &Desc) {
   assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
   const unsigned DefReg = MIB->getOperand(0).getReg();
   MIB->setDesc(Desc);
 
   // MachineInstr::addOperand() is expected to place explicit operands ahead
   // of any implicit ones.
   MIB.addReg(DefReg, RegState::Undef);
   MIB.addReg(DefReg, RegState::Undef);
 
   // Verify that assumption rather than relying on it.
   assert(MIB->getOperand(1).getReg() == DefReg &&
          MIB->getOperand(2).getReg() == DefReg && "Misplaced operand");
   return true;
 }
 
 // Expand LOAD_STACK_GUARD into two MOV64rm instructions: the first loads the
 // guard variable's address through a RIP-relative GOTPCREL reference, the
 // second (the rewritten pseudo itself) loads through that address.
 // So far this is implemented for 64-bit MachO only; other targets need a
 // different code sequence.
 static void expandLoadStackGuard(MachineInstrBuilder &MIB,
                                  const TargetInstrInfo &TII) {
   MachineBasicBlock &ParentMBB = *MIB->getParent();
   DebugLoc Loc = MIB->getDebugLoc();
   const unsigned GuardReg = MIB->getOperand(0).getReg();
   const GlobalValue *GuardGV =
       cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
 
   // Memory operand describing the invariant 8-byte load from the GOT.
   const unsigned GOTLoadFlags =
       MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
   MachineMemOperand *GOTMemOp = ParentMBB.getParent()->getMachineMemOperand(
       MachinePointerInfo::getGOT(), GOTLoadFlags, 8, 8);
 
   // Insert the GOT load in front of the pseudo.
   MachineBasicBlock::iterator InsertPt = MIB.getInstr();
   MachineInstrBuilder GOTLoad =
       BuildMI(ParentMBB, InsertPt, Loc, TII.get(X86::MOV64rm), GuardReg);
   GOTLoad.addReg(X86::RIP);
   GOTLoad.addImm(1);
   GOTLoad.addReg(0);
   GOTLoad.addGlobalAddress(GuardGV, 0, X86II::MO_GOTPCREL);
   GOTLoad.addReg(0);
   GOTLoad.addMemOperand(GOTMemOp);
 
   // Rewrite the pseudo in place into the load through GuardReg.
   MIB->setDebugLoc(Loc);
   MIB->setDesc(TII.get(X86::MOV64rm));
   MIB.addReg(GuardReg, RegState::Kill);
   MIB.addImm(1);
   MIB.addReg(0);
   MIB.addImm(0);
   MIB.addReg(0);
 }
 
 /// expandPostRAPseudo - Expand the pseudo instruction at MI into a real
 /// instruction. Most pseudos handled here become a two-address instruction
 /// reading the defined register twice as undef (see Expand2AddrUndef).
 /// Returns true if MI was expanded, false if its opcode is not handled.
 bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   bool HasAVX = Subtarget.hasAVX();
   MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
 
   // Opcode of the two-address instruction the pseudo expands to.
   unsigned TwoAddrOpc = 0;
   switch (MI->getOpcode()) {
   default:
     return false;
 
   // Pseudos that are rewritten without going through Expand2AddrUndef.
   case X86::TEST8ri_NOREX:
     MI->setDesc(get(X86::TEST8ri));
     return true;
   case TargetOpcode::LOAD_STACK_GUARD:
     expandLoadStackGuard(MIB, *this);
     return true;
 
   case X86::MOV32r0:
     TwoAddrOpc = X86::XOR32rr;
     break;
   case X86::SETB_C8r:
     TwoAddrOpc = X86::SBB8rr;
     break;
   case X86::SETB_C16r:
     TwoAddrOpc = X86::SBB16rr;
     break;
   case X86::SETB_C32r:
     TwoAddrOpc = X86::SBB32rr;
     break;
   case X86::SETB_C64r:
     TwoAddrOpc = X86::SBB64rr;
     break;
   case X86::V_SET0:
   case X86::FsFLD0SS:
   case X86::FsFLD0SD:
     TwoAddrOpc = HasAVX ? X86::VXORPSrr : X86::XORPSrr;
     break;
   case X86::AVX_SET0:
     assert(HasAVX && "AVX not supported");
     TwoAddrOpc = X86::VXORPSYrr;
     break;
   case X86::AVX512_512_SET0:
     TwoAddrOpc = X86::VPXORDZrr;
     break;
   case X86::V_SETALLONES:
     TwoAddrOpc = HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr;
     break;
   case X86::AVX2_SETALLONES:
     TwoAddrOpc = X86::VPCMPEQDYrr;
     break;
   case X86::KSET0B:
   case X86::KSET0W:
     TwoAddrOpc = X86::KXORWrr;
     break;
   case X86::KSET1B:
   case X86::KSET1W:
     TwoAddrOpc = X86::KXNORWrr;
     break;
   }
   return Expand2AddrUndef(MIB, get(TwoAddrOpc));
 }
 
 /// FuseTwoAddrInst - Build the memory form (Opcode) of the two-address
 /// instruction MI: the address operands in MOs replace MI's first two
 /// operands, and every remaining operand of MI is copied over unchanged.
 static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
                                      const SmallVectorImpl<MachineOperand> &MOs,
                                      MachineInstr *MI,
                                      const TargetInstrInfo &TII) {
   // Create the instruction without implicit operands (BuildMI cannot do
   // that); they are copied from MI below.
   MachineInstr *Fused = MF.CreateMachineInstr(TII.get(Opcode),
                                               MI->getDebugLoc(), true);
   MachineInstrBuilder MIB(MF, Fused);
 
   // The memory reference comes first.
   const unsigned AddrOpCount = MOs.size();
   for (unsigned Idx = 0; Idx != AddrOpCount; ++Idx)
     MIB.addOperand(MOs[Idx]);
   if (AddrOpCount < 4)  // FrameIndex only
     addOffset(MIB, 0);
 
   // Copy the described operands that follow the two tied registers.
   const unsigned DescOpCount = MI->getDesc().getNumOperands();
   for (unsigned Idx = 2; Idx != DescOpCount; ++Idx)
     MIB.addOperand(MI->getOperand(Idx));
 
   // Then copy whatever MI carries beyond its descriptor.
   const unsigned TotalOpCount = MI->getNumOperands();
   for (unsigned Idx = DescOpCount; Idx != TotalOpCount; ++Idx)
     MIB.addOperand(MI->getOperand(Idx));
 
   return MIB;
 }
 
 /// FuseInst - Build the memory form (Opcode) of MI in which operand OpNo is
 /// replaced by the address operands in MOs; all other operands of MI are
 /// copied over in order.
 static MachineInstr *FuseInst(MachineFunction &MF,
                               unsigned Opcode, unsigned OpNo,
                               const SmallVectorImpl<MachineOperand> &MOs,
                               MachineInstr *MI, const TargetInstrInfo &TII) {
   // Create the instruction without implicit operands (BuildMI cannot do
   // that); they are copied from MI below.
   MachineInstr *Fused = MF.CreateMachineInstr(TII.get(Opcode),
                                               MI->getDebugLoc(), true);
   MachineInstrBuilder MIB(MF, Fused);
 
   const unsigned TotalOpCount = MI->getNumOperands();
   for (unsigned OpIdx = 0; OpIdx != TotalOpCount; ++OpIdx) {
     MachineOperand &MO = MI->getOperand(OpIdx);
     if (OpIdx != OpNo) {
       MIB.addOperand(MO);
       continue;
     }
 
     // This is the operand being folded: emit the memory reference instead.
     assert(MO.isReg() && "Expected to fold into reg operand!");
     const unsigned AddrOpCount = MOs.size();
     for (unsigned AddrIdx = 0; AddrIdx != AddrOpCount; ++AddrIdx)
       MIB.addOperand(MOs[AddrIdx]);
     if (AddrOpCount < 4)  // FrameIndex only
       addOffset(MIB, 0);
   }
   return MIB;
 }
 
 /// MakeM0Inst - Build an instruction with the given Opcode whose operands
 /// are the memory reference in MOs followed by an immediate zero.
 static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
                                 const SmallVectorImpl<MachineOperand> &MOs,
                                 MachineInstr *MI) {
   MachineFunction &MF = *MI->getParent()->getParent();
   MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
 
   // Memory reference first.
   const unsigned AddrOpCount = MOs.size();
   for (unsigned Idx = 0; Idx != AddrOpCount; ++Idx)
     MIB.addOperand(MOs[Idx]);
   if (AddrOpCount < 4)  // FrameIndex only
     addOffset(MIB, 0);
 
   // Then the zero immediate.
   MIB.addImm(0);
   return MIB;
 }
 
 /// foldMemoryOperandImpl - Try to fold the memory reference described by MOs
 /// into operand number i of MI, producing a new instruction that uses the
 /// memory form of the opcode.
 ///
 /// \param MF           Function in which the new instruction is created.
 /// \param MI           Instruction to fold into. It may be commuted in place
 ///                     (and commuted back) while alternatives are tried.
 /// \param i            Index of the register operand to replace.
 /// \param MOs          Address operands; fewer than four means a frame index
 ///                     only, and the helpers append a zero offset.
 /// \param Size         Size of the memory object; 0 skips the size check.
 /// \param Align        Alignment of the memory object, checked against the
 ///                     minimum alignment recorded in the folding table.
 /// \param AllowCommute If true, retry with the commuted operand on failure.
 /// \returns the new folded instruction, or nullptr if folding is not possible.
 MachineInstr*
 X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                     MachineInstr *MI, unsigned i,
                                     const SmallVectorImpl<MachineOperand> &MOs,
                                     unsigned Size, unsigned Align,
                                     bool AllowCommute) const {
   // Folding table matching the operand index; stays null if none applies.
   const DenseMap<unsigned,
                  std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
   bool isCallRegIndirect = Subtarget.callRegIndirect();
   bool isTwoAddrFold = false;
 
   // Atom favors register form of call. So, we do not fold loads into calls
   // when X86Subtarget is Atom.
   if (isCallRegIndirect &&
     (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) {
     return nullptr;
   }
 
   unsigned NumOps = MI->getDesc().getNumOperands();
   // Two-address instruction: operand 1 is tied to another operand.
   bool isTwoAddr = NumOps > 1 &&
     MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
 
   // FIXME: AsmPrinter doesn't know how to handle
   // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
   if (MI->getOpcode() == X86::ADD32ri &&
       MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
     return nullptr;
 
   MachineInstr *NewMI = nullptr;
   // Folding a memory location into the two-address part of a two-address
   // instruction is different than folding it other places.  It requires
   // replacing the *two* registers with the memory location.
   if (isTwoAddr && NumOps >= 2 && i < 2 &&
       MI->getOperand(0).isReg() &&
       MI->getOperand(1).isReg() &&
       MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) {
     OpcodeTablePtr = &RegOp2MemOpTable2Addr;
     isTwoAddrFold = true;
   } else if (i == 0) { // If operand 0
     // Storing zero: MOV32r0 folds to a MOV32mi of immediate 0.
     if (MI->getOpcode() == X86::MOV32r0) {
       NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
       if (NewMI)
         return NewMI;
     }
 
     OpcodeTablePtr = &RegOp2MemOpTable0;
   } else if (i == 1) {
     OpcodeTablePtr = &RegOp2MemOpTable1;
   } else if (i == 2) {
     OpcodeTablePtr = &RegOp2MemOpTable2;
   } else if (i == 3) {
     OpcodeTablePtr = &RegOp2MemOpTable3;
   } else if (i == 4) {
     OpcodeTablePtr = &RegOp2MemOpTable4;
   }
 
   // If table selected...
   if (OpcodeTablePtr) {
     // Find the Opcode to fuse
     DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
       OpcodeTablePtr->find(MI->getOpcode());
     if (I != OpcodeTablePtr->end()) {
       // Table entry: first is the memory-form opcode, second holds flags
       // from which the minimum alignment is extracted.
       unsigned Opcode = I->second.first;
       unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
       if (Align < MinAlign)
         return nullptr;
       bool NarrowToMOV32rm = false;
       if (Size) {
         unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize();
         if (Size < RCSize) {
           // Check if it's safe to fold the load. If the size of the object is
           // narrower than the load width, then it's not.
           if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
             return nullptr;
           // If this is a 64-bit load, but the spill slot is 32, then we can do
           // a 32-bit load which is implicitly zero-extended. This likely is
           // due to live interval analysis remat'ing a load from stack slot.
           if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
             return nullptr;
           Opcode = X86::MOV32rm;
           NarrowToMOV32rm = true;
         }
       }
 
       if (isTwoAddrFold)
         NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
       else
         NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this);
 
       if (NarrowToMOV32rm) {
         // If this is the special case where we use a MOV32rm to load a 32-bit
         // value and zero-extend the top bits. Change the destination register
         // to a 32-bit one.
         unsigned DstReg = NewMI->getOperand(0).getReg();
         if (TargetRegisterInfo::isPhysicalRegister(DstReg))
           NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
         else
           NewMI->getOperand(0).setSubReg(X86::sub_32bit);
       }
       return NewMI;
     }
   }
 
   // If the instruction and target operand are commutable, commute the
   // instruction and try again.
   if (AllowCommute) {
     unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2;
     if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
       bool HasDef = MI->getDesc().getNumDefs();
       unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0;
       unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg();
       unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg();
       // Whether each commutable operand is tied to operand 0.
       bool Tied0 =
           0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
       bool Tied1 =
           0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
 
       // If either of the commutable operands are tied to the destination
       // then we can not commute + fold.
       if ((HasDef && Reg0 == Reg1 && Tied0) ||
           (HasDef && Reg0 == Reg2 && Tied1))
         return nullptr;
 
       // Only worth commuting if the operand being folded is one of the two
       // commutable operands.
       if ((CommuteOpIdx1 == OriginalOpIdx) ||
           (CommuteOpIdx2 == OriginalOpIdx)) {
         MachineInstr *CommutedMI = commuteInstruction(MI, false);
         if (!CommutedMI) {
           // Unable to commute.
           return nullptr;
         }
         if (CommutedMI != MI) {
           // New instruction. We can't fold from this.
           CommutedMI->eraseFromParent();
           return nullptr;
         }
 
         // Attempt to fold with the commuted version of the instruction.
         unsigned CommuteOp =
             (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1);
         // AllowCommute is false here so the retry cannot commute again.
         NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align,
                                       /*AllowCommute=*/false);
         if (NewMI)
           return NewMI;
 
         // Folding failed again - undo the commute before returning.
         MachineInstr *UncommutedMI = commuteInstruction(MI, false);
         if (!UncommutedMI) {
           // Unable to commute.
           return nullptr;
         }
         if (UncommutedMI != MI) {
           // New instruction. It doesn't need to be kept.
           UncommutedMI->eraseFromParent();
           return nullptr;
         }
 
         // Return here to prevent duplicate fuse failure report.
         return nullptr;
       }
     }
   }
 
   // No fusion
   if (PrintFailedFusing && !MI->isCopy())
     dbgs() << "We failed to fuse operand " << i << " in " << *MI;
   return nullptr;
 }
 
 /// hasPartialRegUpdate - Return true for instructions that write only the
 /// low 32 or 64 bits of their destination register and leave the remaining
 /// bits untouched (e.g. cvtss2sd, sqrtss). Callers use this to avoid folding
 /// a load into such an instruction when the untouched part is not needed:
 /// keeping the load separate breaks the partial register dependency and can
 /// improve performance, i.e. prefer
 ///
 ///   movss (%rdi), %xmm0
 ///   cvtss2sd %xmm0, %xmm0
 ///
 /// over
 ///
 ///   cvtss2sd (%rdi), %xmm0
 ///
 /// FIXME: This should be turned into a TSFlags.
 ///
 static bool hasPartialRegUpdate(unsigned Opcode) {
   switch (Opcode) {
   default:
     return false;
   // Scalar conversions.
   case X86::CVTSI2SSrr:     case X86::CVTSI2SSrm:
   case X86::CVTSI2SS64rr:   case X86::CVTSI2SS64rm:
   case X86::CVTSI2SDrr:     case X86::CVTSI2SDrm:
   case X86::CVTSI2SD64rr:   case X86::CVTSI2SD64rm:
   case X86::CVTSD2SSrr:     case X86::CVTSD2SSrm:
   case X86::Int_CVTSD2SSrr: case X86::Int_CVTSD2SSrm:
   case X86::CVTSS2SDrr:     case X86::CVTSS2SDrm:
   case X86::Int_CVTSS2SDrr: case X86::Int_CVTSS2SDrm:
   // Scalar reciprocal.
   case X86::RCPSSr:         case X86::RCPSSm:
   case X86::RCPSSr_Int:     case X86::RCPSSm_Int:
   // Scalar rounding.
   case X86::ROUNDSDr:       case X86::ROUNDSDm:
   case X86::ROUNDSDr_Int:
   case X86::ROUNDSSr:       case X86::ROUNDSSm:
   case X86::ROUNDSSr_Int:
   // Scalar reciprocal square root.
   case X86::RSQRTSSr:       case X86::RSQRTSSm:
   case X86::RSQRTSSr_Int:   case X86::RSQRTSSm_Int:
   // Scalar square root.
   case X86::SQRTSSr:        case X86::SQRTSSm:
   case X86::SQRTSSr_Int:    case X86::SQRTSSm_Int:
   case X86::SQRTSDr:        case X86::SQRTSDm:
   case X86::SQRTSDr_Int:    case X86::SQRTSDm_Int:
     return true;
   }
 }
 
 /// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle
 /// instructions we would like before a partial register update.
 unsigned X86InstrInfo::
 getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
                              const TargetRegisterInfo *TRI) const {
   if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode()))
     return 0;
 
   // If MI is marked as reading Reg, the partial register update is wanted.
   const MachineOperand &MO = MI->getOperand(0);
   unsigned Reg = MO.getReg();
   if (TargetRegisterInfo::isVirtualRegister(Reg)) {
     if (MO.readsReg() || MI->readsVirtualRegister(Reg))
       return 0;
   } else {
     if (MI->readsRegister(Reg, TRI))
       return 0;
   }
 
   // If any of the preceding 16 instructions are reading Reg, insert a
   // dependency breaking instruction.  The magic number is based on a few
   // Nehalem experiments.
   return 16;
 }
 
 // Return true for any instruction that copies the high bits of its first
 // source operand into the otherwise unused high bits of the destination
 // operand.
 static bool hasUndefRegUpdate(unsigned Opcode) {
   switch (Opcode) {
   default:
     return false;
   // Scalar conversions.
   case X86::VCVTSI2SSrr:       case X86::VCVTSI2SSrm:
   case X86::Int_VCVTSI2SSrr:   case X86::Int_VCVTSI2SSrm:
   case X86::VCVTSI2SS64rr:     case X86::VCVTSI2SS64rm:
   case X86::Int_VCVTSI2SS64rr: case X86::Int_VCVTSI2SS64rm:
   case X86::VCVTSI2SDrr:       case X86::VCVTSI2SDrm:
   case X86::Int_VCVTSI2SDrr:   case X86::Int_VCVTSI2SDrm:
   case X86::VCVTSI2SD64rr:     case X86::VCVTSI2SD64rm:
   case X86::Int_VCVTSI2SD64rr: case X86::Int_VCVTSI2SD64rm:
   case X86::VCVTSD2SSrr:       case X86::VCVTSD2SSrm:
   case X86::Int_VCVTSD2SSrr:   case X86::Int_VCVTSD2SSrm:
   case X86::VCVTSS2SDrr:       case X86::VCVTSS2SDrm:
   case X86::Int_VCVTSS2SDrr:   case X86::Int_VCVTSS2SDrm:
   // Scalar reciprocal.
   case X86::VRCPSSr:           case X86::VRCPSSm:
   case X86::VRCPSSm_Int:
   // Scalar rounding.
   case X86::VROUNDSDr:         case X86::VROUNDSDm:
   case X86::VROUNDSDr_Int:
   case X86::VROUNDSSr:         case X86::VROUNDSSm:
   case X86::VROUNDSSr_Int:
   // Scalar reciprocal square root.
   case X86::VRSQRTSSr:         case X86::VRSQRTSSm:
   case X86::VRSQRTSSm_Int:
   // Scalar square root.
   case X86::VSQRTSSr:          case X86::VSQRTSSm:
   case X86::VSQRTSSm_Int:
   case X86::VSQRTSDr:          case X86::VSQRTSDm:
   case X86::VSQRTSDm_Int:
   // AVX-512
   case X86::VCVTSD2SSZrr:      case X86::VCVTSD2SSZrm:
   case X86::VCVTSS2SDZrr:      case X86::VCVTSS2SDZrm:
     return true;
   }
 }
 
 /// Inform the ExeDepsFix pass how many idle instructions we would like before
 /// certain undef register reads.
 ///
 /// This catches the VCVTSI2SD family of instructions:
 ///
 /// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
 ///
 /// We should to be careful *not* to catch VXOR idioms which are presumably
 /// handled specially in the pipeline:
 ///
 /// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
 ///
 /// Like getPartialRegUpdateClearance, this makes a strong assumption that the
 /// high bits that are passed-through are not live.
 unsigned X86InstrInfo::
 getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
                      const TargetRegisterInfo *TRI) const {
   if (!hasUndefRegUpdate(MI->getOpcode()))
     return 0;
 
   // Set the OpNum parameter to the first source operand.
   OpNum = 1;
 
   const MachineOperand &MO = MI->getOperand(OpNum);
   if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
     // Use the same magic number as getPartialRegUpdateClearance.
     return 16;
   }
   return 0;
 }
 
 /// breakPartialRegDependency - Insert a register-clearing xorps/vxorps in
 /// front of MI for the register in operand OpNum, then mark that register
 /// as killed on MI. Nothing is done if MI already kills the register or if
 /// the register is in neither VR128 nor VR256.
 void X86InstrInfo::
 breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
                           const TargetRegisterInfo *TRI) const {
   const unsigned Reg = MI->getOperand(OpNum).getReg();
 
   // If MI kills this register, the false dependence is already broken.
   if (MI->killsRegister(Reg, TRI))
     return;
 
   const bool IsXMM = X86::VR128RegClass.contains(Reg);
   const bool IsYMM = !IsXMM && X86::VR256RegClass.contains(Reg);
   if (!IsXMM && !IsYMM)
     return;
 
   if (IsXMM) {
     // These instructions are all floating point domain, so xorps is the best
     // choice.
     const unsigned XorOpc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
     MachineInstrBuilder Xor =
         BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(XorOpc), Reg);
     Xor.addReg(Reg, RegState::Undef);
     Xor.addReg(Reg, RegState::Undef);
   } else {
     // Use vxorps to clear the full ymm register.
     // It wants to read and write the xmm sub-register.
     const unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
     MachineInstrBuilder Xor = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
                                       get(X86::VXORPSrr), XReg);
     Xor.addReg(XReg, RegState::Undef);
     Xor.addReg(XReg, RegState::Undef);
     Xor.addReg(Reg, RegState::ImplicitDefine);
   }
 
   MI->addRegisterKilled(Reg, TRI, true);
 }
 
 MachineInstr*
 X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                     const SmallVectorImpl<unsigned> &Ops,
                                     int FrameIndex) const {
   // Folding can be switched off entirely.
   if (NoFusing)
     return nullptr;

   // Folding into an instruction with a partial register update risks a stall,
   // so it is only done when the function is optimized for size.
   const bool OptForSize = MF.getFunction()->getAttributes().
       hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
   if (!OptForSize && hasPartialRegUpdate(MI->getOpcode()))
     return nullptr;

   const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
   unsigned SlotSize = FrameInfo->getObjectSize(FrameIndex);
   unsigned SlotAlign = FrameInfo->getObjectAlignment(FrameIndex);

   // Without stack realignment the slot cannot be assumed to be aligned any
   // better than the stack itself, so clamp to the stack alignment.
   if (!RI.needsStackRealignment(MF)) {
     unsigned StackAlign = MF.getTarget()
                               .getSubtargetImpl()
                               ->getFrameLowering()
                               ->getStackAlignment();
     SlotAlign = std::min(SlotAlign, StackAlign);
   }

   const bool FoldsBothTestOperands =
       Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1;
   if (FoldsBothTestOperands) {
     // Only "TEST r, r" may have both of its operands folded; it is rewritten
     // as a compare against zero.
     unsigned CmpOpc = 0;
     unsigned LoadWidth = 0;
     switch (MI->getOpcode()) {
     case X86::TEST8rr:
       CmpOpc = X86::CMP8ri;
       LoadWidth = 1;
       break;
     case X86::TEST16rr:
       CmpOpc = X86::CMP16ri8;
       LoadWidth = 2;
       break;
     case X86::TEST32rr:
       CmpOpc = X86::CMP32ri8;
       LoadWidth = 4;
       break;
     case X86::TEST64rr:
       CmpOpc = X86::CMP64ri8;
       LoadWidth = 8;
       break;
     default:
       return nullptr;
     }

     // The fold is unsafe when the stack object is narrower than the load.
     if (SlotSize < LoadWidth)
       return nullptr;

     // Change to CMPXXri r, 0 first.
     MI->setDesc(get(CmpOpc));
     MI->getOperand(1).ChangeToImmediate(0);
   } else if (Ops.size() != 1) {
     return nullptr;
   }

   // The memory reference is just the frame index.
   SmallVector<MachineOperand,4> FrameOps;
   FrameOps.push_back(MachineOperand::CreateFI(FrameIndex));
   return foldMemoryOperandImpl(MF, MI, Ops[0], FrameOps,
                                SlotSize, SlotAlign, /*AllowCommute=*/true);
 }
 
 /// Return true if \p LoadMI is a scalar SSE/AVX load (MOVSS/MOVSD or the VEX
 /// forms) whose destination register class is wider than the bytes actually
 /// loaded. Such a load cannot be folded.
 static bool isPartialRegisterLoad(const MachineInstr &LoadMI,
                                   const MachineFunction &MF) {
   // Size in bytes of the register class of the load's destination.
   const unsigned DestBytes =
       MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();

   switch (LoadMI.getOpcode()) {
   case X86::MOVSSrm:
   case X86::VMOVSSrm:
     // These instructions only load 32 bits (4 bytes).
     return DestBytes > 4;
   case X86::MOVSDrm:
   case X86::VMOVSDrm:
     // These instructions only load 64 bits (8 bytes).
     return DestBytes > 8;
   default:
     return false;
   }
 }
 
 MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                                   MachineInstr *MI,
                                            const SmallVectorImpl<unsigned> &Ops,
                                                   MachineInstr *LoadMI) const {
   // A reload from a stack slot is folded through the frame-index overload,
   // unless the load only partially writes its destination register.
   int StackSlot;
   if (isLoadFromStackSlot(LoadMI, StackSlot)) {
     if (isPartialRegisterLoad(*LoadMI, MF))
       return nullptr;
     return foldMemoryOperandImpl(MF, MI, Ops, StackSlot);
   }

   // Folding can be switched off entirely.
   if (NoFusing)
     return nullptr;

   // Folding into an instruction with a partial register update risks a stall,
   // so it is only done when the function is optimized for size.
   const bool OptForSize = MF.getFunction()->getAttributes().
       hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
   if (!OptForSize && hasPartialRegUpdate(MI->getOpcode()))
     return nullptr;

   // Determine the alignment of the load: take it from the single memory
   // operand when there is one, otherwise from the kind of constant the
   // pseudo instruction materializes.
   unsigned LoadAlign = 0;
   if (LoadMI->hasOneMemOperand()) {
     LoadAlign = (*LoadMI->memoperands_begin())->getAlignment();
   } else {
     switch (LoadMI->getOpcode()) {
     default:
       return nullptr;
     case X86::AVX2_SETALLONES:
     case X86::AVX_SET0:
       LoadAlign = 32;
       break;
     case X86::V_SET0:
     case X86::V_SETALLONES:
       LoadAlign = 16;
       break;
     case X86::FsFLD0SD:
       LoadAlign = 8;
       break;
     case X86::FsFLD0SS:
       LoadAlign = 4;
       break;
     }
   }

   const bool FoldsBothTestOperands =
       Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1;
   if (FoldsBothTestOperands) {
     // Only "TEST r, r" may have both of its operands folded.
     unsigned CmpOpc = 0;
     switch (MI->getOpcode()) {
     case X86::TEST8rr:
       CmpOpc = X86::CMP8ri;
       break;
     case X86::TEST16rr:
       CmpOpc = X86::CMP16ri8;
       break;
     case X86::TEST32rr:
       CmpOpc = X86::CMP32ri8;
       break;
     case X86::TEST64rr:
       CmpOpc = X86::CMP64ri8;
       break;
     default:
       return nullptr;
     }
     // Change to CMPXXri r, 0 first.
     MI->setDesc(get(CmpOpc));
     MI->getOperand(1).ChangeToImmediate(0);
   } else if (Ops.size() != 1) {
     return nullptr;
   }

   // Make sure the subregisters match.
   // Otherwise we risk changing the size of the load.
   const unsigned LoadSubReg = LoadMI->getOperand(0).getSubReg();
   const unsigned UseSubReg = MI->getOperand(Ops[0]).getSubReg();
   if (LoadSubReg != UseSubReg)
     return nullptr;

   SmallVector<MachineOperand,X86::AddrNumOperands> AddrOps;
   const unsigned LoadOpc = LoadMI->getOpcode();
   switch (LoadOpc) {
   case X86::V_SET0:
   case X86::V_SETALLONES:
   case X86::AVX2_SETALLONES:
   case X86::AVX_SET0:
   case X86::FsFLD0SD:
   case X86::FsFLD0SS: {
     // Folding a zero / all-ones pseudo as a load, to ease register pressure.
     // Create a constant-pool entry and operands to load from it.

     // Medium and large mode can't fold loads this way.
     switch (MF.getTarget().getCodeModel()) {
     case CodeModel::Small:
     case CodeModel::Kernel:
       break;
     default:
       return nullptr;
     }

     // x86-32 PIC requires a PIC base register for constant pools.
     unsigned PICBase = 0;
     if (MF.getTarget().getRelocationModel() == Reloc::PIC_) {
       // FIXME: PICBase = getGlobalBaseReg(&MF);
       // This doesn't work for several reasons.
       // 1. GlobalBaseReg may have been spilled.
       // 2. It may not be live at MI.
       if (!Subtarget.is64Bit())
         return nullptr;
       PICBase = X86::RIP;
     }

     // Pick the IR type of the constant from the pseudo opcode.
     MachineConstantPool &ConstPool = *MF.getConstantPool();
     LLVMContext &Ctx = MF.getFunction()->getContext();
     Type *CstTy;
     switch (LoadOpc) {
     case X86::FsFLD0SS:
       CstTy = Type::getFloatTy(Ctx);
       break;
     case X86::FsFLD0SD:
       CstTy = Type::getDoubleTy(Ctx);
       break;
     case X86::AVX2_SETALLONES:
     case X86::AVX_SET0:
       CstTy = VectorType::get(Type::getInt32Ty(Ctx), 8);
       break;
     default:
       CstTy = VectorType::get(Type::getInt32Ty(Ctx), 4);
       break;
     }

     // Create the constant-pool entry.
     const Constant *CstVal;
     if (LoadOpc == X86::V_SETALLONES || LoadOpc == X86::AVX2_SETALLONES)
       CstVal = Constant::getAllOnesValue(CstTy);
     else
       CstVal = Constant::getNullValue(CstTy);
     unsigned PoolIdx = ConstPool.getConstantPoolIndex(CstVal, LoadAlign);

     // Create operands to load from the constant pool entry.
     AddrOps.push_back(MachineOperand::CreateReg(PICBase, false));
     AddrOps.push_back(MachineOperand::CreateImm(1));
     AddrOps.push_back(MachineOperand::CreateReg(0, false));
     AddrOps.push_back(MachineOperand::CreateCPI(PoolIdx, 0));
     AddrOps.push_back(MachineOperand::CreateReg(0, false));
     break;
   }
   default: {
     if (isPartialRegisterLoad(*LoadMI, MF))
       return nullptr;

     // Folding a normal load. Just copy the load's address operands, which are
     // the last X86::AddrNumOperands operands of its descriptor.
     const unsigned DescOps = LoadMI->getDesc().getNumOperands();
     for (unsigned Idx = DescOps - X86::AddrNumOperands; Idx != DescOps; ++Idx)
       AddrOps.push_back(LoadMI->getOperand(Idx));
     break;
   }
   }

   return foldMemoryOperandImpl(MF, MI, Ops[0], AddrOps,
                                /*Size=*/0, LoadAlign, /*AllowCommute=*/true);
 }
 
 
 bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
                                   const SmallVectorImpl<unsigned> &Ops) const {
   // Check switch flag
   if (NoFusing) return 0;
 
   if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
     switch (MI->getOpcode()) {
     default: return false;
     case X86::TEST8rr:
     case X86::TEST16rr:
     case X86::TEST32rr:
     case X86::TEST64rr:
       return true;
     case X86::ADD32ri:
       // FIXME: AsmPrinter doesn't know how to handle
       // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
       if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
         return false;
       break;
     }
   }
 
   if (Ops.size() != 1)
     return false;
 
   unsigned OpNum = Ops[0];
   unsigned Opc = MI->getOpcode();
   unsigned NumOps = MI->getDesc().getNumOperands();
   bool isTwoAddr = NumOps > 1 &&
     MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
 
   // Folding a memory location into the two-address part of a two-address
   // instruction is different than folding it other places.  It requires
   // replacing the *two* registers with the memory location.
   const DenseMap<unsigned,
                  std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
   if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
     OpcodeTablePtr = &RegOp2MemOpTable2Addr;
   } else if (OpNum == 0) { // If operand 0
     if (Opc == X86::MOV32r0)
       return true;
 
     OpcodeTablePtr = &RegOp2MemOpTable0;
   } else if (OpNum == 1) {
     OpcodeTablePtr = &RegOp2MemOpTable1;
   } else if (OpNum == 2) {
     OpcodeTablePtr = &RegOp2MemOpTable2;
   } else if (OpNum == 3) {
     OpcodeTablePtr = &RegOp2MemOpTable3;
   }
 
   if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
     return true;
   return TargetInstrInfo::canFoldMemoryOperand(MI, Ops);
 }
 
 bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
                                 unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
                                 SmallVectorImpl<MachineInstr*> &NewMIs) const {
   // Find the register form of this memory-folded opcode.
   DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator Entry =
     MemOp2RegOpTable.find(MI->getOpcode());
   if (Entry == MemOp2RegOpTable.end())
     return false;

   const unsigned RegOpc = Entry->second.first;
   const unsigned Flags = Entry->second.second;
   const unsigned MemIdx = Flags & TB_INDEX_MASK;
   const bool FoldedLoad = Flags & TB_FOLDED_LOAD;
   const bool FoldedStore = Flags & TB_FOLDED_STORE;

   // The caller may only ask to unfold what was actually folded.
   if ((UnfoldLoad && !FoldedLoad) || (UnfoldStore && !FoldedStore))
     return false;

   const MCInstrDesc &RegDesc = get(RegOpc);
   const TargetRegisterClass *RC = getRegClass(RegDesc, MemIdx, &RI, MF);
   if (!MI->hasOneMemOperand() &&
       RC == &X86::VR128RegClass &&
       !Subtarget.isUnalignedMemAccessFast())
     // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
     // conservatively assume the address is unaligned. That's bad for
     // performance.
     return false;

   // Split MI's operands into the address operands, the implicit register
   // operands, and the explicit operands before / after the address.
   SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
   SmallVector<MachineOperand,2> BeforeOps;
   SmallVector<MachineOperand,2> AfterOps;
   SmallVector<MachineOperand,4> ImpOps;
   const unsigned AddrEnd = MemIdx + X86::AddrNumOperands;
   for (unsigned Idx = 0, NumMIOps = MI->getNumOperands(); Idx != NumMIOps;
        ++Idx) {
     MachineOperand &Op = MI->getOperand(Idx);
     if (Idx >= MemIdx && Idx < AddrEnd) {
       AddrOps.push_back(Op);
       continue;
     }
     if (Op.isReg() && Op.isImplicit()) {
       ImpOps.push_back(Op);
       continue;
     }
     // Outside the address range, an index is either below MemIdx or past the
     // end of the address operands.
     if (Idx < MemIdx)
       BeforeOps.push_back(Op);
     else
       AfterOps.push_back(Op);
   }

   // Emit the load instruction.
   if (UnfoldLoad) {
     std::pair<MachineInstr::mmo_iterator,
               MachineInstr::mmo_iterator> LoadRefs =
       MF.extractLoadMemRefs(MI->memoperands_begin(),
                             MI->memoperands_end());
     loadRegFromAddr(MF, Reg, AddrOps, RC, LoadRefs.first, LoadRefs.second,
                     NewMIs);
     if (UnfoldStore) {
       // Address operands cannot be marked isKill: the store emitted below
       // uses the same address operands again.
       for (unsigned Idx = 1; Idx != 1 + X86::AddrNumOperands; ++Idx) {
         MachineOperand &AddrOp = NewMIs[0]->getOperand(Idx);
         if (AddrOp.isReg())
           AddrOp.setIsKill(false);
       }
     }
   }

   // Emit the data processing instruction.
   MachineInstr *DataMI = MF.CreateMachineInstr(RegDesc, MI->getDebugLoc(), true);
   MachineInstrBuilder MIB(MF, DataMI);

   if (FoldedStore)
     MIB.addReg(Reg, RegState::Define);
   for (const MachineOperand &Op : BeforeOps)
     MIB.addOperand(Op);
   if (FoldedLoad)
     MIB.addReg(Reg);
   for (const MachineOperand &Op : AfterOps)
     MIB.addOperand(Op);
   for (const MachineOperand &Op : ImpOps) {
     // Re-create each implicit register operand with the same flags.
     const unsigned ImpFlags = getDefRegState(Op.isDef()) |
                               RegState::Implicit |
                               getKillRegState(Op.isKill()) |
                               getDeadRegState(Op.isDead()) |
                               getUndefRegState(Op.isUndef());
     MIB.addReg(Op.getReg(), ImpFlags);
   }

   // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
   bool IsCmpWithImm = true;
   unsigned TestOpc = 0;
   switch (DataMI->getOpcode()) {
   case X86::CMP64ri8:
   case X86::CMP64ri32:
     TestOpc = X86::TEST64rr;
     break;
   case X86::CMP32ri8:
   case X86::CMP32ri:
     TestOpc = X86::TEST32rr;
     break;
   case X86::CMP16ri8:
   case X86::CMP16ri:
     TestOpc = X86::TEST16rr;
     break;
   case X86::CMP8ri:
     TestOpc = X86::TEST8rr;
     break;
   default:
     IsCmpWithImm = false;
     break;
   }
   if (IsCmpWithImm) {
     MachineOperand &LHS = DataMI->getOperand(0);
     MachineOperand &RHS = DataMI->getOperand(1);
     if (RHS.getImm() == 0) {
       DataMI->setDesc(get(TestOpc));
       RHS.ChangeToRegister(LHS.getReg(), false);
     }
   }
   NewMIs.push_back(DataMI);

   // Emit the store instruction.
   if (UnfoldStore) {
     const TargetRegisterClass *DstRC = getRegClass(RegDesc, 0, &RI, MF);
     std::pair<MachineInstr::mmo_iterator,
               MachineInstr::mmo_iterator> StoreRefs =
       MF.extractStoreMemRefs(MI->memoperands_begin(),
                              MI->memoperands_end());
     storeRegToAddr(MF, Reg, true, AddrOps, DstRC, StoreRefs.first,
                    StoreRefs.second, NewMIs);
   }

   return true;
 }
 
 /// Unfold the memory-folded machine node \p N into separate nodes.
 ///
 /// The machine opcode of \p N is looked up in MemOp2RegOpTable. Depending on
 /// the TB_FOLDED_LOAD / TB_FOLDED_STORE bits of the table entry this creates
 /// an explicit load node, the register-form data processing node and an
 /// explicit store node, appending each of them to \p NewNodes in that order.
 ///
 /// \returns false when \p N is not a machine node, when its opcode has no
 /// table entry, or when the load or store would be a VR128 access without
 /// memory reference information on a subtarget where unaligned accesses are
 /// not fast. Note that the store-side check runs after the load and data
 /// nodes have already been appended to \p NewNodes. Returns true otherwise.
 bool
 X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
                                   SmallVectorImpl<SDNode*> &NewNodes) const {
   if (!N->isMachineOpcode())
     return false;

   DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
     MemOp2RegOpTable.find(N->getMachineOpcode());
   if (I == MemOp2RegOpTable.end())
     return false;
   unsigned Opc = I->second.first;
   unsigned Index = I->second.second & TB_INDEX_MASK;
   bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
   bool FoldedStore = I->second.second & TB_FOLDED_STORE;
   const MCInstrDesc &MCID = get(Opc);
   MachineFunction &MF = DAG.getMachineFunction();
   const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
   unsigned NumDefs = MCID.NumDefs;

   // Split the node operands (all but the last one, which is the chain) into
   // the address operands and the operands before / after them. Index counts
   // machine operands including defs, hence the NumDefs adjustment.
   std::vector<SDValue> AddrOps;
   std::vector<SDValue> BeforeOps;
   std::vector<SDValue> AfterOps;
   SDLoc dl(N);
   unsigned NumOps = N->getNumOperands();
   for (unsigned i = 0; i != NumOps-1; ++i) {
     SDValue Op = N->getOperand(i);
     if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
       AddrOps.push_back(Op);
     else if (i < Index-NumDefs)
       BeforeOps.push_back(Op);
     else if (i > Index-NumDefs)
       AfterOps.push_back(Op);
   }
   SDValue Chain = N->getOperand(NumOps-1);
   AddrOps.push_back(Chain);

   // Emit the load instruction.
   SDNode *Load = nullptr;
   if (FoldedLoad) {
     EVT VT = *RC->vt_begin();
     std::pair<MachineInstr::mmo_iterator,
               MachineInstr::mmo_iterator> MMOs =
       MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                             cast<MachineSDNode>(N)->memoperands_end());
     if (!(*MMOs.first) &&
         RC == &X86::VR128RegClass &&
         !Subtarget.isUnalignedMemAccessFast())
       // Do not introduce a slow unaligned load.
       return false;
     // The aligned load opcode is only used when the memory reference
     // guarantees the alignment: 32 bytes for 32-byte register classes, 16
     // bytes otherwise.
     unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
     bool isAligned = (*MMOs.first) &&
                      (*MMOs.first)->getAlignment() >= Alignment;
     Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
                               VT, MVT::Other, AddrOps);
     NewNodes.push_back(Load);

     // Preserve memory reference information.
     cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
   }

   // Emit the data processing instruction. Its first result type comes from
   // the register class of def 0 (if any); the remaining non-chain result
   // types beyond the defs are taken over from N.
   std::vector<EVT> VTs;
   const TargetRegisterClass *DstRC = nullptr;
   if (MCID.getNumDefs() > 0) {
     DstRC = getRegClass(MCID, 0, &RI, MF);
     VTs.push_back(*DstRC->vt_begin());
   }
   for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
     EVT VT = N->getValueType(i);
     if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
       VTs.push_back(VT);
   }
   // Operand order of the new node: BeforeOps, the loaded value, AfterOps.
   if (Load)
     BeforeOps.push_back(SDValue(Load, 0));
   std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
   SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
   NewNodes.push_back(NewNode);

   // Emit the store instruction.
   if (FoldedStore) {
     // Store operands: address operands, the value to store, then the chain.
     AddrOps.pop_back();
     AddrOps.push_back(SDValue(NewNode, 0));
     AddrOps.push_back(Chain);
     std::pair<MachineInstr::mmo_iterator,
               MachineInstr::mmo_iterator> MMOs =
       MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
                              cast<MachineSDNode>(N)->memoperands_end());
     if (!(*MMOs.first) &&
         RC == &X86::VR128RegClass &&
         !Subtarget.isUnalignedMemAccessFast())
       // Do not introduce a slow unaligned store.
       return false;
     unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
     bool isAligned = (*MMOs.first) &&
                      (*MMOs.first)->getAlignment() >= Alignment;
     SDNode *Store =
         DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
                            dl, MVT::Other, AddrOps);
     NewNodes.push_back(Store);

     // Preserve memory reference information. The store memrefs belong on the
     // store node: attaching them to Load would dereference a null pointer
     // when only a store was folded, and would otherwise overwrite the load's
     // memrefs while leaving the store without any.
     cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
   }

   return true;
 }
 
 unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
                                       bool UnfoldLoad, bool UnfoldStore,
                                       unsigned *LoadRegIndex) const {
   DenseMap<unsigned, std::pair<unsigned,unsigned> >::const_iterator I =
     MemOp2RegOpTable.find(Opc);
   if (I == MemOp2RegOpTable.end())
     return 0;
   bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
   bool FoldedStore = I->second.second & TB_FOLDED_STORE;
   if (UnfoldLoad && !FoldedLoad)
     return 0;
   if (UnfoldStore && !FoldedStore)
     return 0;
   if (LoadRegIndex)
     *LoadRegIndex = I->second.second & TB_INDEX_MASK;
   return I->second.first;
 }
 
 bool
 X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
                                      int64_t &Offset1, int64_t &Offset2) const {
   if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
     return false;

   // Both nodes must be one of the plain load opcodes listed here.
   auto IsCandidateLoad = [](unsigned Opc) -> bool {
     switch (Opc) {
     case X86::MOV8rm:
     case X86::MOV16rm:
     case X86::MOV32rm:
     case X86::MOV64rm:
     case X86::LD_Fp32m:
     case X86::LD_Fp64m:
     case X86::LD_Fp80m:
     case X86::MOVSSrm:
     case X86::MOVSDrm:
     case X86::MMX_MOVD64rm:
     case X86::MMX_MOVQ64rm:
     case X86::FsMOVAPSrm:
     case X86::FsMOVAPDrm:
     case X86::MOVAPSrm:
     case X86::MOVUPSrm:
     case X86::MOVAPDrm:
     case X86::MOVDQArm:
     case X86::MOVDQUrm:
     // AVX load instructions
     case X86::VMOVSSrm:
     case X86::VMOVSDrm:
     case X86::FsVMOVAPSrm:
     case X86::FsVMOVAPDrm:
     case X86::VMOVAPSrm:
     case X86::VMOVUPSrm:
     case X86::VMOVAPDrm:
     case X86::VMOVDQArm:
     case X86::VMOVDQUrm:
     case X86::VMOVAPSYrm:
     case X86::VMOVUPSYrm:
     case X86::VMOVAPDYrm:
     case X86::VMOVDQAYrm:
     case X86::VMOVDQUYrm:
       return true;
     default:
       return false;
     }
   };
   if (!IsCandidateLoad(Load1->getMachineOpcode()) ||
       !IsCandidateLoad(Load2->getMachineOpcode()))
     return false;

   // Check if chain operands and base addresses match.
   if (Load1->getOperand(0) != Load2->getOperand(0))
     return false;
   if (Load1->getOperand(5) != Load2->getOperand(5))
     return false;

   // Segment operands should match as well.
   if (Load1->getOperand(4) != Load2->getOperand(4))
     return false;

   // Scale should be 1, Index should be Reg0: operands 1 and 2 have to agree
   // between the two loads, and operand 1 has to be the constant 1.
   if (Load1->getOperand(1) != Load2->getOperand(1) ||
       Load1->getOperand(2) != Load2->getOperand(2))
     return false;
   if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
     return false;

   // Now let's examine the displacements: both must be constants.
   if (!isa<ConstantSDNode>(Load1->getOperand(3)) ||
       !isa<ConstantSDNode>(Load2->getOperand(3)))
     return false;

   Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
   Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
   return true;
 }
 
 bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                            int64_t Offset1, int64_t Offset2,
                                            unsigned NumLoads) const {
   assert(Offset2 > Offset1);

   // Loads that are too far apart are not worth clustering.
   const int64_t Distance = Offset2 - Offset1;
   if (Distance / 8 > 64)
     return false;

   const unsigned Opc1 = Load1->getMachineOpcode();
   const unsigned Opc2 = Load2->getMachineOpcode();
   if (Opc1 != Opc2)
     return false;  // FIXME: overly conservative?

   // x87 and MMX loads are never scheduled near each other.
   switch (Opc1) {
   case X86::LD_Fp32m:
   case X86::LD_Fp64m:
   case X86::LD_Fp80m:
   case X86::MMX_MOVD64rm:
   case X86::MMX_MOVQ64rm:
     return false;
   default:
     break;
   }

   // Number of loads that may already be scheduled together. Scalar integer
   // and floating point loads allow none.
   unsigned MaxPriorLoads = 0;
   EVT VT = Load1->getValueType(0);
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::i8:
   case MVT::i16:
   case MVT::i32:
   case MVT::i64:
   case MVT::f32:
   case MVT::f64:
     break;
   default:
     // XMM registers. In 64-bit mode we can be a bit more aggressive since we
     // have 16 of them to play with.
     if (Subtarget.is64Bit())
       MaxPriorLoads = 2;
     break;
   }

   return NumLoads <= MaxPriorLoads;
 }
 
 bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
                                           MachineInstr *Second) const {
   // Check if this processor supports macro-fusion. Since this is a minor
   // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
   // proxy for SandyBridge+.
   if (!Subtarget.hasAVX())
     return false;

   // Classify the conditional branch. The class decides which kinds of
   // preceding instruction it is paired with below.
   enum {
     FuseTest,
     FuseCmp,
     FuseInc
   } BranchKind;

   switch (Second->getOpcode()) {
   case X86::JE_1:
   case X86::JNE_1:
   case X86::JL_1:
   case X86::JLE_1:
   case X86::JG_1:
   case X86::JGE_1:
     BranchKind = FuseInc;
     break;
   case X86::JB_1:
   case X86::JBE_1:
   case X86::JA_1:
   case X86::JAE_1:
     BranchKind = FuseCmp;
     break;
   case X86::JS_1:
   case X86::JNS_1:
   case X86::JP_1:
   case X86::JNP_1:
   case X86::JO_1:
   case X86::JNO_1:
     BranchKind = FuseTest;
     break;
   default:
     // Not one of the conditional branches handled here.
     return false;
   }

   switch (First->getOpcode()) {
   // TEST and AND pair with every branch class.
   case X86::TEST8rr:
   case X86::TEST16rr:
   case X86::TEST32rr:
   case X86::TEST64rr:
   case X86::TEST8ri:
   case X86::TEST16ri:
   case X86::TEST32ri:
   case X86::TEST32i32:
   case X86::TEST64i32:
   case X86::TEST64ri32:
   case X86::TEST8rm:
   case X86::TEST16rm:
   case X86::TEST32rm:
   case X86::TEST64rm:
   case X86::TEST8ri_NOREX:
   case X86::AND16i16:
   case X86::AND16ri:
   case X86::AND16ri8:
   case X86::AND16rm:
   case X86::AND16rr:
   case X86::AND32i32:
   case X86::AND32ri:
   case X86::AND32ri8:
   case X86::AND32rm:
   case X86::AND32rr:
   case X86::AND64i32:
   case X86::AND64ri32:
   case X86::AND64ri8:
   case X86::AND64rm:
   case X86::AND64rr:
   case X86::AND8i8:
   case X86::AND8ri:
   case X86::AND8rm:
   case X86::AND8rr:
     return true;

   // CMP, ADD and SUB pair with every class except FuseTest.
   case X86::CMP16i16:
   case X86::CMP16ri:
   case X86::CMP16ri8:
   case X86::CMP16rm:
   case X86::CMP16rr:
   case X86::CMP32i32:
   case X86::CMP32ri:
   case X86::CMP32ri8:
   case X86::CMP32rm:
   case X86::CMP32rr:
   case X86::CMP64i32:
   case X86::CMP64ri32:
   case X86::CMP64ri8:
   case X86::CMP64rm:
   case X86::CMP64rr:
   case X86::CMP8i8:
   case X86::CMP8ri:
   case X86::CMP8rm:
   case X86::CMP8rr:
   case X86::ADD16i16:
   case X86::ADD16ri:
   case X86::ADD16ri8:
   case X86::ADD16ri8_DB:
   case X86::ADD16ri_DB:
   case X86::ADD16rm:
   case X86::ADD16rr:
   case X86::ADD16rr_DB:
   case X86::ADD32i32:
   case X86::ADD32ri:
   case X86::ADD32ri8:
   case X86::ADD32ri8_DB:
   case X86::ADD32ri_DB:
   case X86::ADD32rm:
   case X86::ADD32rr:
   case X86::ADD32rr_DB:
   case X86::ADD64i32:
   case X86::ADD64ri32:
   case X86::ADD64ri32_DB:
   case X86::ADD64ri8:
   case X86::ADD64ri8_DB:
   case X86::ADD64rm:
   case X86::ADD64rr:
   case X86::ADD64rr_DB:
   case X86::ADD8i8:
   case X86::ADD8mi:
   case X86::ADD8mr:
   case X86::ADD8ri:
   case X86::ADD8rm:
   case X86::ADD8rr:
   case X86::SUB16i16:
   case X86::SUB16ri:
   case X86::SUB16ri8:
   case X86::SUB16rm:
   case X86::SUB16rr:
   case X86::SUB32i32:
   case X86::SUB32ri:
   case X86::SUB32ri8:
   case X86::SUB32rm:
   case X86::SUB32rr:
   case X86::SUB64i32:
   case X86::SUB64ri32:
   case X86::SUB64ri8:
   case X86::SUB64rm:
   case X86::SUB64rr:
   case X86::SUB8i8:
   case X86::SUB8ri:
   case X86::SUB8rm:
   case X86::SUB8rr:
     return BranchKind != FuseTest;

   // INC and DEC pair only with the FuseInc class.
   case X86::INC16r:
   case X86::INC32r:
   case X86::INC64r:
   case X86::INC8r:
   case X86::DEC16r:
   case X86::DEC32r:
   case X86::DEC64r:
   case X86::DEC8r:
     return BranchKind == FuseInc;

   default:
     return false;
   }
 }
 
 bool X86InstrInfo::
 ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   assert(Cond.size() == 1 && "Invalid X86 branch condition!");
   const X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());

   // The two compound conditions are not reversed; returning true reports
   // that Cond was left unchanged.
   switch (CC) {
   case X86::COND_NE_OR_P:
   case X86::COND_NP_OR_E:
     return true;
   default:
     break;
   }

   // Replace the condition in place by its opposite.
   Cond[0].setImm(GetOppositeBranchCondition(CC));
   return false;
 }
 
 bool X86InstrInfo::
 isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
   // FIXME: Return false for x87 stack register classes for now. We can't
   // allow any loads of these registers before FpGet_ST0_80.
   // The CCR register class is rejected as well.
   return RC != &X86::CCRRegClass &&
          RC != &X86::RFP32RegClass &&
          RC != &X86::RFP64RegClass &&
          RC != &X86::RFP80RegClass;
 }
 
 /// getGlobalBaseReg - Return a virtual register initialized with the
 /// global base register value. Output instructions required to
 /// initialize the register in the function entry block, if necessary.
 ///
 /// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
 ///
 unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
   assert(!Subtarget.is64Bit() &&
          "X86-64 PIC uses RIP relative addressing");

   // The register is cached in the per-function info; zero means it has not
   // been created yet.
   X86MachineFunctionInfo *FuncInfo = MF->getInfo<X86MachineFunctionInfo>();
   unsigned BaseReg = FuncInfo->getGlobalBaseReg();
   if (BaseReg == 0) {
     // Create the register. The code to initialize it is inserted
     // later, by the CGBR pass (below).
     BaseReg =
         MF->getRegInfo().createVirtualRegister(&X86::GR32_NOSPRegClass);
     FuncInfo->setGlobalBaseReg(BaseReg);
   }
   return BaseReg;
 }
 
 // These are the replaceable SSE instructions. Some of these have Int variants
 // that we don't include here. We don't want to replace instructions selected
 // by intrinsics.
 //
 // Each row lists the same operation in the three execution domains; column
 // N-1 holds the opcode for domain N (see lookup(), which indexes a row with
 // domain-1).
 static const uint16_t ReplaceableInstrs[][3] = {
   //PackedSingle     PackedDouble    PackedInt
   { X86::MOVAPSmr,   X86::MOVAPDmr,  X86::MOVDQAmr  },
   { X86::MOVAPSrm,   X86::MOVAPDrm,  X86::MOVDQArm  },
   { X86::MOVAPSrr,   X86::MOVAPDrr,  X86::MOVDQArr  },
   { X86::MOVUPSmr,   X86::MOVUPDmr,  X86::MOVDQUmr  },
   { X86::MOVUPSrm,   X86::MOVUPDrm,  X86::MOVDQUrm  },
   { X86::MOVNTPSmr,  X86::MOVNTPDmr, X86::MOVNTDQmr },
   { X86::ANDNPSrm,   X86::ANDNPDrm,  X86::PANDNrm   },
   { X86::ANDNPSrr,   X86::ANDNPDrr,  X86::PANDNrr   },
   { X86::ANDPSrm,    X86::ANDPDrm,   X86::PANDrm    },
   { X86::ANDPSrr,    X86::ANDPDrr,   X86::PANDrr    },
   { X86::ORPSrm,     X86::ORPDrm,    X86::PORrm     },
   { X86::ORPSrr,     X86::ORPDrr,    X86::PORrr     },
   { X86::XORPSrm,    X86::XORPDrm,   X86::PXORrm    },
   { X86::XORPSrr,    X86::XORPDrr,   X86::PXORrr    },
   // AVX 128-bit support
   { X86::VMOVAPSmr,  X86::VMOVAPDmr,  X86::VMOVDQAmr  },
   { X86::VMOVAPSrm,  X86::VMOVAPDrm,  X86::VMOVDQArm  },
   { X86::VMOVAPSrr,  X86::VMOVAPDrr,  X86::VMOVDQArr  },
   { X86::VMOVUPSmr,  X86::VMOVUPDmr,  X86::VMOVDQUmr  },
   { X86::VMOVUPSrm,  X86::VMOVUPDrm,  X86::VMOVDQUrm  },
   { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
   { X86::VANDNPSrm,  X86::VANDNPDrm,  X86::VPANDNrm   },
   { X86::VANDNPSrr,  X86::VANDNPDrr,  X86::VPANDNrr   },
   { X86::VANDPSrm,   X86::VANDPDrm,   X86::VPANDrm    },
   { X86::VANDPSrr,   X86::VANDPDrr,   X86::VPANDrr    },
   { X86::VORPSrm,    X86::VORPDrm,    X86::VPORrm     },
   { X86::VORPSrr,    X86::VORPDrr,    X86::VPORrr     },
   { X86::VXORPSrm,   X86::VXORPDrm,   X86::VPXORrm    },
   { X86::VXORPSrr,   X86::VXORPDrr,   X86::VPXORrr    },
   // AVX 256-bit support
   { X86::VMOVAPSYmr,   X86::VMOVAPDYmr,   X86::VMOVDQAYmr  },
   { X86::VMOVAPSYrm,   X86::VMOVAPDYrm,   X86::VMOVDQAYrm  },
   { X86::VMOVAPSYrr,   X86::VMOVAPDYrr,   X86::VMOVDQAYrr  },
   { X86::VMOVUPSYmr,   X86::VMOVUPDYmr,   X86::VMOVDQUYmr  },
   { X86::VMOVUPSYrm,   X86::VMOVUPDYrm,   X86::VMOVDQUYrm  },
   { X86::VMOVNTPSYmr,  X86::VMOVNTPDYmr,  X86::VMOVNTDQYmr }
 };
 
 // Replaceable instructions whose PackedInt form is only usable with AVX2:
 // getExecutionDomain() reports the integer domain as valid for these rows
 // only when the subtarget has AVX2. The layout matches ReplaceableInstrs
 // (column N-1 holds the opcode for domain N). From the VEXTRACTF128 row on,
 // the PackedSingle and PackedDouble columns name the same opcode.
 static const uint16_t ReplaceableInstrsAVX2[][3] = {
   //PackedSingle       PackedDouble       PackedInt
   { X86::VANDNPSYrm,   X86::VANDNPDYrm,   X86::VPANDNYrm   },
   { X86::VANDNPSYrr,   X86::VANDNPDYrr,   X86::VPANDNYrr   },
   { X86::VANDPSYrm,    X86::VANDPDYrm,    X86::VPANDYrm    },
   { X86::VANDPSYrr,    X86::VANDPDYrr,    X86::VPANDYrr    },
   { X86::VORPSYrm,     X86::VORPDYrm,     X86::VPORYrm     },
   { X86::VORPSYrr,     X86::VORPDYrr,     X86::VPORYrr     },
   { X86::VXORPSYrm,    X86::VXORPDYrm,    X86::VPXORYrm    },
   { X86::VXORPSYrr,    X86::VXORPDYrr,    X86::VPXORYrr    },
   { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
   { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
   { X86::VINSERTF128rm,  X86::VINSERTF128rm,  X86::VINSERTI128rm },
   { X86::VINSERTF128rr,  X86::VINSERTF128rr,  X86::VINSERTI128rr },
   { X86::VPERM2F128rm,   X86::VPERM2F128rm,   X86::VPERM2I128rm },
   { X86::VPERM2F128rr,   X86::VPERM2F128rr,   X86::VPERM2I128rr },
   { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
   { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
   { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
   { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
   { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
   { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}
 };
 
 // FIXME: Some shuffle and unpack instructions have equivalents in different
 // domains, but they require a bit more work than just switching opcodes.
 
 /// Search ReplaceableInstrs for the row whose entry for \p domain equals
 /// \p opcode.
 ///
 /// \p domain is 1-based and is used as column index domain-1, so callers must
 /// not pass 0. Returns a pointer to the matching three-entry row, or nullptr
 /// when no row matches.
 static const uint16_t *lookup(unsigned opcode, unsigned domain) {
   const unsigned Column = domain - 1;
   for (const auto &Row : ReplaceableInstrs) {
     if (Row[Column] == opcode)
       return Row;
   }
   return nullptr;
 }
 
 /// Search ReplaceableInstrsAVX2 for the row whose entry for \p domain equals
 /// \p opcode.
 ///
 /// \p domain is 1-based and is used as column index domain-1, so callers must
 /// not pass 0. Returns a pointer to the matching three-entry row, or nullptr
 /// when no row matches.
 static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
   const unsigned Column = domain - 1;
   for (const auto &Row : ReplaceableInstrsAVX2) {
     if (Row[Column] == opcode)
       return Row;
   }
   return nullptr;
 }
 
 /// Report the execution domain of \p MI together with the set of domains it
 /// may be switched to.
 ///
 /// The first member is the two-bit SSE domain field read from the
 /// instruction's TSFlags. The second member is a bitmask of permitted
 /// domains: 0xe when the opcode is found in ReplaceableInstrs; 0xe (with AVX2)
 /// or 0x6 (without) when it is found only in ReplaceableInstrsAVX2; and 0 when
 /// the domain field is zero or the opcode is in neither table.
 std::pair<uint16_t, uint16_t>
 X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const {
   const uint16_t CurDomain =
       (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   uint16_t Mask = 0;
   if (CurDomain) {
     const unsigned Opc = MI->getOpcode();
     if (lookup(Opc, CurDomain))
       Mask = 0xe;
     else if (lookupAVX2(Opc, CurDomain))
       Mask = Subtarget.hasAVX2() ? 0xe : 0x6;
   }
   return std::make_pair(CurDomain, Mask);
 }
 
 /// Rewrite \p MI in place so that it uses the equivalent opcode for
 /// \p Domain (1..3).
 ///
 /// The replacement opcode is taken from the ReplaceableInstrs row containing
 /// the current opcode, falling back to ReplaceableInstrsAVX2. Every
 /// precondition is checked by assertion only: \p Domain must be in range, the
 /// instruction must have a non-zero SSE domain, a table row must exist, and
 /// when the AVX2 table is needed, domain 3 is accepted only with AVX2.
 void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
   assert(Domain>0 && Domain<4 && "Invalid execution domain");
   const uint16_t dom =
       (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
   assert(dom && "Not an SSE instruction");
 
   const unsigned Opc = MI->getOpcode();
   const uint16_t *table = lookup(Opc, dom);
   if (table == nullptr) {
     // Not in the generic table; consult the AVX2 one instead.
     assert((Subtarget.hasAVX2() || Domain < 3) &&
            "256-bit vector operations only available in AVX2");
     table = lookupAVX2(Opc, dom);
   }
   assert(table && "Cannot change domain");
 
   const unsigned NewOpc = table[Domain-1];
   MI->setDesc(get(NewOpc));
 }
 
 /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
 /// The result is delivered through \p NopInst, whose opcode is set to
 /// X86::NOOP; no operands are added.
 void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
   NopInst.setOpcode(X86::NOOP);
 }
 
 // This code must remain in sync with getJumpInstrTableEntryBound in this class!
 // In particular, getJumpInstrTableEntryBound must always return an upper bound
 // on the encoding lengths of the instructions generated by
 // getUnconditionalBranch and getTrap.
 //
 // Fills Branch with an X86::JMP_1 instruction whose single operand is an
 // expression operand referring to BranchTarget.
 // NOTE(review): the comment in getJumpInstrTableEntryBound talks about JMP_4
 // while this emits JMP_1 - confirm the documented bound matches the encoding
 // that is finally emitted.
 void X86InstrInfo::getUnconditionalBranch(
     MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const {
   Branch.setOpcode(X86::JMP_1);
   Branch.addOperand(MCOperand::CreateExpr(BranchTarget));
 }
 
 // This code must remain in sync with getJumpInstrTableEntryBound in this class!
 // In particular, getJumpInstrTableEntryBound must always return an upper bound
 // on the encoding lengths of the instructions generated by
 // getUnconditionalBranch and getTrap.
 //
 // Sets the opcode of MI to X86::TRAP; no operands are added.
 void X86InstrInfo::getTrap(MCInst &MI) const {
   MI.setOpcode(X86::TRAP);
 }
 
 // See getTrap and getUnconditionalBranch for conditions on the value returned
 // by this function. The returned bound is expressed in bytes.
 unsigned X86InstrInfo::getJumpInstrTableEntryBound() const {
   // 5 bytes suffice: JMP_4 Symbol@PLT uses 1 byte (E9) for the JMP_4 and 4
   // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B).
   return 5;
 }
 
 /// Return true if \p opc is one of the divide, square-root, gather or scatter
 /// opcodes enumerated below, which this target treats as high-latency
 /// definitions. Every other opcode yields false.
 bool X86InstrInfo::isHighLatencyDef(int opc) const {
   switch (opc) {
   // SSE divides.
   case X86::DIVSDrm:       case X86::DIVSDrm_Int:
   case X86::DIVSDrr:       case X86::DIVSDrr_Int:
   case X86::DIVSSrm:       case X86::DIVSSrm_Int:
   case X86::DIVSSrr:       case X86::DIVSSrr_Int:
   // SSE square roots.
   case X86::SQRTPDm:       case X86::SQRTPDr:
   case X86::SQRTPSm:       case X86::SQRTPSr:
   case X86::SQRTSDm:       case X86::SQRTSDm_Int:
   case X86::SQRTSDr:       case X86::SQRTSDr_Int:
   case X86::SQRTSSm:       case X86::SQRTSSm_Int:
   case X86::SQRTSSr:       case X86::SQRTSSr_Int:
   // AVX instructions with high latency: divides.
   case X86::VDIVSDrm:      case X86::VDIVSDrm_Int:
   case X86::VDIVSDrr:      case X86::VDIVSDrr_Int:
   case X86::VDIVSSrm:      case X86::VDIVSSrm_Int:
   case X86::VDIVSSrr:      case X86::VDIVSSrr_Int:
   // AVX instructions with high latency: square roots.
   case X86::VSQRTPDm:      case X86::VSQRTPDr:
   case X86::VSQRTPSm:      case X86::VSQRTPSr:
   case X86::VSQRTSDm:      case X86::VSQRTSDm_Int:
   case X86::VSQRTSDr:
   case X86::VSQRTSSm:      case X86::VSQRTSSm_Int:
   case X86::VSQRTSSr:
   // Z-suffixed square roots and divides.
   case X86::VSQRTPDZm:     case X86::VSQRTPDZr:
   case X86::VSQRTPSZm:     case X86::VSQRTPSZr:
   case X86::VSQRTSDZm:     case X86::VSQRTSDZm_Int:
   case X86::VSQRTSDZr:
   case X86::VSQRTSSZm_Int: case X86::VSQRTSSZr:
   case X86::VSQRTSSZm:
   case X86::VDIVSDZrm:     case X86::VDIVSDZrr:
   case X86::VDIVSSZrm:     case X86::VDIVSSZrr:
   // Gathers.
   case X86::VGATHERQPSZrm: case X86::VGATHERQPDZrm:
   case X86::VGATHERDPDZrm: case X86::VGATHERDPSZrm:
   case X86::VPGATHERQDZrm: case X86::VPGATHERQQZrm:
   case X86::VPGATHERDDZrm: case X86::VPGATHERDQZrm:
   // Scatters.
   case X86::VSCATTERQPDZmr: case X86::VSCATTERQPSZmr:
   case X86::VSCATTERDPDZmr: case X86::VSCATTERDPSZmr:
   case X86::VPSCATTERQDZmr: case X86::VPSCATTERQQZmr:
   case X86::VPSCATTERDDZmr: case X86::VPSCATTERDQZmr:
     return true;
   default:
     return false;
   }
 }
 
 /// Return true if the defining instruction \p DefMI has one of the opcodes
 /// accepted by isHighLatencyDef. Only DefMI's opcode is consulted; ItinData,
 /// MRI, DefIdx, UseMI and UseIdx are not read.
 bool X86InstrInfo::
 hasHighOperandLatency(const InstrItineraryData *ItinData,
                       const MachineRegisterInfo *MRI,
                       const MachineInstr *DefMI, unsigned DefIdx,
                       const MachineInstr *UseMI, unsigned UseIdx) const {
   return isHighLatencyDef(DefMI->getOpcode());
 }
 
 namespace {
   /// CGBR - Create Global Base Reg pass. This initializes the PIC
   /// global base register for x86-32.
   struct CGBR : public MachineFunctionPass {
     static char ID;
     CGBR() : MachineFunctionPass(ID) {}
 
     bool runOnMachineFunction(MachineFunction &MF) override {
       const X86TargetMachine *TM =
         static_cast<const X86TargetMachine *>(&MF.getTarget());
 
       // Don't do anything if this is 64-bit as 64-bit PIC
       // uses RIP relative addressing.
       if (TM->getSubtarget<X86Subtarget>().is64Bit())
         return false;
 
       // Only emit a global base reg in PIC mode.
       if (TM->getRelocationModel() != Reloc::PIC_)
         return false;
 
       X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
       unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
 
       // If we didn't need a GlobalBaseReg, don't insert code.
       if (GlobalBaseReg == 0)
         return false;
 
       // Insert the set of GlobalBaseReg into the first MBB of the function
       MachineBasicBlock &FirstMBB = MF.front();
       MachineBasicBlock::iterator MBBI = FirstMBB.begin();
       DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
       MachineRegisterInfo &RegInfo = MF.getRegInfo();
       const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
 
       unsigned PC;
       if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT())
         PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
       else
         PC = GlobalBaseReg;
 
       // Operand of MovePCtoStack is completely ignored by asm printer. It's
       // only used in JIT code emission as displacement to pc.
       BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
 
       // If we're using vanilla 'GOT' PIC style, we should use relative addressing
       // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
       if (TM->getSubtarget<X86Subtarget>().isPICStyleGOT()) {
         // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
         BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
           .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
                                         X86II::MO_GOT_ABSOLUTE_ADDRESS);
       }
 
       return true;
     }
 
     const char *getPassName() const override {
       return "X86 PIC Global Base Reg Initialization";
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
   };
 }
 
 // Storage for the ID that CGBR's constructor hands to MachineFunctionPass.
 char CGBR::ID = 0;
 // Factory for the pass: returns a newly allocated CGBR instance.
 FunctionPass*
 llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
 
 namespace {
   struct LDTLSCleanup : public MachineFunctionPass {
     static char ID;
     LDTLSCleanup() : MachineFunctionPass(ID) {}
 
     bool runOnMachineFunction(MachineFunction &MF) override {
       X86MachineFunctionInfo* MFI = MF.getInfo<X86MachineFunctionInfo>();
       if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
         // No point folding accesses if there isn't at least two.
         return false;
       }
 
       MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
       return VisitNode(DT->getRootNode(), 0);
     }
 
     // Visit the dominator subtree rooted at Node in pre-order.
     // If TLSBaseAddrReg is non-null, then use that to replace any
     // TLS_base_addr instructions. Otherwise, create the register
     // when the first such instruction is seen, and then use it
     // as we encounter more instructions.
     bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
       MachineBasicBlock *BB = Node->getBlock();
       bool Changed = false;
 
       // Traverse the current block.
       for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
            ++I) {
         switch (I->getOpcode()) {
           case X86::TLS_base_addr32:
           case X86::TLS_base_addr64:
             if (TLSBaseAddrReg)
               I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
             else
               I = SetRegister(I, &TLSBaseAddrReg);
             Changed = true;
             break;
           default:
             break;
         }
       }
 
       // Visit the children of this block in the dominator tree.
       for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
            I != E; ++I) {
         Changed |= VisitNode(*I, TLSBaseAddrReg);
       }
 
       return Changed;
     }
 
     // Replace the TLS_base_addr instruction I with a copy from
     // TLSBaseAddrReg, returning the new instruction.
     MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
                                          unsigned TLSBaseAddrReg) {
       MachineFunction *MF = I->getParent()->getParent();
       const X86TargetMachine *TM =
           static_cast<const X86TargetMachine *>(&MF->getTarget());
       const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
       const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
 
       // Insert a Copy from TLSBaseAddrReg to RAX/EAX.
       MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
                                    TII->get(TargetOpcode::COPY),
                                    is64Bit ? X86::RAX : X86::EAX)
                                    .addReg(TLSBaseAddrReg);
 
       // Erase the TLS_base_addr instruction.
       I->eraseFromParent();
 
       return Copy;
     }
 
     // Create a virtal register in *TLSBaseAddrReg, and populate it by
     // inserting a copy instruction after I. Returns the new instruction.
     MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
       MachineFunction *MF = I->getParent()->getParent();
       const X86TargetMachine *TM =
           static_cast<const X86TargetMachine *>(&MF->getTarget());
       const bool is64Bit = TM->getSubtarget<X86Subtarget>().is64Bit();
       const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo();
 
       // Create a virtual register for the TLS base address.
       MachineRegisterInfo &RegInfo = MF->getRegInfo();
       *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
                                                       ? &X86::GR64RegClass
                                                       : &X86::GR32RegClass);
 
       // Insert a copy from RAX/EAX to TLSBaseAddrReg.
       MachineInstr *Next = I->getNextNode();
       MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
                                    TII->get(TargetOpcode::COPY),
                                    *TLSBaseAddrReg)
                                    .addReg(is64Bit ? X86::RAX : X86::EAX);
 
       return Copy;
     }
 
     const char *getPassName() const override {
       return "Local Dynamic TLS Access Clean-up";
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       AU.addRequired<MachineDominatorTree>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
   };
 }
 
 // Storage for the ID that LDTLSCleanup's constructor hands to
 // MachineFunctionPass.
 char LDTLSCleanup::ID = 0;
 // Factory for the pass: returns a newly allocated LDTLSCleanup instance.
 FunctionPass*
 llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
index 5662e86932c2..4d15467f0ca3 100644
--- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h
@@ -1,468 +1,473 @@
 //===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file contains the X86 implementation of the TargetInstrInfo class.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H
 #define LLVM_LIB_TARGET_X86_X86INSTRINFO_H
 
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "X86RegisterInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Target/TargetInstrInfo.h"
 
 #define GET_INSTRINFO_HEADER
 #include "X86GenInstrInfo.inc"
 
 namespace llvm {
   class X86RegisterInfo;
   class X86Subtarget;
 
 namespace X86 {
   // X86 specific condition code. These correspond to X86_*_COND in
   // X86InstrInfo.td. They must be kept in sync.
   enum CondCode {
     COND_A  = 0,
     COND_AE = 1,
     COND_B  = 2,
     COND_BE = 3,
     COND_E  = 4,
     COND_G  = 5,
     COND_GE = 6,
     COND_L  = 7,
     COND_LE = 8,
     COND_NE = 9,
     COND_NO = 10,
     COND_NP = 11,
     COND_NS = 12,
     COND_O  = 13,
     COND_P  = 14,
     COND_S  = 15,
     // Alias for the highest-numbered real condition code above.
     LAST_VALID_COND = COND_S,
 
     // Artificial condition codes. These are used by AnalyzeBranch
     // to indicate a block terminated with two conditional branches to
     // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
     // which can't be represented on x86 with a single condition. These
     // are never used in MachineInstrs.
     COND_NE_OR_P,
     COND_NP_OR_E,
 
     COND_INVALID
   };
 
   /// \brief Turn condition code into conditional branch opcode.
   unsigned GetCondBranchFromCond(CondCode CC);
 
   /// \brief Return a set opcode for the given condition and whether it has
   /// a memory operand.
   unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
 
   /// \brief Return a cmov opcode for the given condition, register size in
   /// bytes, and operand type.
   unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
                            bool HasMemoryOperand = false);
 
   /// \brief Turn CMov opcode into condition code.
   CondCode getCondFromCMovOpc(unsigned Opc);
 
   /// GetOppositeBranchCondition - Return the inverse of the specified cond,
   /// e.g. turning COND_E to COND_NE.
   CondCode GetOppositeBranchCondition(CondCode CC);
 }  // end namespace X86;
 
 
 /// isGlobalStubReference - Tell whether the operand flag \p TargetFlag marks
 /// a reference to a stub for a global rather than to the global itself.
 inline static bool isGlobalStubReference(unsigned char TargetFlag) {
   return TargetFlag == X86II::MO_DLLIMPORT ||  // dllimport stub.
          TargetFlag == X86II::MO_GOTPCREL ||   // rip-relative GOT reference.
          TargetFlag == X86II::MO_GOT ||        // normal GOT reference.
          // Normal $non_lazy_ptr refs.
          TargetFlag == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
          TargetFlag == X86II::MO_DARWIN_NONLAZY ||
          // Hidden $non_lazy_ptr ref.
          TargetFlag == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
 }
 
 /// isGlobalRelativeToPICBase - Tell whether a global value reference carrying
 /// \p TargetFlag is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If
 /// so, the addressing mode has the PIC base register added in (e.g. EBX).
 inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
   return
       // isPICStyleGOT: local and other globals.
       TargetFlag == X86II::MO_GOTOFF ||
       TargetFlag == X86II::MO_GOT ||
       // Darwin local global.
       TargetFlag == X86II::MO_PIC_BASE_OFFSET ||
       // Darwin/32 external and hidden globals.
       TargetFlag == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
       TargetFlag == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE ||
       // ??? Pretty sure..
       TargetFlag == X86II::MO_TLVP;
 }
 
 /// Return true if \p MO is an immediate operand whose value is 1, 2, 4 or 8.
 inline static bool isScale(const MachineOperand &MO) {
   if (!MO.isImm())
     return false;
   switch (MO.getImm()) {
   case 1:
   case 2:
   case 4:
   case 8:
     return true;
   default:
     return false;
   }
 }
 
 inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
   if (MI->getOperand(Op).isFI()) return true;
   return Op+X86::AddrSegmentReg <= MI->getNumOperands() &&
     MI->getOperand(Op+X86::AddrBaseReg).isReg() &&
     isScale(MI->getOperand(Op+X86::AddrScaleAmt)) &&
     MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
     (MI->getOperand(Op+X86::AddrDisp).isImm() ||
      MI->getOperand(Op+X86::AddrDisp).isGlobal() ||
      MI->getOperand(Op+X86::AddrDisp).isCPI() ||
      MI->getOperand(Op+X86::AddrDisp).isJTI());
 }
 
 /// Return true if the operands of \p MI starting at index \p Op form a full
 /// memory reference: either operand \p Op is a frame index, or all
 /// X86::AddrNumOperands operands are present, the segment operand is a
 /// register, and the remaining operands satisfy isLeaMem.
 inline static bool isMem(const MachineInstr *MI, unsigned Op) {
   if (MI->getOperand(Op).isFI())
     return true;
   if (Op+X86::AddrNumOperands > MI->getNumOperands())
     return false;
   if (!MI->getOperand(Op+X86::AddrSegmentReg).isReg())
     return false;
   return isLeaMem(MI, Op);
 }
 
 /// X86InstrInfo - The X86 implementation of the TargetInstrInfo class, built
 /// on top of the X86GenInstrInfo base pulled in from X86GenInstrInfo.inc.
 class X86InstrInfo final : public X86GenInstrInfo {
   X86Subtarget &Subtarget;
   const X86RegisterInfo RI;
 
   /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
   /// RegOp2MemOpTable2, RegOp2MemOpTable3, RegOp2MemOpTable4 - Load / store
   /// folding opcode maps.
   ///
   typedef DenseMap<unsigned,
                    std::pair<unsigned, unsigned> > RegOp2MemOpTableType;
   RegOp2MemOpTableType RegOp2MemOpTable2Addr;
   RegOp2MemOpTableType RegOp2MemOpTable0;
   RegOp2MemOpTableType RegOp2MemOpTable1;
   RegOp2MemOpTableType RegOp2MemOpTable2;
   RegOp2MemOpTableType RegOp2MemOpTable3;
   RegOp2MemOpTableType RegOp2MemOpTable4;
 
   /// MemOp2RegOpTable - Load / store unfolding opcode map.
   ///
   typedef DenseMap<unsigned,
                    std::pair<unsigned, unsigned> > MemOp2RegOpTableType;
   MemOp2RegOpTableType MemOp2RegOpTable;
 
   static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
                             MemOp2RegOpTableType &M2RTable,
                             unsigned RegOp, unsigned MemOp, unsigned Flags);
 
   virtual void anchor();
 
 public:
   explicit X86InstrInfo(X86Subtarget &STI);
 
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
   ///
   const X86RegisterInfo &getRegisterInfo() const { return RI; }
 
+  /// getSPAdjust - This returns the stack pointer adjustment made by
+  /// this instruction. For x86, we need to handle more complex call
+  /// sequences involving PUSHes.
+  int getSPAdjust(const MachineInstr *MI) const override;
+
   /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
   /// extension instruction. That is, it's like a copy where it's legal for the
   /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
   /// true, then it's expected the pre-extension value is available as a subreg
   /// of the result register. This also returns the sub-register index in
   /// SubIdx.
   bool isCoalescableExtInstr(const MachineInstr &MI,
                              unsigned &SrcReg, unsigned &DstReg,
                              unsigned &SubIdx) const override;
 
   unsigned isLoadFromStackSlot(const MachineInstr *MI,
                                int &FrameIndex) const override;
   /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well.  This uses a heuristic so it isn't
   /// reliable for correctness.
   unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
                                      int &FrameIndex) const override;
 
   unsigned isStoreToStackSlot(const MachineInstr *MI,
                               int &FrameIndex) const override;
   /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well.  This uses a heuristic so it isn't
   /// reliable for correctness.
   unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
                                     int &FrameIndex) const override;
 
   bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                          AliasAnalysis *AA) const override;
   void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                      unsigned DestReg, unsigned SubIdx,
                      const MachineInstr *Orig,
                      const TargetRegisterInfo &TRI) const override;
 
   /// Given an operand within a MachineInstr, insert preceding code to put it
   /// into the right format for a particular kind of LEA instruction. This may
   /// involve using an appropriate super-register instead (with an implicit use
   /// of the original) or creating a new virtual register and inserting COPY
   /// instructions to get the data into the right class.
   ///
   /// Reference parameters are set to indicate how caller should add this
   /// operand to the LEA instruction.
   bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
                       unsigned LEAOpcode, bool AllowSP,
                       unsigned &NewSrc, bool &isKill,
                       bool &isUndef, MachineOperand &ImplicitOp) const;
 
   /// convertToThreeAddress - This method must be implemented by targets that
   /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
   /// may be able to convert a two-address instruction into a true
   /// three-address instruction on demand.  This allows the X86 target (for
   /// example) to convert ADD and SHL instructions into LEA instructions if they
   /// would require register copies due to two-addressness.
   ///
   /// This method returns a null pointer if the transformation cannot be
   /// performed, otherwise it returns the new instruction.
   ///
   MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
                                       MachineBasicBlock::iterator &MBBI,
                                       LiveVariables *LV) const override;
 
   /// commuteInstruction - We have a few instructions that must be hacked on to
   /// commute them.
   ///
   MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;
 
   bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                              unsigned &SrcOpIdx2) const override;
 
   // Branch analysis.
   bool isUnpredicatedTerminator(const MachineInstr* MI) const override;
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
                      SmallVectorImpl<MachineOperand> &Cond,
                      bool AllowModify) const override;
   unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
   unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                         MachineBasicBlock *FBB,
                         const SmallVectorImpl<MachineOperand> &Cond,
                         DebugLoc DL) const override;
   bool canInsertSelect(const MachineBasicBlock&,
                        const SmallVectorImpl<MachineOperand> &Cond,
                        unsigned, unsigned, int&, int&, int&) const override;
   void insertSelect(MachineBasicBlock &MBB,
                     MachineBasicBlock::iterator MI, DebugLoc DL,
                     unsigned DstReg,
                     const SmallVectorImpl<MachineOperand> &Cond,
                     unsigned TrueReg, unsigned FalseReg) const override;
   void copyPhysReg(MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator MI, DebugLoc DL,
                    unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const override;
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
                            unsigned SrcReg, bool isKill, int FrameIndex,
                            const TargetRegisterClass *RC,
                            const TargetRegisterInfo *TRI) const override;
 
   void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
                       SmallVectorImpl<MachineOperand> &Addr,
                       const TargetRegisterClass *RC,
                       MachineInstr::mmo_iterator MMOBegin,
                       MachineInstr::mmo_iterator MMOEnd,
                       SmallVectorImpl<MachineInstr*> &NewMIs) const;
 
   void loadRegFromStackSlot(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI,
                             unsigned DestReg, int FrameIndex,
                             const TargetRegisterClass *RC,
                             const TargetRegisterInfo *TRI) const override;
 
   void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
                        SmallVectorImpl<MachineOperand> &Addr,
                        const TargetRegisterClass *RC,
                        MachineInstr::mmo_iterator MMOBegin,
                        MachineInstr::mmo_iterator MMOEnd,
                        SmallVectorImpl<MachineInstr*> &NewMIs) const;
 
   bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
 
   /// foldMemoryOperand - If this target supports it, fold a load or store of
   /// the specified stack slot into the specified machine instruction for the
   /// specified operand(s).  If this is possible, the target should perform the
   /// folding and return true, otherwise it should return false.  If it folds
   /// the instruction, it is likely that the MachineInstruction the iterator
   /// references has been changed.
   MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                       MachineInstr* MI,
                                       const SmallVectorImpl<unsigned> &Ops,
                                       int FrameIndex) const override;
 
   /// foldMemoryOperand - Same as the previous version except it allows folding
   /// of any load and store from / to any address, not just from a specific
   /// stack slot.
   MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                       MachineInstr* MI,
                                       const SmallVectorImpl<unsigned> &Ops,
                                       MachineInstr* LoadMI) const override;
 
   /// canFoldMemoryOperand - Returns true if folding the specified load /
   /// store is possible.
   bool canFoldMemoryOperand(const MachineInstr*,
                             const SmallVectorImpl<unsigned> &) const override;
 
   /// unfoldMemoryOperand - Separate a single instruction which folded a load or
   /// a store or a load and a store into two or more instruction. If this is
   /// possible, returns true as well as the new instructions by reference.
   bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
                          unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
                          SmallVectorImpl<MachineInstr*> &NewMIs) const override;
 
   bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
                            SmallVectorImpl<SDNode*> &NewNodes) const override;
 
   /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
   /// instruction after load / store are unfolded from an instruction of the
   /// specified opcode. It returns zero if the specified unfolding is not
   /// possible. If LoadRegIndex is non-null, it is filled in with the operand
   /// index of the operand which will hold the register holding the loaded
   /// value.
   unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
                               bool UnfoldLoad, bool UnfoldStore,
                               unsigned *LoadRegIndex = nullptr) const override;
 
   /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
   /// to determine if two loads are loading from the same base address. It
   /// should only return true if the base pointers are the same and the
   /// only differences between the two addresses are the offset. It also returns
   /// the offsets by reference.
   bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
                                int64_t &Offset2) const override;
 
   /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
   /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
   /// be scheduled together. On some targets if two loads are loading from
   /// addresses in the same cache line, it's better if they are scheduled
   /// together. This function takes two integers that represent the load offsets
   /// from the common base address. It returns true if it decides it's desirable
   /// to schedule the two loads together. "NumLoads" is the number of loads that
   /// have already been scheduled after Load1.
   bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                int64_t Offset1, int64_t Offset2,
                                unsigned NumLoads) const override;
 
   bool shouldScheduleAdjacent(MachineInstr* First,
                               MachineInstr *Second) const override;
 
   /// getNoopForMachoTarget - Fill NopInst with the noop instruction (its
   /// opcode is set to X86::NOOP).
   void getNoopForMachoTarget(MCInst &NopInst) const override;
 
   bool
   ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
 
   /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
   /// instruction that defines the specified register class.
   bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
 
   /// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction
   /// that would clobber the EFLAGS condition register. Note the result may be
   /// conservative. If it cannot definitely determine the safety after visiting
   /// a few instructions in each direction it assumes it's not safe.
   bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I) const;
 
   static bool isX86_64ExtendedReg(const MachineOperand &MO) {
     if (!MO.isReg()) return false;
     return X86II::isX86_64ExtendedReg(MO.getReg());
   }
 
   /// getGlobalBaseReg - Return a virtual register initialized with the
   /// global base register value. Output instructions required to
   /// initialize the register in the function entry block, if necessary.
   ///
   unsigned getGlobalBaseReg(MachineFunction *MF) const;
 
   /// getExecutionDomain - Return the SSE execution domain of MI (taken from
   /// its TSFlags) paired with a bitmask of the domains it may be switched to;
   /// the mask is 0 when the opcode has no known equivalents.
   std::pair<uint16_t, uint16_t>
   getExecutionDomain(const MachineInstr *MI) const override;
 
   /// setExecutionDomain - Replace MI's opcode with its equivalent in Domain
   /// (1..3). Asserts if Domain is out of range or no equivalent is known.
   void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override;
 
   unsigned
     getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
                                  const TargetRegisterInfo *TRI) const override;
   unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
                                 const TargetRegisterInfo *TRI) const override;
   void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
                                  const TargetRegisterInfo *TRI) const override;
 
   MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                       MachineInstr* MI,
                                       unsigned OpNum,
                                       const SmallVectorImpl<MachineOperand> &MOs,
                                       unsigned Size, unsigned Alignment,
                                       bool AllowCommute) const;
 
   /// getUnconditionalBranch - Fill Branch with an unconditional jump whose
   /// operand is an expression referring to BranchTarget.
   void
   getUnconditionalBranch(MCInst &Branch,
                          const MCSymbolRefExpr *BranchTarget) const override;
 
   /// getTrap - Set the opcode of MI to X86::TRAP.
   void getTrap(MCInst &MI) const override;
 
   /// getJumpInstrTableEntryBound - Return an upper bound, in bytes, on the
   /// encodings of the instructions produced by getUnconditionalBranch and
   /// getTrap.
   unsigned getJumpInstrTableEntryBound() const override;
 
   /// isHighLatencyDef - Return true if opc is in the fixed list of
   /// high-latency opcodes kept in the definition of this method.
   bool isHighLatencyDef(int opc) const override;
 
   /// hasHighOperandLatency - Return isHighLatencyDef for DefMI's opcode; the
   /// remaining arguments are not consulted by the X86 implementation.
   bool hasHighOperandLatency(const InstrItineraryData *ItinData,
                              const MachineRegisterInfo *MRI,
                              const MachineInstr *DefMI, unsigned DefIdx,
                              const MachineInstr *UseMI,
                              unsigned UseIdx) const override;
 
   /// analyzeCompare - For a comparison instruction, return the source registers
   /// in SrcReg and SrcReg2 if having two register operands, and the value it
   /// compares against in CmpValue. Return true if the comparison instruction
   /// can be analyzed.
   bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
                       unsigned &SrcReg2, int &CmpMask,
                       int &CmpValue) const override;
 
   /// optimizeCompareInstr - Check if there exists an earlier instruction that
   /// operates on the same source operands and sets flags in the same way as
   /// Compare; remove Compare if possible.
   bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
                             unsigned SrcReg2, int CmpMask, int CmpValue,
                             const MachineRegisterInfo *MRI) const override;
 
   /// optimizeLoadInstr - Try to remove the load by folding it to a register
   /// operand at the use. We fold the load instructions if and only if the
   /// def and use are in the same BB. We only look at one load and see
   /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
   /// defined by the load we are trying to fold. DefMI returns the machine
   /// instruction that defines FoldAsLoadDefReg, and the function returns
   /// the machine instruction generated due to folding.
   MachineInstr* optimizeLoadInstr(MachineInstr *MI,
                                   const MachineRegisterInfo *MRI,
                                   unsigned &FoldAsLoadDefReg,
                                   MachineInstr *&DefMI) const override;
 
 private:
   MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
                                               MachineFunction::iterator &MFI,
                                               MachineBasicBlock::iterator &MBBI,
                                               LiveVariables *LV) const;
 
   /// isFrameOperand - Return true and the FrameIndex if the specified
   /// operand and follow operands form a reference to the stack frame.
   bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
                       int &FrameIndex) const;
 };
 
 } // End llvm namespace
 
 #endif
diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index b23a744da686..9fd03a7059cf 100644
--- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -1,168 +1,176 @@
 //===-- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file declares X86-specific per-machine-function information.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
 #define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
 
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include <vector>
 
 namespace llvm {
 
 /// X86MachineFunctionInfo - This class is derived from MachineFunctionInfo and
 /// contains private X86 target-specific information for each MachineFunction.
 /// Both constructors initialize every scalar field to zero/false; the fields
 /// are read and written through the trivial accessors declared below.
 class X86MachineFunctionInfo : public MachineFunctionInfo {
   // Declared here only; the definition is not part of this header.
   virtual void anchor();
 
   /// ForceFramePointer - True if the function is required to use a frame
   /// pointer for reasons other than it containing dynamic allocation or
   /// that FP elimination is turned off. For example, Cygwin main function
   /// contains stack pointer re-alignment code which requires FP.
   bool ForceFramePointer;
 
   /// RestoreBasePointerOffset - Non-zero if the function has base pointer
   /// and makes call to llvm.eh.sjlj.setjmp. When non-zero, the value is a
   /// displacement from the frame pointer to a slot where the base pointer
   /// is stashed.
   /// NOTE(review): stored as a signed char, so only small displacements fit;
   /// confirm setRestoreBasePointer() keeps the value in range.
   signed char RestoreBasePointerOffset;
 
   /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
   /// stack frame in bytes.
   unsigned CalleeSavedFrameSize;
 
   /// BytesToPopOnReturn - Number of bytes function pops on return (in addition
   /// to the space used by the return address).
   /// Used on windows platform for stdcall & fastcall name decoration
   unsigned BytesToPopOnReturn;
 
   /// ReturnAddrIndex - FrameIndex for return slot.
   int ReturnAddrIndex;
 
   /// TailCallReturnAddrDelta - The number of bytes by which return address
   /// stack slot is moved as the result of tail call optimization.
   int TailCallReturnAddrDelta;
 
   /// SRetReturnReg - Some subtargets require that sret lowering includes
   /// returning the value of the returned struct in a register. This field
   /// holds the virtual register into which the sret argument is passed.
   unsigned SRetReturnReg;
 
   /// GlobalBaseReg - keeps track of the virtual register initialized for
   /// use as the global base register. This is used for PIC in some PIC
   /// relocation models.
   unsigned GlobalBaseReg;
 
   /// VarArgsFrameIndex - FrameIndex for start of varargs area.
   int VarArgsFrameIndex;
   /// RegSaveFrameIndex - X86-64 vararg func register save area.
   int RegSaveFrameIndex;
   /// VarArgsGPOffset - X86-64 vararg func int reg offset.
   unsigned VarArgsGPOffset;
   /// VarArgsFPOffset - X86-64 vararg func fp reg offset.
   unsigned VarArgsFPOffset;
   /// ArgumentStackSize - The number of bytes on stack consumed by the arguments
   /// being passed on the stack.
   unsigned ArgumentStackSize;
   /// NumLocalDynamics - Number of local-dynamic TLS accesses.
   unsigned NumLocalDynamics;
+  /// HasPushSequences - Keeps track of whether this function uses sequences
+  /// of pushes to pass function parameters.
+  bool HasPushSequences;
 
 private:
   /// ForwardedMustTailRegParms - A list of virtual and physical registers
   /// that must be forwarded to every musttail call.
   SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
 
 public:
   /// Default constructor: every scalar field starts at 0 / false.
   X86MachineFunctionInfo() : ForceFramePointer(false),
                              RestoreBasePointerOffset(0),
                              CalleeSavedFrameSize(0),
                              BytesToPopOnReturn(0),
                              ReturnAddrIndex(0),
                              TailCallReturnAddrDelta(0),
                              SRetReturnReg(0),
                              GlobalBaseReg(0),
                              VarArgsFrameIndex(0),
                              RegSaveFrameIndex(0),
                              VarArgsGPOffset(0),
                              VarArgsFPOffset(0),
                              ArgumentStackSize(0),
-                             NumLocalDynamics(0) {}
+                             NumLocalDynamics(0),
+                             HasPushSequences(false) {}
 
   /// MF is not used here; the fields are initialized exactly as in the
   /// default constructor.
   explicit X86MachineFunctionInfo(MachineFunction &MF)
     : ForceFramePointer(false),
       RestoreBasePointerOffset(0),
       CalleeSavedFrameSize(0),
       BytesToPopOnReturn(0),
       ReturnAddrIndex(0),
       TailCallReturnAddrDelta(0),
       SRetReturnReg(0),
       GlobalBaseReg(0),
       VarArgsFrameIndex(0),
       RegSaveFrameIndex(0),
       VarArgsGPOffset(0),
       VarArgsFPOffset(0),
       ArgumentStackSize(0),
-      NumLocalDynamics(0) {}
+      NumLocalDynamics(0),
+      HasPushSequences(false) {}
 
   // Plain getter/setter pairs for the fields documented above.
   bool getForceFramePointer() const { return ForceFramePointer;}
   void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
 
+  bool getHasPushSequences() const { return HasPushSequences; }
+  void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; }
+
   // getRestoreBasePointer() reports whether a non-zero offset has been
   // recorded; setRestoreBasePointer() is only declared here.
   bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; }
   void setRestoreBasePointer(const MachineFunction *MF);
   int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
 
   unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
   void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
 
   unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
   void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
 
   int getRAIndex() const { return ReturnAddrIndex; }
   void setRAIndex(int Index) { ReturnAddrIndex = Index; }
 
   int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
   void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
 
   unsigned getSRetReturnReg() const { return SRetReturnReg; }
   void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
 
   unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
   void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
 
   int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
   void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
 
   int getRegSaveFrameIndex() const { return RegSaveFrameIndex; }
   void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; }
 
   unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; }
   void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; }
 
   unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; }
   void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; }
 
   unsigned getArgumentStackSize() const { return ArgumentStackSize; }
   void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; }
 
   // The counter can only be read or bumped by one; there is no setter.
   unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
   void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
 
   /// Returns a mutable reference to the forwarded-register list, so callers
   /// can both read and append to it.
   SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() {
     return ForwardedMustTailRegParms;
   }
 };
 
 } // End llvm namespace
 
 #endif
diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 09e651cebfb9..0fa38f453706 100644
--- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -1,730 +1,731 @@
 //===-- X86RegisterInfo.cpp - X86 Register Information --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file contains the X86 implementation of the TargetRegisterInfo class.
 // This file is responsible for the frame pointer elimination optimization
 // on X86.
 //
 //===----------------------------------------------------------------------===//
 
 #include "X86RegisterInfo.h"
 #include "X86InstrBuilder.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86Subtarget.h"
 #include "X86TargetMachine.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
 using namespace llvm;
 
 #define GET_REGINFO_TARGET_DESC
 #include "X86GenRegisterInfo.inc"
 
 // Hidden boolean option "force-align-stack", off by default.  It is declared
 // without 'static', so it has external linkage; in this file it is read by
 // needsStackRealignment().
 cl::opt<bool>
 ForceStackAlign("force-align-stack",
                  cl::desc("Force align the stack to the minimum alignment"
                            " needed for the function."),
                  cl::init(false), cl::Hidden);
 
 // Hidden boolean option "x86-use-base-pointer", on by default and local to
 // this file.  hasBasePointer() returns false whenever it is turned off.
 static cl::opt<bool>
 EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
           cl::desc("Enable use of a base pointer for complex stack frames"));
 
 /// Build the register info for subtarget \p STI.  The generated base class
 /// is handed the instruction pointer (RIP or EIP) as both its first and last
 /// argument together with the two DWARF register flavours; the body then
 /// caches the mode flags and selects the slot size and the stack, frame and
 /// base pointer registers.
 X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI)
     : X86GenRegisterInfo(
           (STI.is64Bit() ? X86::RIP : X86::EIP),
           X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), false),
           X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), true),
           (STI.is64Bit() ? X86::RIP : X86::EIP)),
       Subtarget(STI) {
   X86_MC::InitLLVM2SEHRegisterMapping(this);
 
   // Remember the mode bits consulted by the rest of this class.
   Is64Bit = Subtarget.is64Bit();
   IsWin64 = Subtarget.isTargetWin64();
 
   if (!Is64Bit) {
     // 32-bit mode: 4-byte slots addressed through ESP/EBP.
     SlotSize = 4;
     StackPtr = X86::ESP;
     FramePtr = X86::EBP;
   } else {
     SlotSize = 8;
     // LP64 and NaCl64 address the stack through RSP/RBP; every other 64-bit
     // configuration keeps the 32-bit ESP/EBP.
     const bool UseWideRegs =
         Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
     if (UseWideRegs) {
       StackPtr = X86::RSP;
       FramePtr = X86::RBP;
     } else {
       StackPtr = X86::ESP;
       FramePtr = X86::EBP;
     }
   }
 
   // The base pointer is a callee-saved register that must not clash with any
   // ABI requirement; e.g. 32-bit PIC needs the GOT pointer in EBX before
   // calls through the PLT, so ESI is used there instead.
   if (Is64Bit)
     BasePtr = X86::RBX;
   else
     BasePtr = X86::ESI;
 }
 
 /// Always request liveness tracking after register allocation; \p MF is not
 /// consulted.
 bool
 X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
   // Liveness is needed by ExeDepsFixer and PostRAScheduler.
   const bool KeepLiveness = true;
   return KeepLiveness;
 }
 
 /// Return the SEH register number for register \p i, which is simply its
 /// encoding value.
 int
 X86RegisterInfo::getSEHRegNum(unsigned i) const {
   const int SEHRegNum = getEncodingValue(i);
   return SEHRegNum;
 }
 
 /// Forward to the TableGen-generated lookup, after substituting sub_8bit_hi
 /// for sub_8bit when not in 64-bit mode.
 const TargetRegisterClass *
 X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
                                        unsigned Idx) const {
   // In 32-bit mode sub_8bit is more constrained and behaves just like
   // sub_8bit_hi, so query with that index instead.
   unsigned EffectiveIdx = Idx;
   if (Idx == X86::sub_8bit && !Is64Bit)
     EffectiveIdx = X86::sub_8bit_hi;
 
   return X86GenRegisterInfo::getSubClassWithSubReg(RC, EffectiveIdx);
 }
 
 /// Forward to the TableGen-generated lookup.  Outside 64-bit mode a sub_8bit
 /// query first narrows \p A to its sub-class supporting sub_8bit_hi and
 /// returns nullptr when no such sub-class exists.
 const TargetRegisterClass *
 X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
                                           const TargetRegisterClass *B,
                                           unsigned SubIdx) const {
   const TargetRegisterClass *SuperCandidate = A;
 
   // The sub_8bit sub-register index is more constrained in 32-bit mode.
   const bool Constrained8Bit = SubIdx == X86::sub_8bit && !Is64Bit;
   if (Constrained8Bit) {
     SuperCandidate =
         X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
     if (SuperCandidate == nullptr)
       return nullptr;
   }
 
   return X86GenRegisterInfo::getMatchingSuperRegClass(SuperCandidate, B,
                                                       SubIdx);
 }
 
 /// Return the first class among \p RC and its super-classes that is one of
 /// the listed general-purpose, scalar-FP, x87 or vector classes and has the
 /// same spill size as \p RC.  Returns \p RC itself when none qualifies or
 /// when \p RC is GR8_NOREX.
 const TargetRegisterClass*
 X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
   // GR8_NOREX is only used after extracting sub_8bit_hi sub-registers.  The
   // H sub-registers cannot be copied to the full GR8 class in 64-bit mode,
   // so this class must never be inflated.  (It is always used in a way that
   // won't be constrained to a sub-class, so sub-classes such as GR8_ABCD_L
   // may still expand to the full GR8 class.)
   if (RC == &X86::GR8_NOREXRegClass)
     return RC;
 
   // Visit RC first, then each entry of its super-class list, stopping when a
   // null entry is read.
   TargetRegisterClass::sc_iterator NextSuper = RC->getSuperClasses();
   for (const TargetRegisterClass *Candidate = RC; Candidate;
        Candidate = *NextSuper++) {
     switch (Candidate->getID()) {
     default:
       break;
     case X86::GR8RegClassID:
     case X86::GR16RegClassID:
     case X86::GR32RegClassID:
     case X86::GR64RegClassID:
     case X86::FR32RegClassID:
     case X86::FR64RegClassID:
     case X86::RFP32RegClassID:
     case X86::RFP64RegClassID:
     case X86::RFP80RegClassID:
     case X86::VR128RegClassID:
     case X86::VR256RegClassID:
       // Never pick a super-class that would shrink the spill size, which
       // can happen with the vector and float classes.
       if (Candidate->getSize() == RC->getSize())
         return Candidate;
       break;
     }
   }
   return RC;
 }
 
 /// Return the register class used for pointers of the given \p Kind:
 ///   0 - normal GPRs,
 ///   1 - normal GPRs except the stack pointer (for encoding reasons),
 ///   2 - GPRs available for tail calls (not callee-saved).
 /// Any other \p Kind is unreachable.
 const TargetRegisterClass *
 X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
                                     unsigned Kind) const {
   if (Kind == 0) {
     if (Subtarget.isTarget64BitLP64())
       return &X86::GR64RegClass;
     return &X86::GR32RegClass;
   }
 
   if (Kind == 1) {
     if (Subtarget.isTarget64BitLP64())
       return &X86::GR64_NOSPRegClass;
     return &X86::GR32_NOSPRegClass;
   }
 
   if (Kind != 2)
     llvm_unreachable("Unexpected Kind in getPointerRegClass!");
 
   // Kind 2: tail-call registers.  The 64-bit variants are checked first.
   if (Subtarget.isTargetWin64())
     return &X86::GR64_TCW64RegClass;
   if (Subtarget.is64Bit())
     return &X86::GR64_TCRegClass;
 
   // 32-bit: functions using the HiPE calling convention get the full GR32
   // class, everything else the tail-call subset.
   const Function *Fn = MF.getFunction();
   if (Fn && Fn->getCallingConv() == CallingConv::HiPE)
     return &X86::GR32RegClass;
   return &X86::GR32_TCRegClass;
 }
 
 /// Return the class to copy values of \p RC through.  Only the CCR class is
 /// remapped (to GR64 in 64-bit mode, GR32 otherwise); every other class is
 /// returned unchanged.
 const TargetRegisterClass *
 X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
   if (RC != &X86::CCRRegClass)
     return RC;
 
   if (!Is64Bit)
     return &X86::GR32RegClass;
   return &X86::GR64RegClass;
 }
 
 unsigned
 X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                      MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0;
   switch (RC->getID()) {
   default:
     return 0;
   case X86::GR32RegClassID:
     return 4 - FPDiff;
   case X86::GR64RegClassID:
     return 12 - FPDiff;
   case X86::VR128RegClassID:
     return Subtarget.is64Bit() ? 10 : 4;
   case X86::VR64RegClassID:
     return 4;
   }
 }
 
 /// Return the callee-saved register list for \p MF.  The list is selected
 /// from the function's calling convention, the subtarget's AVX / AVX-512
 /// support, the cached 64-bit and Win64 flags and, for the default lists,
 /// whether the function calls eh.return.  \p MF must not be null.
 const MCPhysReg *
 X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   bool HasAVX = Subtarget.hasAVX();
   bool HasAVX512 = Subtarget.hasAVX512();
 
   assert(MF && "MachineFunction required");
   // Conventions with a dedicated list return directly; any path that hits a
   // 'break' falls through to the default lists below the switch.
   switch (MF->getFunction()->getCallingConv()) {
   case CallingConv::GHC:
   case CallingConv::HiPE:
     return CSR_NoRegs_SaveList;
   case CallingConv::AnyReg:
     if (HasAVX)
       return CSR_64_AllRegs_AVX_SaveList;
     return CSR_64_AllRegs_SaveList;
   case CallingConv::PreserveMost:
     return CSR_64_RT_MostRegs_SaveList;
   case CallingConv::PreserveAll:
     if (HasAVX)
       return CSR_64_RT_AllRegs_AVX_SaveList;
     return CSR_64_RT_AllRegs_SaveList;
   case CallingConv::Intel_OCL_BI: {
     // Checked from most to least specific; the order of these tests matters.
     if (HasAVX512 && IsWin64)
       return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
     if (HasAVX512 && Is64Bit)
       return CSR_64_Intel_OCL_BI_AVX512_SaveList;
     if (HasAVX && IsWin64)
       return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
     if (HasAVX && Is64Bit)
       return CSR_64_Intel_OCL_BI_AVX_SaveList;
     if (!HasAVX && !IsWin64 && Is64Bit)
       return CSR_64_Intel_OCL_BI_SaveList;
     break;
   }
   case CallingConv::Cold:
     // Only 64-bit mode has a dedicated list for the cold convention.
     if (Is64Bit)
       return CSR_64_MostRegs_SaveList;
     break;
   default:
     break;
   }
 
   // Default lists: Win64 takes precedence over the eh.return variant.
   bool CallsEHReturn = MF->getMMI().callsEHReturn();
   if (Is64Bit) {
     if (IsWin64)
       return CSR_Win64_SaveList;
     if (CallsEHReturn)
       return CSR_64EHRet_SaveList;
     return CSR_64_SaveList;
   }
   if (CallsEHReturn)
     return CSR_32EHRet_SaveList;
   return CSR_32_SaveList;
 }
 
 /// Return the call-preserved register mask for calling convention \p CC.
 /// The selection mirrors getCalleeSavedRegs(), except that no eh.return
 /// variants are considered because no MachineFunction is available here.
 const uint32_t*
 X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
   bool HasAVX = Subtarget.hasAVX();
   bool HasAVX512 = Subtarget.hasAVX512();
 
   // Conventions with a dedicated mask return directly; any path that hits a
   // 'break' falls through to the default masks below the switch.
   switch (CC) {
   case CallingConv::GHC:
   case CallingConv::HiPE:
     return CSR_NoRegs_RegMask;
   case CallingConv::AnyReg:
     if (HasAVX)
       return CSR_64_AllRegs_AVX_RegMask;
     return CSR_64_AllRegs_RegMask;
   case CallingConv::PreserveMost:
     return CSR_64_RT_MostRegs_RegMask;
   case CallingConv::PreserveAll:
     if (HasAVX)
       return CSR_64_RT_AllRegs_AVX_RegMask;
     return CSR_64_RT_AllRegs_RegMask;
   case CallingConv::Intel_OCL_BI: {
     // Checked from most to least specific; the order of these tests matters.
     if (HasAVX512 && IsWin64)
       return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
     if (HasAVX512 && Is64Bit)
       return CSR_64_Intel_OCL_BI_AVX512_RegMask;
     if (HasAVX && IsWin64)
       return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
     if (HasAVX && Is64Bit)
       return CSR_64_Intel_OCL_BI_AVX_RegMask;
     if (!HasAVX && !IsWin64 && Is64Bit)
       return CSR_64_Intel_OCL_BI_RegMask;
     break;
   }
   case CallingConv::Cold:
     // Only 64-bit mode has a dedicated mask for the cold convention.
     if (Is64Bit)
       return CSR_64_MostRegs_RegMask;
     break;
   default:
     break;
   }
 
   // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
   // callsEHReturn().
   if (Is64Bit) {
     if (IsWin64)
       return CSR_Win64_RegMask;
     return CSR_64_RegMask;
   }
   return CSR_32_RegMask;
 }
 
 /// Return the register mask associated with the "no registers" CSR set.
 const uint32_t*
 X86RegisterInfo::getNoPreservedMask() const {
   const uint32_t *Mask = CSR_NoRegs_RegMask;
   return Mask;
 }
 
 /// Compute the set of registers that are reserved in \p MF.
 ///
 /// Always reserved: the stack pointer and instruction pointer (with their
 /// sub-registers), the six segment registers and ST0-ST7.  Reserved when
 /// applicable: the frame pointer (function has an FP), the base pointer
 /// (function needs one), and the registers that are unavailable outside
 /// 64-bit mode or without AVX-512.
 ///
 /// Reports a fatal error if a base pointer is needed but the function's
 /// calling convention does not preserve the base register across calls.
 BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
 
   // Set the stack-pointer register and its aliases as reserved.
   for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
        ++I)
     Reserved.set(*I);
 
   // Set the instruction pointer register and its aliases as reserved.
   for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
        ++I)
     Reserved.set(*I);
 
   // Set the frame-pointer register and its aliases as reserved if needed.
   if (TFI->hasFP(MF)) {
     for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid();
          ++I)
       Reserved.set(*I);
   }
 
   // Set the base-pointer register and its aliases as reserved if needed.
   if (hasBasePointer(MF)) {
     CallingConv::ID CC = MF.getFunction()->getCallingConv();
     const uint32_t* RegMask = getCallPreservedMask(CC);
     // The two literals below are concatenated by the compiler, so the second
     // one must start with a space to keep "with this" as separate words.
     if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
       report_fatal_error(
         "Stack realignment in presence of dynamic allocas is not supported with"
         " this calling convention.");
 
     for (MCSubRegIterator I(getBaseRegister(), this, /*IncludeSelf=*/true);
          I.isValid(); ++I)
       Reserved.set(*I);
   }
 
   // Mark the segment registers as reserved.
   Reserved.set(X86::CS);
   Reserved.set(X86::SS);
   Reserved.set(X86::DS);
   Reserved.set(X86::ES);
   Reserved.set(X86::FS);
   Reserved.set(X86::GS);
 
   // Mark the floating point stack registers as reserved.
   for (unsigned n = 0; n != 8; ++n)
     Reserved.set(X86::ST0 + n);
 
   // Reserve the registers that only exist in 64-bit mode.
   if (!Is64Bit) {
     // These 8-bit registers are part of the x86-64 extension even though their
     // super-registers are old 32-bits.
     Reserved.set(X86::SIL);
     Reserved.set(X86::DIL);
     Reserved.set(X86::BPL);
     Reserved.set(X86::SPL);
 
     for (unsigned n = 0; n != 8; ++n) {
       // R8, R9, ...
       for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI)
         Reserved.set(*AI);
 
       // XMM8, XMM9, ...
       for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
         Reserved.set(*AI);
     }
   }
   // XMM16-XMM31 and everything aliasing them need both 64-bit mode and
   // AVX-512; reserve them otherwise.
   if (!Is64Bit || !Subtarget.hasAVX512()) {
     for (unsigned n = 16; n != 32; ++n) {
       for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
         Reserved.set(*AI);
     }
   }
 
   return Reserved;
 }
 
 //===----------------------------------------------------------------------===//
 // Stack Frame Processing methods
 //===----------------------------------------------------------------------===//
 
 bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
    const MachineFrameInfo *MFI = MF.getFrameInfo();
 
    if (!EnableBasePointer)
      return false;
 
    // When we need stack realignment, we can't address the stack from the frame
    // pointer.  When we have dynamic allocas or stack-adjusting inline asm, we
    // can't address variables from the stack pointer.  MS inline asm can
    // reference locals while also adjusting the stack pointer.  When we can't
    // use both the SP and the FP, we need a separate base pointer register.
    bool CantUseFP = needsStackRealignment(MF);
    bool CantUseSP =
        MFI->hasVarSizedObjects() || MFI->hasInlineAsmWithSPAdjust();
    return CantUseFP && CantUseSP;
 }
 
 bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
   if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
     return false;
 
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const MachineRegisterInfo *MRI = &MF.getRegInfo();
 
   // Stack realignment requires a frame pointer.  If we already started
   // register allocation with frame pointer elimination, it is too late now.
   if (!MRI->canReserveReg(FramePtr))
     return false;
 
   // If a base pointer is necessary.  Check that it isn't too late to reserve
   // it.
   if (MFI->hasVarSizedObjects())
     return MRI->canReserveReg(BasePtr);
   return true;
 }
 
 bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const Function *F = MF.getFunction();
   unsigned StackAlign =
     MF.getSubtarget().getFrameLowering()->getStackAlignment();
   bool requiresRealignment =
     ((MFI->getMaxAlignment() > StackAlign) ||
      F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                      Attribute::StackAlignment));
 
   // If we've requested that we force align the stack do so now.
   if (ForceStackAlign)
     return canRealignStack(MF);
 
   return requiresRealignment && canRealignStack(MF);
 }
 
 /// Not expected to be called on X86; reaching it is treated as unreachable.
 bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
                                            unsigned Reg, int &FrameIdx) const {
   // X86 defines assignCalleeSavedSpillSlots, which always returns true, so
   // this hook is neither used nor tested.
   llvm_unreachable("Unused function on X86. Otherwise need a test case.");
 }
 
 /// Rewrite the frame-index operand \p FIOperandNum of the instruction at
 /// \p II into a concrete base register plus displacement.
 ///
 /// The base register is the base pointer, frame pointer or stack pointer,
 /// chosen from the function's base-pointer / realignment / FP state and the
 /// sign of the frame index.  \p SPAdj is added to the displacement when the
 /// reference is relative to the stack pointer.  \p RS is not used.
 void
 X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                      int SPAdj, unsigned FIOperandNum,
                                      RegScavenger *RS) const {
-  assert(SPAdj == 0 && "Unexpected");
-
   MachineInstr &MI = *II;
   MachineFunction &MF = *MI.getParent()->getParent();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
   unsigned BasePtr;
 
   unsigned Opc = MI.getOpcode();
   bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm;
   if (hasBasePointer(MF))
     BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister());
   else if (needsStackRealignment(MF))
     BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
   else if (AfterFPPop)
     BasePtr = StackPtr;
   else
     BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr);
 
   // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit
   // register as source operand, semantic is the same and destination is
   // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided.
   if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr))
     BasePtr = getX86SubSuperRegister(BasePtr, MVT::i64, false);
 
   // This must be part of a four operand memory reference.  Replace the
   // FrameIndex with base register with EBP.  Add an offset to the offset.
   MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
 
   // Now add the frame object offset to the offset from EBP.
   int FIOffset;
   if (AfterFPPop) {
     // Tail call jmp happens after FP is popped.
     const MachineFrameInfo *MFI = MF.getFrameInfo();
     FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea();
   } else
     FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
 
+  // SPAdj only applies to SP-relative references.  Compare at 64-bit width:
+  // the LEA64_32r case above may already have widened BasePtr (ESP -> RSP
+  // when StackPtr is ESP), and a plain BasePtr == StackPtr test would then
+  // miss the adjustment.
+  if (getX86SubSuperRegister(BasePtr, MVT::i64, false) ==
+      getX86SubSuperRegister(StackPtr, MVT::i64, false))
+    FIOffset += SPAdj;
+
   // The frame index format for stackmaps and patchpoints is different from the
   // X86 format. It only has a FI and an offset.
   if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
     assert(BasePtr == FramePtr && "Expected the FP as base register");
     int64_t Offset = MI.getOperand(FIOperandNum + 1).getImm() + FIOffset;
     MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
     return;
   }
 
   // Fold FIOffset into the displacement operand, which is either a plain
   // immediate or a symbolic operand carrying an offset.
   if (MI.getOperand(FIOperandNum+3).isImm()) {
     // Offset is a 32-bit integer.
     int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
     int Offset = FIOffset + Imm;
     assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
            "Requesting 64-bit offset in 32-bit immediate!");
     MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
   } else {
     // Offset is symbolic. This is extremely rare.
     uint64_t Offset = FIOffset +
       (uint64_t)MI.getOperand(FIOperandNum+3).getOffset();
     MI.getOperand(FIOperandNum + 3).setOffset(Offset);
   }
 }
 
 unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   return TFI->hasFP(MF) ? FramePtr : StackPtr;
 }
 
 /// Return the frame register of \p MF; on 64-bit ILP32 targets the 32-bit
 /// form of that register is returned instead.
 unsigned X86RegisterInfo::getPtrSizedFrameRegister(
     const MachineFunction &MF) const {
   const unsigned FrameReg = getFrameRegister(MF);
   if (!Subtarget.isTarget64BitILP32())
     return FrameReg;
   return getX86SubSuperRegister(FrameReg, MVT::i32, false);
 }
 
 namespace llvm {
 /// Map \p Reg to the register of the same family that has the width given
 /// by \p VT (i8, i16, i32 or i64).  Any member of a family (e.g. AL, AH, AX,
 /// EAX, RAX) may be passed in.  For i8, \p High selects the high-byte table
 /// instead of the low-byte one; for the other widths \p High is ignored.
 /// An unexpected \p VT, or a register missing from the selected table, is
 /// unreachable, except in the i8 high-byte table (see below).
 unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
                                 bool High) {
   switch (VT) {
   default: llvm_unreachable("Unexpected VT");
   case MVT::i8:
     if (High) {
       // High-byte table: the A/D/C/B families map to AH/DH/CH/BH, the
       // SI/DI/BP/SP families map to their 16-bit registers, and anything
       // else is forwarded to the i64 lookup.  That recursive call passes only
       // two arguments - presumably the declaration defaults High; verify
       // against the header.
       switch (Reg) {
       default: return getX86SubSuperRegister(Reg, MVT::i64);
       case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
         return X86::SI;
       case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
         return X86::DI;
       case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
         return X86::BP;
       case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
         return X86::SP;
       case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
         return X86::AH;
       case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
         return X86::DH;
       case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
         return X86::CH;
       case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
         return X86::BH;
       }
     } else {
       // Low-byte table.
       switch (Reg) {
       default: llvm_unreachable("Unexpected register");
       case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
         return X86::AL;
       case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
         return X86::DL;
       case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
         return X86::CL;
       case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
         return X86::BL;
       case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
         return X86::SIL;
       case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
         return X86::DIL;
       case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
         return X86::BPL;
       case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
         return X86::SPL;
       case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
         return X86::R8B;
       case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
         return X86::R9B;
       case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
         return X86::R10B;
       case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
         return X86::R11B;
       case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
         return X86::R12B;
       case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
         return X86::R13B;
       case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
         return X86::R14B;
       case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
         return X86::R15B;
       }
     }
   // Both i8 branches above return (or are unreachable), so control never
   // falls into the i16 case from here.
   case MVT::i16:
     switch (Reg) {
     default: llvm_unreachable("Unexpected register");
     case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
       return X86::AX;
     case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
       return X86::DX;
     case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
       return X86::CX;
     case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
       return X86::BX;
     case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
       return X86::SI;
     case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
       return X86::DI;
     case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
       return X86::BP;
     case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
       return X86::SP;
     case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
       return X86::R8W;
     case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
       return X86::R9W;
     case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
       return X86::R10W;
     case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
       return X86::R11W;
     case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
       return X86::R12W;
     case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
       return X86::R13W;
     case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
       return X86::R14W;
     case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
       return X86::R15W;
     }
   case MVT::i32:
     switch (Reg) {
     default: llvm_unreachable("Unexpected register");
     case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
       return X86::EAX;
     case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
       return X86::EDX;
     case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
       return X86::ECX;
     case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
       return X86::EBX;
     case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
       return X86::ESI;
     case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
       return X86::EDI;
     case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
       return X86::EBP;
     case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
       return X86::ESP;
     case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
       return X86::R8D;
     case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
       return X86::R9D;
     case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
       return X86::R10D;
     case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
       return X86::R11D;
     case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
       return X86::R12D;
     case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
       return X86::R13D;
     case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
       return X86::R14D;
     case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
       return X86::R15D;
     }
   case MVT::i64:
     switch (Reg) {
     default: llvm_unreachable("Unexpected register");
     case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
       return X86::RAX;
     case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
       return X86::RDX;
     case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
       return X86::RCX;
     case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
       return X86::RBX;
     case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
       return X86::RSI;
     case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
       return X86::RDI;
     case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
       return X86::RBP;
     case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
       return X86::RSP;
     case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
       return X86::R8;
     case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
       return X86::R9;
     case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
       return X86::R10;
     case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
       return X86::R11;
     case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
       return X86::R12;
     case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
       return X86::R13;
     case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
       return X86::R14;
     case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
       return X86::R15;
     }
   }
 }
 
 /// Return the ZMM register with the same index as the XMM, YMM or ZMM
 /// register \p Reg.  Any other register is unreachable.
 unsigned get512BitSuperRegister(unsigned Reg) {
   const bool IsXMM = X86::XMM0 <= Reg && Reg <= X86::XMM31;
   if (IsXMM) {
     const unsigned Index = Reg - X86::XMM0;
     return X86::ZMM0 + Index;
   }
 
   const bool IsYMM = X86::YMM0 <= Reg && Reg <= X86::YMM31;
   if (IsYMM) {
     const unsigned Index = Reg - X86::YMM0;
     return X86::ZMM0 + Index;
   }
 
   // A ZMM register is already its own 512-bit super-register.
   const bool IsZMM = X86::ZMM0 <= Reg && Reg <= X86::ZMM31;
   if (IsZMM)
     return Reg;
 
   llvm_unreachable("Unexpected SIMD register");
 }
 
 }
diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
index 5e6aa7d3dbf4..1fc6b20eab09 100644
--- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -1,205 +1,210 @@
 //===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 //
 // This file defines the X86 specific subclass of TargetMachine.
 //
 //===----------------------------------------------------------------------===//
 
 #include "X86TargetMachine.h"
 #include "X86.h"
 #include "X86TargetObjectFile.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
 /// C-linkage initialization entry point for the X86 target.
 ///
 /// Constructs one RegisterTargetMachine<X86TargetMachine> object for
 /// TheX86_32Target and one for TheX86_64Target, so the same target machine
 /// class serves both.
 extern "C" void LLVMInitializeX86Target() {
   // Register the target.
   RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target);
   RegisterTargetMachine<X86TargetMachine> Y(TheX86_64Target);
 }
 
 /// Pick the TargetLoweringObjectFile implementation that matches \p TT.
 ///
 /// The checks are ordered from most to least specific, so the first match
 /// wins:
 ///   - Mach-O: the x86_64 flavour for Triple::x86_64, the generic Mach-O
 ///     lowering for every other architecture;
 ///   - Linux, before the generic ELF case;
 ///   - a known Windows MSVC environment, before the generic COFF case.
 ///
 /// \returns a newly created object owned by the returned unique_ptr.
 /// A triple that matches none of the cases reaches llvm_unreachable.
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
   if (TT.isOSBinFormatMachO()) {
     if (TT.getArch() == Triple::x86_64)
       return make_unique<X86_64MachoTargetObjectFile>();
     return make_unique<TargetLoweringObjectFileMachO>();
   }
 
   // Linux is tested ahead of the plain ELF check so that it gets the
   // X86-specific Linux lowering rather than the generic ELF one.
   if (TT.isOSLinux())
     return make_unique<X86LinuxTargetObjectFile>();
   if (TT.isOSBinFormatELF())
     return make_unique<TargetLoweringObjectFileELF>();
   // Likewise, MSVC environments are tested ahead of the plain COFF check.
   if (TT.isKnownWindowsMSVCEnvironment())
     return make_unique<X86WindowsTargetObjectFile>();
   if (TT.isOSBinFormatCOFF())
     return make_unique<TargetLoweringObjectFileCOFF>();
   llvm_unreachable("unknown subtarget type");
 }
 
 /// X86TargetMachine ctor - Create an X86 target.
 ///
 /// Forwards every argument to the LLVMTargetMachine base, selects the
 /// object-file lowering for the target triple through createTLOF, and builds
 /// the default Subtarget from \p TT, \p CPU and \p FS together with
 /// Options.StackAlignmentOverride. The body then adjusts two target options
 /// and calls initAsmInfo().
 X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU,
                                    StringRef FS, const TargetOptions &Options,
                                    Reloc::Model RM, CodeModel::Model CM,
                                    CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
       TLOF(createTLOF(Triple(getTargetTriple()))),
       Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) {
   // The 'Options' parameter is a const reference that hides the member of the
   // same name, which is why the writes below go through this->Options.
 
   // default to hard float ABI
   if (Options.FloatABIType == FloatABI::Default)
     this->Options.FloatABIType = FloatABI::Hard;
 
   // Windows stack unwinder gets confused when execution flow "falls through"
   // after a call to 'noreturn' function.
   // To prevent that, we emit a trap for 'unreachable' IR instructions.
   // (which on X86, happens to be the 'ud2' instruction)
   if (Subtarget.isTargetWin64())
     this->Options.TrapUnreachable = true;
 
   initAsmInfo();
 }
 
 // Out-of-line destructor; the body is intentionally empty.
 X86TargetMachine::~X86TargetMachine() {}
 
 /// Return the X86Subtarget that applies to function \p F, creating and
 /// caching it on first use.
 ///
 /// The subtarget is selected by three string attributes read from the
 /// function: "target-cpu", "target-features" and "use-soft-float". When an
 /// attribute is absent the machine-wide TargetCPU, TargetFS or
 /// Options.UseSoftFloat value is used instead.
 ///
 /// \returns a pointer to the entry stored in SubtargetMap; the map entry, not
 /// the caller, holds the object.
 const X86Subtarget *
 X86TargetMachine::getSubtargetImpl(const Function &F) const {
   AttributeSet FnAttrs = F.getAttributes();
   Attribute CPUAttr =
       FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
   Attribute FSAttr =
       FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
 
   // hasAttribute(Attribute::None) is the "attribute not present" test here:
   // a present attribute supplies the value, otherwise the default is used.
   std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
                         ? CPUAttr.getValueAsString().str()
                         : TargetCPU;
   std::string FS = !FSAttr.hasAttribute(Attribute::None)
                        ? FSAttr.getValueAsString().str()
                        : TargetFS;
 
   // FIXME: This is related to the code below to reset the target options,
   // we need to know whether or not the soft float flag is set on the
   // function before we can generate a subtarget. We also need to use
   // it as a key for the subtarget since that can be the only difference
   // between two functions.
   Attribute SFAttr =
       FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float");
   // Only the exact string "true" enables soft float from the attribute.
   bool SoftFloat = !SFAttr.hasAttribute(Attribute::None)
                        ? SFAttr.getValueAsString() == "true"
                        : Options.UseSoftFloat;
 
   // Cache key: CPU, feature string and soft-float marker concatenated.
   // operator[] inserts an empty entry on a miss, which the null test below
   // then fills in.
   // NOTE(review): the parts are joined without a separator, so two different
   // (CPU, FS) pairs could in principle yield the same key - confirm this
   // cannot happen with real CPU names and feature strings.
   auto &I = SubtargetMap[CPU + FS + (SoftFloat ? "use-soft-float=true"
                                                : "use-soft-float=false")];
   if (!I) {
     // This needs to be done before we create a new subtarget since any
     // creation will depend on the TM and the code generation flags on the
     // function that reside in TargetOptions.
     resetTargetOptions(F);
     I = llvm::make_unique<X86Subtarget>(TargetTriple, CPU, FS, *this,
                                         Options.StackAlignmentOverride);
   }
   return I.get();
 }
 
 //===----------------------------------------------------------------------===//
 // Command line options for x86
 //===----------------------------------------------------------------------===//
 // Hidden boolean command line option "x86-use-vzeroupper", initialised to
 // true. X86PassConfig::addPreEmitPass() in this file reads it to decide
 // whether to add the pass created by createX86IssueVZeroUpperPass().
 static cl::opt<bool>
 UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
   cl::desc("Minimize AVX to SSE transition penalty"),
   cl::init(true));
 
 //===----------------------------------------------------------------------===//
 // X86 Analysis Pass Setup
 //===----------------------------------------------------------------------===//
 
 /// Add the target transform info analysis passes for this target machine to
 /// \p PM: the basic (target-independent) one first, then the X86 one. Both
 /// are created with this target machine as their argument.
 void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
   // Add first the target-independent BasicTTI pass, then our X86 pass. This
   // allows the X86 pass to delegate to the target independent layer when
   // appropriate.
   PM.add(createBasicTargetTransformInfoPass(this));
   PM.add(createX86TargetTransformInfoPass(this));
 }
 
 
 //===----------------------------------------------------------------------===//
 // Pass Pipeline Configuration
 //===----------------------------------------------------------------------===//
 
 namespace {
 /// X86 Code Generator Pass Configuration Options.
 ///
 /// TargetPassConfig subclass local to this file. Its overridden hooks, which
 /// are defined further down in this file, add the X86-specific passes.
 class X86PassConfig : public TargetPassConfig {
 public:
   /// Forward the target machine and pass manager to the TargetPassConfig
   /// base class.
   X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {}
 
   /// Return the target machine obtained from getTM, typed as
   /// X86TargetMachine.
   X86TargetMachine &getX86TargetMachine() const {
     return getTM<X86TargetMachine>();
   }
 
   /// Return the subtarget given by the target machine's argument-less
   /// getSubtargetImpl(), i.e. not a per-function one.
   const X86Subtarget &getX86Subtarget() const {
     return *getX86TargetMachine().getSubtargetImpl();
   }
 
   // Overridden pass-pipeline hooks.
   void addIRPasses() override;
   bool addInstSelector() override;
   bool addILPOpts() override;
+  void addPreRegAlloc() override;
   void addPostRegAlloc() override;
   void addPreEmitPass() override;
 };
 } // namespace
 
 /// Create the X86PassConfig for this target machine and \p PM.
 ///
 /// The object is allocated with new and handed back as a raw pointer.
 /// NOTE(review): presumably the caller takes ownership - confirm.
 TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
   return new X86PassConfig(this, PM);
 }
 
 /// Add the atomic expansion pass for this target machine, then everything
 /// the base class TargetPassConfig::addIRPasses() adds.
 void X86PassConfig::addIRPasses() {
   // Added before the call to the base implementation, so it is scheduled
   // ahead of the passes that call adds.
   addPass(createAtomicExpandPass(&getX86TargetMachine()));
 
   TargetPassConfig::addIRPasses();
 }
 
 /// Add the X86 instruction selector and the passes that directly follow it.
 ///
 /// Order of addition: the X86 DAG instruction selector; the local-dynamic TLS
 /// cleanup pass (ELF targets only, and only when the optimization level is
 /// not CodeGenOpt::None); the global base register pass.
 ///
 /// \returns false unconditionally.
 /// NOTE(review): presumably false means "no error" for this hook - confirm
 /// against TargetPassConfig.
 bool X86PassConfig::addInstSelector() {
   // Install an instruction selector.
   addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel()));
 
   // For ELF, cleanup any local-dynamic TLS accesses.
   if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
     addPass(createCleanupLocalDynamicTLSPass());
 
   addPass(createX86GlobalBaseRegPass());
 
   return false;
 }
 
 /// Add the pass identified by EarlyIfConverterID.
 ///
 /// \returns true unconditionally.
 /// NOTE(review): presumably true tells the caller that passes were added -
 /// confirm against TargetPassConfig.
 bool X86PassConfig::addILPOpts() {
   addPass(&EarlyIfConverterID);
   return true;
 }
 
+void X86PassConfig::addPreRegAlloc() { // Override of the addPreRegAlloc hook.
+  addPass(createX86CallFrameOptimization()); // Added with no opt-level check.
+}
+
 /// Override of the addPostRegAlloc hook: unconditionally adds the X86
 /// floating point stackifier pass.
 void X86PassConfig::addPostRegAlloc() {
   addPass(createX86FloatingPointStackifierPass());
 }
 
 /// Override of the addPreEmitPass hook. Adds up to four passes, in this
 /// order:
 ///   1. the execution dependency fix pass on X86::VR128RegClass, when the
 ///      optimization level is not CodeGenOpt::None and the subtarget has SSE2;
 ///   2. the vzeroupper insertion pass, when the UseVZeroUpper option is set;
 ///   3. the short-function padding pass and the LEA fixup pass, when the
 ///      optimization level is not CodeGenOpt::None.
 void X86PassConfig::addPreEmitPass() {
   if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2())
     addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));
 
   // Controlled only by the command line option, not by the opt level.
   if (UseVZeroUpper)
     addPass(createX86IssueVZeroUpperPass());
 
   if (getOptLevel() != CodeGenOpt::None) {
     addPass(createX86PadShortFunctions());
     addPass(createX86FixupLEAs());
   }
 }
diff --git a/lib/clang/libllvmx86codegen/Makefile b/lib/clang/libllvmx86codegen/Makefile
index 58278521a7a2..b030b1fc2e98 100644
--- a/lib/clang/libllvmx86codegen/Makefile
+++ b/lib/clang/libllvmx86codegen/Makefile
@@ -1,35 +1,36 @@
 # $FreeBSD$
 
 .include <bsd.own.mk>
 
 # Name of the library built by this Makefile.
 LIB=	llvmx86codegen
 
 # Sources of the X86 code generator library, named relative to SRCDIR.
 # NOTE(review): SRCDIR is presumably resolved against the LLVM tree by
 # clang.lib.mk -- confirm.
 # X86TargetMachine.cpp calls createX86CallFrameOptimization(), which is
 # presumably defined in X86CallFrameOptimization.cpp, so that file must be
 # part of this list.
 SRCDIR=	lib/Target/X86
 SRCS=	X86AsmPrinter.cpp \
+	X86CallFrameOptimization.cpp \
 	X86FastISel.cpp \
 	X86FixupLEAs.cpp \
 	X86FloatingPoint.cpp \
 	X86FrameLowering.cpp \
 	X86ISelDAGToDAG.cpp \
 	X86ISelLowering.cpp \
 	X86InstrInfo.cpp \
 	X86MCInstLower.cpp \
 	X86MachineFunctionInfo.cpp \
 	X86PadShortFunction.cpp \
 	X86RegisterInfo.cpp \
 	X86SelectionDAGInfo.cpp \
 	X86Subtarget.cpp \
 	X86TargetMachine.cpp \
 	X86TargetObjectFile.cpp \
 	X86TargetTransformInfo.cpp \
 	X86VZeroUpper.cpp
 
 # Generated headers needed by the sources above.
 # NOTE(review): presumably TableGen outputs handled by clang.lib.mk -- confirm.
 TGHDRS=	Intrinsics \
 	X86GenCallingConv \
 	X86GenDAGISel \
 	X86GenFastISel \
 	X86GenInstrInfo \
 	X86GenRegisterInfo \
 	X86GenSubtargetInfo
 
 .include "../clang.lib.mk"