diff --git a/contrib/llvm/include/llvm/Target/TargetFrameLowering.h b/contrib/llvm/include/llvm/Target/TargetFrameLowering.h index 277bd98d371c..f17640f71e93 100644 --- a/contrib/llvm/include/llvm/Target/TargetFrameLowering.h +++ b/contrib/llvm/include/llvm/Target/TargetFrameLowering.h @@ -1,251 +1,256 @@ //===-- llvm/Target/TargetFrameLowering.h ---------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // Interface to describe the layout of a stack frame on the target machine. // //===----------------------------------------------------------------------===// #ifndef LLVM_TARGET_TARGETFRAMELOWERING_H #define LLVM_TARGET_TARGETFRAMELOWERING_H #include "llvm/CodeGen/MachineBasicBlock.h" #include #include namespace llvm { class CalleeSavedInfo; class MachineFunction; class RegScavenger; /// Information about stack frame layout on the target. It holds the direction /// of stack growth, the known stack alignment on entry to each function, and /// the offset to the locals area. /// /// The offset to the local area is the offset from the stack pointer on /// function entry to the first location where function data (local variables, /// spill locations) can be stored. class TargetFrameLowering { public: enum StackDirection { StackGrowsUp, // Adding to the stack increases the stack address StackGrowsDown // Adding to the stack decreases the stack address }; // Maps a callee saved register to a stack slot with a fixed offset. struct SpillSlot { unsigned Reg; int Offset; // Offset relative to stack pointer on function entry. 
}; private: StackDirection StackDir; unsigned StackAlignment; unsigned TransientStackAlignment; int LocalAreaOffset; bool StackRealignable; public: TargetFrameLowering(StackDirection D, unsigned StackAl, int LAO, unsigned TransAl = 1, bool StackReal = true) : StackDir(D), StackAlignment(StackAl), TransientStackAlignment(TransAl), LocalAreaOffset(LAO), StackRealignable(StackReal) {} virtual ~TargetFrameLowering(); // These methods return information that describes the abstract stack layout // of the target machine. /// getStackGrowthDirection - Return the direction the stack grows /// StackDirection getStackGrowthDirection() const { return StackDir; } /// getStackAlignment - This method returns the number of bytes to which the /// stack pointer must be aligned on entry to a function. Typically, this /// is the largest alignment for any data object in the target. /// unsigned getStackAlignment() const { return StackAlignment; } /// getTransientStackAlignment - This method returns the number of bytes to /// which the stack pointer must be aligned at all times, even between /// calls. /// unsigned getTransientStackAlignment() const { return TransientStackAlignment; } /// isStackRealignable - This method returns whether the stack can be /// realigned. bool isStackRealignable() const { return StackRealignable; } /// getOffsetOfLocalArea - This method returns the offset of the local area /// from the stack pointer on entrance to a function. /// int getOffsetOfLocalArea() const { return LocalAreaOffset; } /// isFPCloseToIncomingSP - Return true if the frame pointer is close to /// the incoming stack pointer, false if it is close to the post-prologue /// stack pointer. virtual bool isFPCloseToIncomingSP() const { return true; } /// assignCalleeSavedSpillSlots - Allows target to override spill slot /// assignment logic. If implemented, assignCalleeSavedSpillSlots() should /// assign frame slots to all CSI entries and return true. 
If this method /// returns false, spill slots will be assigned using generic implementation. /// assignCalleeSavedSpillSlots() may add, delete or rearrange elements of /// CSI. virtual bool assignCalleeSavedSpillSlots(MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector &CSI) const { return false; } /// getCalleeSavedSpillSlots - This method returns a pointer to an array of /// pairs, that contains an entry for each callee saved register that must be /// spilled to a particular stack location if it is spilled. /// /// Each entry in this array contains a pair, indicating the /// fixed offset from the incoming stack pointer that each register should be /// spilled at. If a register is not listed here, the code generator is /// allowed to spill it anywhere it chooses. /// virtual const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const { NumEntries = 0; return nullptr; } /// targetHandlesStackFrameRounding - Returns true if the target is /// responsible for rounding up the stack frame (probably at emitPrologue /// time). virtual bool targetHandlesStackFrameRounding() const { return false; } /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. virtual void emitPrologue(MachineFunction &MF) const = 0; virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const = 0; /// Adjust the prologue to have the function use segmented stacks. This works /// by adding a check even before the "normal" function prologue. virtual void adjustForSegmentedStacks(MachineFunction &MF) const { } /// Adjust the prologue to add Erlang Run-Time System (ERTS) specific code in /// the assembly prologue to explicitly handle the stack. virtual void adjustForHiPEPrologue(MachineFunction &MF) const { } /// Adjust the prologue to add an allocation at a fixed offset from the frame /// pointer. 
virtual void adjustForFrameAllocatePrologue(MachineFunction &MF) const { } /// spillCalleeSavedRegisters - Issues instruction(s) to spill all callee /// saved registers and returns true if it isn't possible / profitable to do /// so by issuing a series of store instructions via /// storeRegToStackSlot(). Returns false otherwise. virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const { return false; } /// restoreCalleeSavedRegisters - Issues instruction(s) to restore all callee /// saved registers and returns true if it isn't possible / profitable to do /// so by issuing a series of load instructions via loadRegFromStackSlot(). /// Returns false otherwise. virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const { return false; } /// hasFP - Return true if the specified function should have a dedicated /// frame pointer register. For most targets this is true only if the function /// has variable sized allocas or if frame pointer elimination is disabled. virtual bool hasFP(const MachineFunction &MF) const = 0; /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is /// not required, we reserve argument space for call sites in the function /// immediately on entry to the current function. This eliminates the need for /// add/sub sp brackets around call sites. Returns true if the call frame is /// included as part of the stack frame. virtual bool hasReservedCallFrame(const MachineFunction &MF) const { return !hasFP(MF); } /// canSimplifyCallFramePseudos - When possible, it's best to simplify the /// call frame pseudo ops before doing frame index elimination. This is /// possible only when frame index references between the pseudos won't /// need adjusting for the call frame adjustments. 
Normally, that's true /// if the function has a reserved call frame or a frame pointer. Some /// targets (Thumb2, for example) may have more complicated criteria, /// however, and can override this behavior. virtual bool canSimplifyCallFramePseudos(const MachineFunction &MF) const { return hasReservedCallFrame(MF) || hasFP(MF); } + /// needsFrameIndexResolution - Do we need to perform FI resolution for + /// this function. Normally, this is required only when the function + /// has any stack objects. However, targets may want to override this. + virtual bool needsFrameIndexResolution(const MachineFunction &MF) const; + /// getFrameIndexOffset - Returns the displacement from the frame register to /// the stack frame of the specified index. virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const; /// getFrameIndexReference - This method should return the base register /// and offset used to reference a frame index location. The offset is /// returned directly, and the base register is returned via FrameReg. virtual int getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const; /// Same as above, except that the 'base register' will always be RSP, not /// RBP on x86. This is used exclusively for lowering STATEPOINT nodes. /// TODO: This should really be a parameterizable choice. virtual int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, unsigned &FrameReg) const { // The default is deliberately unreachable (it does NOT forward to getFrameIndexReference); a target that needs this must override it. llvm_unreachable("unimplemented for non-x86"); return 0; } /// processFunctionBeforeCalleeSavedScan - This method is called immediately /// before PrologEpilogInserter scans the physical registers used to determine /// what callee saved registers should be spilled. This method is optional. 
virtual void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS = nullptr) const { } /// processFunctionBeforeFrameFinalized - This method is called immediately /// before the specified function's frame layout (MF.getFrameInfo()) is /// finalized. Once the frame is finalized, MO_FrameIndex operands are /// replaced with direct constants. This method is optional. /// virtual void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS = nullptr) const { } /// eliminateCallFramePseudoInstr - This method is called during prolog/epilog /// code insertion to eliminate call frame setup and destroy pseudo /// instructions (but only if the Target is using them). It is responsible /// for eliminating these instructions, replacing them with concrete /// instructions. This method need only be implemented if using call frame /// setup/destroy pseudo instructions. /// virtual void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const { llvm_unreachable("Call Frame Pseudo Instructions do not exist on this " "target!"); } }; } // End llvm namespace #endif diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp index 385e5a35afba..61407faaf327 100644 --- a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -1,957 +1,961 @@ //===-- PrologEpilogInserter.cpp - Insert Prolog/Epilog code in function --===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This pass is responsible for finalizing the functions frame layout, saving // callee saved registers, and for emitting prolog & epilog code for the // function. // // This pass must be run after register allocation. 
After this pass is // executed, it is illegal to construct MO_FrameIndex operands. // //===----------------------------------------------------------------------===// #include "PrologEpilogInserter.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/InlineAsm.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include using namespace llvm; #define DEBUG_TYPE "pei" char PEI::ID = 0; char &llvm::PrologEpilogCodeInserterID = PEI::ID; static cl::opt WarnStackSize("warn-stack-size", cl::Hidden, cl::init((unsigned)-1), cl::desc("Warn for stack size bigger than the given" " number")); INITIALIZE_PASS_BEGIN(PEI, "prologepilog", "Prologue/Epilogue Insertion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(StackProtector) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(PEI, "prologepilog", "Prologue/Epilogue Insertion & Frame Finalization", false, false) STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged"); STATISTIC(NumBytesStackSpace, "Number of bytes used for stack in all functions"); void 
PEI::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addPreserved(); AU.addPreserved(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } bool PEI::isReturnBlock(MachineBasicBlock* MBB) { return (MBB && !MBB->empty() && MBB->back().isReturn()); } /// Compute the set of return blocks void PEI::calculateSets(MachineFunction &Fn) { // Sets used to compute spill, restore placement sets. const std::vector &CSI = Fn.getFrameInfo()->getCalleeSavedInfo(); // If no CSRs used, we are done. if (CSI.empty()) return; // Save refs to entry and return blocks. EntryBlock = Fn.begin(); for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end(); MBB != E; ++MBB) if (isReturnBlock(MBB)) ReturnBlocks.push_back(MBB); return; } /// StackObjSet - A set of stack object indexes typedef SmallSetVector StackObjSet; /// runOnMachineFunction - Insert prolog/epilog code and replace abstract /// frame indexes with appropriate references. /// bool PEI::runOnMachineFunction(MachineFunction &Fn) { const Function* F = Fn.getFunction(); const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); assert(!Fn.getRegInfo().getNumVirtRegs() && "Regalloc must assign all vregs"); RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : nullptr; FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn); // Calculate the MaxCallFrameSize and AdjustsStack variables for the // function's frame information. Also eliminates call frame pseudo // instructions. calculateCallsInformation(Fn); // Allow the target machine to make some adjustments to the function // e.g. UsedPhysRegs before calculateCalleeSavedRegisters. TFI->processFunctionBeforeCalleeSavedScan(Fn, RS); // Scan the function for modified callee saved registers and insert spill code // for any callee saved registers that are modified. 
calculateCalleeSavedRegisters(Fn); // Determine placement of CSR spill/restore code: // place all spills in the entry block, all restores in return blocks. calculateSets(Fn); // Add the code to save and restore the callee saved registers if (!F->hasFnAttribute(Attribute::Naked)) insertCSRSpillsAndRestores(Fn); // Allow the target machine to make final modifications to the function // before the frame layout is finalized. TFI->processFunctionBeforeFrameFinalized(Fn, RS); // Calculate actual frame offsets for all abstract stack objects... calculateFrameObjectOffsets(Fn); // Add prolog and epilog code to the function. This function is required // to align the stack frame as necessary for any stack variables or // called functions. Because of this, calculateCalleeSavedRegisters() // must be called before this function in order to set the AdjustsStack // and MaxCallFrameSize variables. if (!F->hasFnAttribute(Attribute::Naked)) insertPrologEpilogCode(Fn); // Replace all MO_FrameIndex operands with physical register references // and actual offsets. // replaceFrameIndices(Fn); // If register scavenging is needed, as we've enabled doing it as a // post-pass, scavenge the virtual registers that frame index elimination // inserted. if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) scavengeFrameVirtualRegs(Fn); // Clear any vregs created by virtual scavenging. Fn.getRegInfo().clearVirtRegs(); // Warn on stack size when we exceeds the given limit. MachineFrameInfo *MFI = Fn.getFrameInfo(); uint64_t StackSize = MFI->getStackSize(); if (WarnStackSize.getNumOccurrences() > 0 && WarnStackSize < StackSize) { DiagnosticInfoStackSize DiagStackSize(*F, StackSize); F->getContext().diagnose(DiagStackSize); } delete RS; ReturnBlocks.clear(); return true; } /// calculateCallsInformation - Calculate the MaxCallFrameSize and AdjustsStack /// variables for the function's frame information and eliminate call frame /// pseudo instructions. 
void PEI::calculateCallsInformation(MachineFunction &Fn) { const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); MachineFrameInfo *MFI = Fn.getFrameInfo(); unsigned MaxCallFrameSize = 0; bool AdjustsStack = MFI->adjustsStack(); // Get the function call frame set-up and tear-down instruction opcode int FrameSetupOpcode = TII.getCallFrameSetupOpcode(); int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); // Early exit for targets which have no call frame setup/destroy pseudo // instructions. if (FrameSetupOpcode == -1 && FrameDestroyOpcode == -1) return; std::vector FrameSDOps; for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) if (I->getOpcode() == FrameSetupOpcode || I->getOpcode() == FrameDestroyOpcode) { assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo" " instructions should have a single immediate argument!"); unsigned Size = I->getOperand(0).getImm(); if (Size > MaxCallFrameSize) MaxCallFrameSize = Size; AdjustsStack = true; FrameSDOps.push_back(I); } else if (I->isInlineAsm()) { // Some inline asm's need a stack frame, as indicated by operand 1. unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm(); if (ExtraInfo & InlineAsm::Extra_IsAlignStack) AdjustsStack = true; } MFI->setAdjustsStack(AdjustsStack); MFI->setMaxCallFrameSize(MaxCallFrameSize); for (std::vector::iterator i = FrameSDOps.begin(), e = FrameSDOps.end(); i != e; ++i) { MachineBasicBlock::iterator I = *i; // If call frames are not being included as part of the stack frame, and // the target doesn't indicate otherwise, remove the call frame pseudos // here. The sub/add sp instruction pairs are still inserted, but we don't // need to track the SP adjustment for frame index elimination. 
if (TFI->canSimplifyCallFramePseudos(Fn)) TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I); } } /// calculateCalleeSavedRegisters - Scan the function for modified callee saved /// registers. void PEI::calculateCalleeSavedRegisters(MachineFunction &F) { const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering(); MachineFrameInfo *MFI = F.getFrameInfo(); // Get the callee saved register list... const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F); // These are used to keep track the callee-save area. Initialize them. MinCSFrameIndex = INT_MAX; MaxCSFrameIndex = 0; // Early exit for targets which have no callee saved registers. if (!CSRegs || CSRegs[0] == 0) return; // In Naked functions we aren't going to save any registers. if (F.getFunction()->hasFnAttribute(Attribute::Naked)) return; std::vector CSI; for (unsigned i = 0; CSRegs[i]; ++i) { unsigned Reg = CSRegs[i]; // Functions which call __builtin_unwind_init get all their registers saved. if (F.getRegInfo().isPhysRegUsed(Reg) || F.getMMI().callsUnwindInit()) { // If the reg is modified, save it! CSI.push_back(CalleeSavedInfo(Reg)); } } if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) { // If target doesn't implement this, use generic code. if (CSI.empty()) return; // Early exit if no callee saved registers are modified! unsigned NumFixedSpillSlots; const TargetFrameLowering::SpillSlot *FixedSpillSlots = TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots); // Now that we know which registers need to be saved and restored, allocate // stack slots for them. 
for (std::vector::iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { unsigned Reg = I->getReg(); const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); int FrameIdx; if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) { I->setFrameIdx(FrameIdx); continue; } // Check to see if this physreg must be spilled to a particular stack slot // on this target. const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots; while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots && FixedSlot->Reg != Reg) ++FixedSlot; if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) { // Nope, just spill it anywhere convenient. unsigned Align = RC->getAlignment(); unsigned StackAlign = TFI->getStackAlignment(); // We may not be able to satisfy the desired alignment specification of // the TargetRegisterClass if the stack alignment is smaller. Use the // min. Align = std::min(Align, StackAlign); FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true); if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } else { // Spill it to the stack where we must. FrameIdx = MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset); } I->setFrameIdx(FrameIdx); } } MFI->setCalleeSavedInfo(CSI); } /// insertCSRSpillsAndRestores - Insert spill and restore code for /// callee saved registers used in the function. /// void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) { // Get callee saved register information. MachineFrameInfo *MFI = Fn.getFrameInfo(); const std::vector &CSI = MFI->getCalleeSavedInfo(); MFI->setCalleeSavedInfoValid(true); // Early exit if no callee saved registers are modified! 
if (CSI.empty()) return; const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo(); MachineBasicBlock::iterator I; // Spill using target interface. I = EntryBlock->begin(); if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) { for (unsigned i = 0, e = CSI.size(); i != e; ++i) { // Add the callee-saved register as live-in. // It's killed at the spill. EntryBlock->addLiveIn(CSI[i].getReg()); // Insert the spill to the stack frame. unsigned Reg = CSI[i].getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(*EntryBlock, I, Reg, true, CSI[i].getFrameIdx(), RC, TRI); } } // Restore using target interface. for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri) { MachineBasicBlock *MBB = ReturnBlocks[ri]; I = MBB->end(); --I; // Skip over all terminator instructions, which are part of the return // sequence. MachineBasicBlock::iterator I2 = I; while (I2 != MBB->begin() && (--I2)->isTerminator()) I = I2; bool AtStart = I == MBB->begin(); MachineBasicBlock::iterator BeforeI = I; if (!AtStart) --BeforeI; // Restore all registers immediately before the return and any // terminators that precede it. if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) { for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI); assert(I != MBB->begin() && "loadRegFromStackSlot didn't insert any code!"); // Insert in reverse order. loadRegFromStackSlot can insert // multiple instructions. if (AtStart) I = MBB->begin(); else { I = BeforeI; ++I; } } } } } /// AdjustStackOffset - Helper function used to adjust the stack frame offset. 
static inline void AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, bool StackGrowsDown, int64_t &Offset, unsigned &MaxAlign) { // If the stack grows down, add the object size to find the lowest address. if (StackGrowsDown) Offset += MFI->getObjectSize(FrameIdx); unsigned Align = MFI->getObjectAlignment(FrameIdx); // If the alignment of this object is greater than that of the stack, then // increase the stack alignment to match. MaxAlign = std::max(MaxAlign, Align); // Adjust to alignment boundary. Offset = (Offset + Align - 1) / Align * Align; if (StackGrowsDown) { DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n"); MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset } else { DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n"); MFI->setObjectOffset(FrameIdx, Offset); Offset += MFI->getObjectSize(FrameIdx); } } /// AssignProtectedObjSet - Helper function to assign large stack objects (i.e., /// those required to be close to the Stack Protector) to stack offsets. static void AssignProtectedObjSet(const StackObjSet &UnassignedObjs, SmallSet &ProtectedObjs, MachineFrameInfo *MFI, bool StackGrowsDown, int64_t &Offset, unsigned &MaxAlign) { for (StackObjSet::const_iterator I = UnassignedObjs.begin(), E = UnassignedObjs.end(); I != E; ++I) { int i = *I; AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign); ProtectedObjs.insert(i); } } /// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the /// abstract stack objects. /// void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) { const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); StackProtector *SP = &getAnalysis(); bool StackGrowsDown = TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown; // Loop over all of the stack objects, assigning sequential addresses... MachineFrameInfo *MFI = Fn.getFrameInfo(); // Start at the beginning of the local area. 
// The Offset is the distance from the stack top in the direction // of stack growth -- so it's always nonnegative. int LocalAreaOffset = TFI.getOffsetOfLocalArea(); if (StackGrowsDown) LocalAreaOffset = -LocalAreaOffset; assert(LocalAreaOffset >= 0 && "Local area offset should be in direction of stack growth"); int64_t Offset = LocalAreaOffset; // If there are fixed sized objects that are preallocated in the local area, // non-fixed objects can't be allocated right at the start of local area. // We currently don't support filling in holes in between fixed sized // objects, so we adjust 'Offset' to point to the end of last fixed sized // preallocated object. for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) { int64_t FixedOff; if (StackGrowsDown) { // The maximum distance from the stack pointer is at lower address of // the object -- which is given by offset. For down growing stack // the offset is negative, so we negate the offset to get the distance. FixedOff = -MFI->getObjectOffset(i); } else { // The maximum distance from the start pointer is at the upper // address of the object. FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i); } if (FixedOff > Offset) Offset = FixedOff; } // First assign frame offsets to stack objects that are used to spill // callee saved registers. if (StackGrowsDown) { for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) { // If the stack grows down, we need to add the size to find the lowest // address of the object. 
Offset += MFI->getObjectSize(i); unsigned Align = MFI->getObjectAlignment(i); // Adjust to alignment boundary Offset = (Offset+Align-1)/Align*Align; MFI->setObjectOffset(i, -Offset); // Set the computed offset } } else { int MaxCSFI = MaxCSFrameIndex, MinCSFI = MinCSFrameIndex; for (int i = MaxCSFI; i >= MinCSFI ; --i) { unsigned Align = MFI->getObjectAlignment(i); // Adjust to alignment boundary Offset = (Offset+Align-1)/Align*Align; MFI->setObjectOffset(i, Offset); Offset += MFI->getObjectSize(i); } } unsigned MaxAlign = MFI->getMaxAlignment(); // Make sure the special register scavenging spill slot is closest to the // incoming stack pointer if a frame pointer is required and is closer // to the incoming rather than the final stack pointer. const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo(); bool EarlyScavengingSlots = (TFI.hasFP(Fn) && TFI.isFPCloseToIncomingSP() && RegInfo->useFPForScavengingIndex(Fn) && !RegInfo->needsStackRealignment(Fn)); if (RS && EarlyScavengingSlots) { SmallVector SFIs; RS->getScavengingFrameIndices(SFIs); for (SmallVectorImpl::iterator I = SFIs.begin(), IE = SFIs.end(); I != IE; ++I) AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign); } // FIXME: Once this is working, then enable flag will change to a target // check for whether the frame is large enough to want to use virtual // frame index registers. Functions which don't want/need this optimization // will continue to use the existing code path. if (MFI->getUseLocalStackAllocationBlock()) { unsigned Align = MFI->getLocalFrameMaxAlign(); // Adjust to alignment boundary. Offset = (Offset + Align - 1) / Align * Align; DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n"); // Resolve offsets for objects in the local block. for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) { std::pair Entry = MFI->getLocalFrameObjectMap(i); int64_t FIOffset = (StackGrowsDown ? 
-Offset : Offset) + Entry.second; DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << FIOffset << "]\n"); MFI->setObjectOffset(Entry.first, FIOffset); } // Allocate the local block Offset += MFI->getLocalFrameSize(); MaxAlign = std::max(Align, MaxAlign); } // Make sure that the stack protector comes before the local variables on the // stack. SmallSet ProtectedObjs; if (MFI->getStackProtectorIndex() >= 0) { StackObjSet LargeArrayObjs; StackObjSet SmallArrayObjs; StackObjSet AddrOfObjs; AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown, Offset, MaxAlign); // Assign large stack objects first. for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { if (MFI->isObjectPreAllocated(i) && MFI->getUseLocalStackAllocationBlock()) continue; if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex) continue; if (RS && RS->isScavengingFrameIndex((int)i)) continue; if (MFI->isDeadObjectIndex(i)) continue; if (MFI->getStackProtectorIndex() == (int)i) continue; switch (SP->getSSPLayout(MFI->getObjectAllocation(i))) { case StackProtector::SSPLK_None: continue; case StackProtector::SSPLK_SmallArray: SmallArrayObjs.insert(i); continue; case StackProtector::SSPLK_AddrOf: AddrOfObjs.insert(i); continue; case StackProtector::SSPLK_LargeArray: LargeArrayObjs.insert(i); continue; } llvm_unreachable("Unexpected SSPLayoutKind."); } AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown, Offset, MaxAlign); AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown, Offset, MaxAlign); AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown, Offset, MaxAlign); } // Then assign frame offsets to stack objects that are not used to spill // callee saved registers. 
for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { if (MFI->isObjectPreAllocated(i) && MFI->getUseLocalStackAllocationBlock()) continue; if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex) continue; if (RS && RS->isScavengingFrameIndex((int)i)) continue; if (MFI->isDeadObjectIndex(i)) continue; if (MFI->getStackProtectorIndex() == (int)i) continue; if (ProtectedObjs.count(i)) continue; AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign); } // Make sure the special register scavenging spill slot is closest to the // stack pointer. if (RS && !EarlyScavengingSlots) { SmallVector SFIs; RS->getScavengingFrameIndices(SFIs); for (SmallVectorImpl::iterator I = SFIs.begin(), IE = SFIs.end(); I != IE; ++I) AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign); } if (!TFI.targetHandlesStackFrameRounding()) { // If we have reserved argument space for call sites in the function // immediately on entry to the current function, count it as part of the // overall stack size. if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn)) Offset += MFI->getMaxCallFrameSize(); // Round up the size to a multiple of the alignment. If the function has // any calls or alloca's, align to the target's StackAlignment value to // ensure that the callee's frame or the alloca data is suitably aligned; // otherwise, for leaf functions, align to the TransientStackAlignment // value. unsigned StackAlign; if (MFI->adjustsStack() || MFI->hasVarSizedObjects() || (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0)) StackAlign = TFI.getStackAlignment(); else StackAlign = TFI.getTransientStackAlignment(); // If the frame pointer is eliminated, all frame offsets will be relative to // SP not FP. Align to MaxAlign so this works. StackAlign = std::max(StackAlign, MaxAlign); unsigned AlignMask = StackAlign - 1; Offset = (Offset + AlignMask) & ~uint64_t(AlignMask); } // Update frame info to pretend that this is part of the stack... 
int64_t StackSize = Offset - LocalAreaOffset; MFI->setStackSize(StackSize); NumBytesStackSpace += StackSize; } /// insertPrologEpilogCode - Scan the function for modified callee saved /// registers, insert spill code for these callee saved registers, then add /// prolog and epilog code to the function. /// void PEI::insertPrologEpilogCode(MachineFunction &Fn) { const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); // Add prologue to the function... TFI.emitPrologue(Fn); // Add epilogue to restore the callee-save registers in each exiting block for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) { // If last instruction is a return instruction, add an epilogue if (!I->empty() && I->back().isReturn()) TFI.emitEpilogue(Fn, *I); } // Emit additional code that is required to support segmented stacks, if // we've been asked for it. This, when linked with a runtime with support // for segmented stacks (libgcc is one), will result in allocating stack // space in small chunks instead of one large contiguous block. if (Fn.shouldSplitStack()) TFI.adjustForSegmentedStacks(Fn); // Emit additional code that is required to explicitly handle the stack in // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The // approach is rather similar to that of Segmented Stacks, but it uses a // different conditional check and another BIF for allocating more stack // space. if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE) TFI.adjustForHiPEPrologue(Fn); } /// replaceFrameIndices - Replace all MO_FrameIndex operands with physical /// register references and actual offsets. /// void PEI::replaceFrameIndices(MachineFunction &Fn) { - if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do? + const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering(); + if (!TFI.needsFrameIndexResolution(Fn)) return; // Store SPAdj at exit of a basic block. 
SmallVector SPState; SPState.resize(Fn.getNumBlockIDs()); SmallPtrSet Reachable; // Iterate over the reachable blocks in DFS order. for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable); DFI != DFE; ++DFI) { int SPAdj = 0; // Check the exit state of the DFS stack predecessor. if (DFI.getPathLength() >= 2) { MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2); assert(Reachable.count(StackPred) && "DFS stack predecessor is already visited.\n"); SPAdj = SPState[StackPred->getNumber()]; } MachineBasicBlock *BB = *DFI; replaceFrameIndices(BB, Fn, SPAdj); SPState[BB->getNumber()] = SPAdj; } // Handle the unreachable blocks. for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { if (Reachable.count(BB)) // Already handled in DFS traversal. continue; int SPAdj = 0; replaceFrameIndices(BB, Fn, SPAdj); } } void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn, int &SPAdj) { assert(Fn.getSubtarget().getRegisterInfo() && "getRegisterInfo() must be implemented!"); const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo(); const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo(); const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering(); int FrameSetupOpcode = TII.getCallFrameSetupOpcode(); int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode(); if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB); bool InsideCallSequence = false; for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { if (I->getOpcode() == FrameSetupOpcode || I->getOpcode() == FrameDestroyOpcode) { InsideCallSequence = (I->getOpcode() == FrameSetupOpcode); SPAdj += TII.getSPAdjust(I); MachineBasicBlock::iterator PrevI = BB->end(); if (I != BB->begin()) PrevI = std::prev(I); TFI->eliminateCallFramePseudoInstr(Fn, *BB, I); // Visit the instructions created by eliminateCallFramePseudoInstr(). 
if (PrevI == BB->end()) I = BB->begin(); // The replaced instr was the first in the block. else I = std::next(PrevI); continue; } - // If we are looking at a call sequence, we need to keep track of - // the SP adjustment made by each instruction in the sequence. - // This includes both the frame setup/destroy pseudos (handled above), - // as well as other instructions that have side effects w.r.t the SP. - if (InsideCallSequence) - SPAdj += TII.getSPAdjust(I); - MachineInstr *MI = I; bool DoIncr = true; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { if (!MI->getOperand(i).isFI()) continue; // Frame indicies in debug values are encoded in a target independent // way with simply the frame index and offset rather than any // target-specific addressing mode. if (MI->isDebugValue()) { assert(i == 0 && "Frame indicies can only appear as the first " "operand of a DBG_VALUE machine instruction"); unsigned Reg; MachineOperand &Offset = MI->getOperand(1); Offset.setImm(Offset.getImm() + TFI->getFrameIndexReference( Fn, MI->getOperand(0).getIndex(), Reg)); MI->getOperand(0).ChangeToRegister(Reg, false /*isDef*/); continue; } // TODO: This code should be commoned with the code for // PATCHPOINT. There's no good reason for the difference in // implementation other than historical accident. The only // remaining difference is the unconditional use of the stack // pointer as the base register. if (MI->getOpcode() == TargetOpcode::STATEPOINT) { assert((!MI->isDebugValue() || i == 0) && "Frame indicies can only appear as the first operand of a " "DBG_VALUE machine instruction"); unsigned Reg; MachineOperand &Offset = MI->getOperand(i + 1); const unsigned refOffset = TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(), Reg); Offset.setImm(Offset.getImm() + refOffset); MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/); continue; } // Frame allocations are target independent. Simply swap the index with // the offset. 
if (MI->getOpcode() == TargetOpcode::FRAME_ALLOC) { assert(TFI->hasFP(Fn) && "frame alloc requires FP"); MachineOperand &FI = MI->getOperand(i); unsigned Reg; int FrameOffset = TFI->getFrameIndexReference(Fn, FI.getIndex(), Reg); FI.ChangeToImmediate(FrameOffset); continue; } // Some instructions (e.g. inline asm instructions) can have // multiple frame indices and/or cause eliminateFrameIndex // to insert more than one instruction. We need the register // scavenger to go through all of these instructions so that // it can update its register information. We keep the // iterator at the point before insertion so that we can // revisit them in full. bool AtBeginning = (I == BB->begin()); if (!AtBeginning) --I; // If this instruction has a FrameIndex operand, we need to // use that target machine register info object to eliminate // it. TRI.eliminateFrameIndex(MI, SPAdj, i, FrameIndexVirtualScavenging ? nullptr : RS); // Reset the iterator if we were at the beginning of the BB. if (AtBeginning) { I = BB->begin(); DoIncr = false; } MI = nullptr; break; } + // If we are looking at a call sequence, we need to keep track of + // the SP adjustment made by each instruction in the sequence. + // This includes both the frame setup/destroy pseudos (handled above), + // as well as other instructions that have side effects w.r.t the SP. + // Note that this must come after eliminateFrameIndex, because + // if I itself referred to a frame index, we shouldn't count its own + // adjustment. + if (MI && InsideCallSequence) + SPAdj += TII.getSPAdjust(MI); + if (DoIncr && I != BB->end()) ++I; // Update register states. if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI); } } /// scavengeFrameVirtualRegs - Replace all frame index virtual registers /// with physical registers. Use the register scavenger to find an /// appropriate register to use. /// /// FIXME: Iterating over the instruction stream is unnecessary. 
We can simply /// iterate over the vreg use list, which at this point only contains machine /// operands for which eliminateFrameIndex need a new scratch reg. void PEI::scavengeFrameVirtualRegs(MachineFunction &Fn) { // Run through the instructions and find any virtual registers. for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) { RS->enterBasicBlock(BB); int SPAdj = 0; // The instruction stream may change in the loop, so check BB->end() // directly. for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) { // We might end up here again with a NULL iterator if we scavenged a // register for which we inserted spill code for definition by what was // originally the first instruction in BB. if (I == MachineBasicBlock::iterator(nullptr)) I = BB->begin(); MachineInstr *MI = I; MachineBasicBlock::iterator J = std::next(I); MachineBasicBlock::iterator P = I == BB->begin() ? MachineBasicBlock::iterator(nullptr) : std::prev(I); // RS should process this instruction before we might scavenge at this // location. This is because we might be replacing a virtual register // defined by this instruction, and if so, registers killed by this // instruction are available, and defined registers are not. RS->forward(I); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { if (MI->getOperand(i).isReg()) { MachineOperand &MO = MI->getOperand(i); unsigned Reg = MO.getReg(); if (Reg == 0) continue; if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; // When we first encounter a new virtual register, it // must be a definition. assert(MI->getOperand(i).isDef() && "frame index virtual missing def!"); // Scavenge a new scratch register const TargetRegisterClass *RC = Fn.getRegInfo().getRegClass(Reg); unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj); ++NumScavengedRegs; // Replace this reference to the virtual register with the // scratch register. 
assert (ScratchReg && "Missing scratch register!"); MachineRegisterInfo &MRI = Fn.getRegInfo(); Fn.getRegInfo().replaceRegWith(Reg, ScratchReg); // Make sure MRI now accounts this register as used. MRI.setPhysRegUsed(ScratchReg); // Because this instruction was processed by the RS before this // register was allocated, make sure that the RS now records the // register as being used. RS->setRegUsed(ScratchReg); } } // If the scavenger needed to use one of its spill slots, the // spill code will have been inserted in between I and J. This is a // problem because we need the spill code before I: Move I to just // prior to J. if (I != std::prev(J)) { BB->splice(J, BB, I); // Before we move I, we need to prepare the RS to visit I again. // Specifically, RS will assert if it sees uses of registers that // it believes are undefined. Because we have already processed // register kills in I, when it visits I again, it will believe that // those registers are undefined. To avoid this situation, unprocess // the instruction I. assert(RS->getCurrentPosition() == I && "The register scavenger has an unexpected position"); I = P; RS->unprocess(P); } else ++I; } } } diff --git a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp index 1557d10238e9..e3f01912b872 100644 --- a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp +++ b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp @@ -1,44 +1,49 @@ //===----- TargetFrameLoweringImpl.cpp - Implement target frame interface --==// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // Implements the layout of a stack frame on the target machine. 
// //===----------------------------------------------------------------------===// #include "llvm/Target/TargetFrameLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include using namespace llvm; TargetFrameLowering::~TargetFrameLowering() { } /// getFrameIndexOffset - Returns the displacement from the frame register to /// the stack frame of the specified index. This is the default implementation /// which is overridden for some targets. int TargetFrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); return MFI->getObjectOffset(FI) + MFI->getStackSize() - getOffsetOfLocalArea() + MFI->getOffsetAdjustment(); } int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo(); // By default, assume all frame indices are referenced via whatever // getFrameRegister() says. The target can override this if it's doing // something different. FrameReg = RI->getFrameRegister(MF); return getFrameIndexOffset(MF, FI); } + +bool TargetFrameLowering::needsFrameIndexResolution( + const MachineFunction &MF) const { + return MF.getFrameInfo()->hasStackObjects(); +} diff --git a/contrib/llvm/lib/Target/X86/X86.h b/contrib/llvm/lib/Target/X86/X86.h index 8bd5817e528f..219b64d18d1d 100644 --- a/contrib/llvm/lib/Target/X86/X86.h +++ b/contrib/llvm/lib/Target/X86/X86.h @@ -1,72 +1,77 @@ //===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// //===----------------------------------------------------------------------===// // // This file contains the entry points for global functions defined in the x86 // target library, as used by the LLVM JIT. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_X86_X86_H #define LLVM_LIB_TARGET_X86_X86_H #include "llvm/Support/CodeGen.h" namespace llvm { class FunctionPass; class ImmutablePass; class X86TargetMachine; /// createX86ISelDag - This pass converts a legalized DAG into a /// X86-specific DAG, ready for instruction scheduling. /// FunctionPass *createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel); /// createX86GlobalBaseRegPass - This pass initializes a global base /// register for PIC on x86-32. FunctionPass* createX86GlobalBaseRegPass(); /// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses /// to local-dynamic TLS variables so that the TLS base address for the module /// is only fetched once per execution path through the function. FunctionPass *createCleanupLocalDynamicTLSPass(); /// createX86FloatingPointStackifierPass - This function returns a pass which /// converts floating point register references and pseudo instructions into /// floating point stack references and physical instructions. /// FunctionPass *createX86FloatingPointStackifierPass(); /// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions /// before each call to avoid transition penalty between functions encoded with /// AVX and SSE. FunctionPass *createX86IssueVZeroUpperPass(); /// createX86EmitCodeToMemory - Returns a pass that converts a register /// allocated function into raw machine code in a dynamically /// allocated chunk of memory. /// FunctionPass *createEmitX86CodeToMemory(); /// \brief Creates an X86-specific Target Transformation Info pass. 
ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM); /// createX86PadShortFunctions - Return a pass that pads short functions /// with NOOPs. This will prevent a stall when returning on the Atom. FunctionPass *createX86PadShortFunctions(); /// createX86FixupLEAs - Return a a pass that selectively replaces /// certain instructions (like add, sub, inc, dec, some shifts, /// and some multiplies) by equivalent LEA instructions, in order /// to eliminate execution delays in some Atom processors. FunctionPass *createX86FixupLEAs(); +/// createX86CallFrameOptimization - Return a pass that optimizes +/// the code-size of x86 call sequences. This is done by replacing +/// esp-relative movs with pushes. +FunctionPass *createX86CallFrameOptimization(); + } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp new file mode 100644 index 000000000000..fae489e77cc0 --- /dev/null +++ b/contrib/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -0,0 +1,400 @@ +//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a pass that optimizes call sequences on x86. +// Currently, it converts movs of function parameters onto the stack into +// pushes. This is beneficial for two main reasons: +// 1) The push instruction encoding is much smaller than an esp-relative mov +// 2) It is possible to push memory arguments directly. So, if the +// transformation is performed pre-reg-alloc, it can help relieve +// register pressure. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "X86MachineFunctionInfo.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +#define DEBUG_TYPE "x86-cf-opt" + +cl::opt NoX86CFOpt("no-x86-call-frame-opt", + cl::desc("Avoid optimizing x86 call frames for size"), + cl::init(false), cl::Hidden); + +namespace { +class X86CallFrameOptimization : public MachineFunctionPass { +public: + X86CallFrameOptimization() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + +private: + bool shouldPerformTransformation(MachineFunction &MF); + + bool adjustCallSequence(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I); + + MachineInstr *canFoldIntoRegPush(MachineBasicBlock::iterator FrameSetup, + unsigned Reg); + + const char *getPassName() const override { + return "X86 Optimize Call Frame"; + } + + const TargetInstrInfo *TII; + const TargetFrameLowering *TFL; + const MachineRegisterInfo *MRI; + static char ID; +}; + +char X86CallFrameOptimization::ID = 0; +} + +FunctionPass *llvm::createX86CallFrameOptimization() { + return new X86CallFrameOptimization(); +} + +// This checks whether the transformation is legal and profitable +bool X86CallFrameOptimization::shouldPerformTransformation(MachineFunction &MF) { + if (NoX86CFOpt.getValue()) + return false; + + // We currently only support call sequences where *all* parameters + // are passed on the stack. 
+ // No point in running this in 64-bit mode, since some arguments are + // passed in-register in all common calling conventions, so the pattern + // we're looking for will never match. + const X86Subtarget &STI = MF.getTarget().getSubtarget(); + if (STI.is64Bit()) + return false; + + // You would expect straight-line code between call-frame setup and + // call-frame destroy. You would be wrong. There are circumstances (e.g. + // CMOV_GR8 expansion of a select that feeds a function call!) where we can + // end up with the setup and the destroy in different basic blocks. + // This is bad, and breaks SP adjustment. + // So, check that all of the frames in the function are closed inside + // the same block, and, for good measure, that there are no nested frames. + int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); + int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + for (MachineBasicBlock &BB : MF) { + bool InsideFrameSequence = false; + for (MachineInstr &MI : BB) { + if (MI.getOpcode() == FrameSetupOpcode) { + if (InsideFrameSequence) + return false; + InsideFrameSequence = true; + } + else if (MI.getOpcode() == FrameDestroyOpcode) { + if (!InsideFrameSequence) + return false; + InsideFrameSequence = false; + } + } + + if (InsideFrameSequence) + return false; + } + + // Now that we know the transformation is legal, check if it is + // profitable. + // TODO: Add a heuristic that actually looks at the function, + // and enable this for more cases. + + // This transformation is always a win when we do not expect to have + // a reserved call frame. Under other circumstances, it may be either + // a win or a loss, and requires a heuristic. + // For now, enable it only for the relatively clear win cases. + bool CannotReserveFrame = MF.getFrameInfo()->hasVarSizedObjects(); + if (CannotReserveFrame) + return true; + + // For now, don't even try to evaluate the profitability when + // not optimizing for size. 
+ AttributeSet FnAttrs = MF.getFunction()->getAttributes(); + bool OptForSize = + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, + Attribute::OptimizeForSize) || + FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize); + + if (!OptForSize) + return false; + + // Stack re-alignment can make this unprofitable even in terms of size. + // As mentioned above, a better heuristic is needed. For now, don't do this + // when the required alignment is above 8. (4 would be the safe choice, but + // some experimentation showed 8 is generally good). + if (TFL->getStackAlignment() > 8) + return false; + + return true; +} + +bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getSubtarget().getInstrInfo(); + TFL = MF.getSubtarget().getFrameLowering(); + MRI = &MF.getRegInfo(); + + if (!shouldPerformTransformation(MF)) + return false; + + int FrameSetupOpcode = TII->getCallFrameSetupOpcode(); + + bool Changed = false; + + for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) + for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) + if (I->getOpcode() == FrameSetupOpcode) + Changed |= adjustCallSequence(MF, *BB, I); + + return Changed; +} + +bool X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + + // Check that this particular call sequence is amenable to the + // transformation. + const X86RegisterInfo &RegInfo = *static_cast( + MF.getSubtarget().getRegisterInfo()); + unsigned StackPtr = RegInfo.getStackRegister(); + int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode(); + + // We expect to enter this at the beginning of a call sequence + assert(I->getOpcode() == TII->getCallFrameSetupOpcode()); + MachineBasicBlock::iterator FrameSetup = I++; + + + // For globals in PIC mode, we can have some LEAs here. + // Ignore them, they don't bother us. + // TODO: Extend this to something that covers more cases. 
+ while (I->getOpcode() == X86::LEA32r) + ++I; + + // We expect a copy instruction here. + // TODO: The copy instruction is a lowering artifact. + // We should also support a copy-less version, where the stack + // pointer is used directly. + if (!I->isCopy() || !I->getOperand(0).isReg()) + return false; + MachineBasicBlock::iterator SPCopy = I++; + StackPtr = SPCopy->getOperand(0).getReg(); + + // Scan the call setup sequence for the pattern we're looking for. + // We only handle a simple case - a sequence of MOV32mi or MOV32mr + // instructions, that push a sequence of 32-bit values onto the stack, with + // no gaps between them. + SmallVector MovVector(4, nullptr); + unsigned int MaxAdjust = FrameSetup->getOperand(0).getImm() / 4; + if (MaxAdjust > 4) + MovVector.resize(MaxAdjust, nullptr); + + do { + int Opcode = I->getOpcode(); + if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) + break; + + // We only want movs of the form: + // movl imm/r32, k(%esp) + // If we run into something else, bail. + // Note that AddrBaseReg may, counter to its name, not be a register, + // but rather a frame index. + // TODO: Support the fi case. This should probably work now that we + // have the infrastructure to track the stack pointer within a call + // sequence. + if (!I->getOperand(X86::AddrBaseReg).isReg() || + (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || + !I->getOperand(X86::AddrScaleAmt).isImm() || + (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || + (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || + (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || + !I->getOperand(X86::AddrDisp).isImm()) + return false; + + int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); + assert(StackDisp >= 0 && "Negative stack displacement when passing parameters"); + + // We really don't want to consider the unaligned case. 
+ if (StackDisp % 4) + return false; + StackDisp /= 4; + + assert((size_t)StackDisp < MovVector.size() && + "Function call has more parameters than the stack is adjusted for."); + + // If the same stack slot is being filled twice, something's fishy. + if (MovVector[StackDisp] != nullptr) + return false; + MovVector[StackDisp] = I; + + ++I; + } while (I != MBB.end()); + + // We now expect the end of the sequence - a call and a stack adjust. + if (I == MBB.end()) + return false; + + // For PCrel calls, we expect an additional COPY of the basereg. + // If we find one, skip it. + if (I->isCopy()) { + if (I->getOperand(1).getReg() == + MF.getInfo()->getGlobalBaseReg()) + ++I; + else + return false; + } + + if (!I->isCall()) + return false; + MachineBasicBlock::iterator Call = I; + if ((++I)->getOpcode() != FrameDestroyOpcode) + return false; + + // Now, go through the vector, and see that we don't have any gaps, + // but only a series of 32-bit MOVs. + + int64_t ExpectedDist = 0; + auto MMI = MovVector.begin(), MME = MovVector.end(); + for (; MMI != MME; ++MMI, ExpectedDist += 4) + if (*MMI == nullptr) + break; + + // If the call had no parameters, do nothing + if (!ExpectedDist) + return false; + + // We are either at the last parameter, or a gap. + // Make sure it's not a gap + for (; MMI != MME; ++MMI) + if (*MMI != nullptr) + return false; + + // Ok, we can in fact do the transformation for this call. + // Do not remove the FrameSetup instruction, but adjust the parameters. + // PEI will end up finalizing the handling of this. + FrameSetup->getOperand(1).setImm(ExpectedDist); + + DebugLoc DL = I->getDebugLoc(); + // Now, iterate through the vector in reverse order, and replace the movs + // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to + // replace uses. 
+ for (int Idx = (ExpectedDist / 4) - 1; Idx >= 0; --Idx) { + MachineBasicBlock::iterator MOV = *MovVector[Idx]; + MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); + if (MOV->getOpcode() == X86::MOV32mi) { + unsigned PushOpcode = X86::PUSHi32; + // If the operand is a small (8-bit) immediate, we can use a + // PUSH instruction with a shorter encoding. + // Note that isImm() may fail even though this is a MOVmi, because + // the operand can also be a symbol. + if (PushOp.isImm()) { + int64_t Val = PushOp.getImm(); + if (isInt<8>(Val)) + PushOpcode = X86::PUSH32i8; + } + BuildMI(MBB, Call, DL, TII->get(PushOpcode)).addOperand(PushOp); + } else { + unsigned int Reg = PushOp.getReg(); + + // If PUSHrmm is not slow on this target, try to fold the source of the + // push into the instruction. + const X86Subtarget &ST = MF.getTarget().getSubtarget(); + bool SlowPUSHrmm = ST.isAtom() || ST.isSLM(); + + // Check that this is legal to fold. Right now, we're extremely + // conservative about that. + MachineInstr *DefMov = nullptr; + if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) { + MachineInstr *Push = BuildMI(MBB, Call, DL, TII->get(X86::PUSH32rmm)); + + unsigned NumOps = DefMov->getDesc().getNumOperands(); + for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) + Push->addOperand(DefMov->getOperand(i)); + + DefMov->eraseFromParent(); + } else { + BuildMI(MBB, Call, DL, TII->get(X86::PUSH32r)).addReg(Reg).getInstr(); + } + } + + MBB.erase(MOV); + } + + // The stack-pointer copy is no longer used in the call sequences. + // There should not be any other users, but we can't commit to that, so: + if (MRI->use_empty(SPCopy->getOperand(0).getReg())) + SPCopy->eraseFromParent(); + + // Once we've done this, we need to make sure PEI doesn't assume a reserved + // frame. 
+ X86MachineFunctionInfo *FuncInfo = MF.getInfo(); + FuncInfo->setHasPushSequences(true); + + return true; +} + +MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush( + MachineBasicBlock::iterator FrameSetup, unsigned Reg) { + // Do an extremely restricted form of load folding. + // ISel will often create patterns like: + // movl 4(%edi), %eax + // movl 8(%edi), %ecx + // movl 12(%edi), %edx + // movl %edx, 8(%esp) + // movl %ecx, 4(%esp) + // movl %eax, (%esp) + // call + // Get rid of those with prejudice. + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return nullptr; + + // Make sure this is the only use of Reg. + if (!MRI->hasOneNonDBGUse(Reg)) + return nullptr; + + MachineBasicBlock::iterator DefMI = MRI->getVRegDef(Reg); + + // Make sure the def is a MOV from memory. + // If the def is in another block, give up. + if (DefMI->getOpcode() != X86::MOV32rm || + DefMI->getParent() != FrameSetup->getParent()) + return nullptr; + + // Be careful with movs that load from a stack slot, since it may get + // resolved incorrectly. + // TODO: Again, we already have the infrastructure, so this should work. + if (!DefMI->getOperand(1).isReg()) + return nullptr; + + // Now, make sure everything else up until the ADJCALLSTACK is a sequence + // of MOVs. To be less conservative would require duplicating a lot of the + // logic from PeepholeOptimizer. + // FIXME: A possibly better approach would be to teach the PeepholeOptimizer + // to be smarter about folding into pushes. 
+ for (auto I = DefMI; I != FrameSetup; ++I) + if (I->getOpcode() != X86::MOV32rm) + return nullptr; + + return DefMI; +} diff --git a/contrib/llvm/lib/Target/X86/X86FastISel.cpp b/contrib/llvm/lib/Target/X86/X86FastISel.cpp index 5d71eac7c05a..688a5447b8e6 100644 --- a/contrib/llvm/lib/Target/X86/X86FastISel.cpp +++ b/contrib/llvm/lib/Target/X86/X86FastISel.cpp @@ -1,3352 +1,3352 @@ //===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the X86-specific support for the FastISel class. Much // of the target-specific code is generated by tablegen in the file // X86GenFastISel.inc, which is #included here. // //===----------------------------------------------------------------------===// #include "X86.h" #include "X86CallingConv.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FastISel.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Operator.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; namespace { class X86FastISel final : public FastISel { /// Subtarget - Keep a 
pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 /// floating point ops. /// When SSE is available, use it for f32 operations. /// When SSE2 is available, use it for f64 operations. bool X86ScalarSSEf64; bool X86ScalarSSEf32; public: explicit X86FastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) : FastISel(funcInfo, libInfo) { Subtarget = &TM.getSubtarget(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); } bool fastSelectInstruction(const Instruction *I) override; /// \brief The specified machine instr operand is a vreg, and that /// vreg is being provided by the specified load instruction. If possible, /// try to fold the load as an operand to the instruction, returning true if /// possible. bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const LoadInst *LI) override; bool fastLowerArguments() override; bool fastLowerCall(CallLoweringInfo &CLI) override; bool fastLowerIntrinsicCall(const IntrinsicInst *II) override; #include "X86GenFastISel.inc" private: bool X86FastEmitCompare(const Value *LHS, const Value *RHS, EVT VT); bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg); bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, MachineMemOperand *MMO = nullptr, bool Aligned = false); bool X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, const X86AddressMode &AM, MachineMemOperand *MMO = nullptr, bool Aligned = false); bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg); bool X86SelectAddress(const Value *V, X86AddressMode &AM); bool X86SelectCallAddress(const Value *V, X86AddressMode &AM); bool X86SelectLoad(const Instruction *I); bool X86SelectStore(const Instruction *I); bool X86SelectRet(const Instruction 
*I); bool X86SelectCmp(const Instruction *I); bool X86SelectZExt(const Instruction *I); bool X86SelectBranch(const Instruction *I); bool X86SelectShift(const Instruction *I); bool X86SelectDivRem(const Instruction *I); bool X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I); bool X86FastEmitSSESelect(MVT RetVT, const Instruction *I); bool X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I); bool X86SelectSelect(const Instruction *I); bool X86SelectTrunc(const Instruction *I); bool X86SelectFPExt(const Instruction *I); bool X86SelectFPTrunc(const Instruction *I); const X86InstrInfo *getInstrInfo() const { return getTargetMachine()->getSubtargetImpl()->getInstrInfo(); } const X86TargetMachine *getTargetMachine() const { return static_cast(&TM); } bool handleConstantAddresses(const Value *V, X86AddressMode &AM); unsigned X86MaterializeInt(const ConstantInt *CI, MVT VT); unsigned X86MaterializeFP(const ConstantFP *CFP, MVT VT); unsigned X86MaterializeGV(const GlobalValue *GV, MVT VT); unsigned fastMaterializeConstant(const Constant *C) override; unsigned fastMaterializeAlloca(const AllocaInst *C) override; unsigned fastMaterializeFloatZero(const ConstantFP *CF) override; /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is /// computed in an SSE register, not on the X87 floating point stack. bool isScalarFPTypeInSSEReg(EVT VT) const { return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2 (VT == MVT::f32 && X86ScalarSSEf32); // f32 is when SSE1 } bool isTypeLegal(Type *Ty, MVT &VT, bool AllowI1 = false); bool IsMemcpySmall(uint64_t Len); bool TryEmitSmallMemcpy(X86AddressMode DestAM, X86AddressMode SrcAM, uint64_t Len); bool foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, const Value *Cond); }; } // end anonymous namespace. 
static std::pair getX86ConditionCode(CmpInst::Predicate Predicate) { X86::CondCode CC = X86::COND_INVALID; bool NeedSwap = false; switch (Predicate) { default: break; // Floating-point Predicates case CmpInst::FCMP_UEQ: CC = X86::COND_E; break; case CmpInst::FCMP_OLT: NeedSwap = true; // fall-through case CmpInst::FCMP_OGT: CC = X86::COND_A; break; case CmpInst::FCMP_OLE: NeedSwap = true; // fall-through case CmpInst::FCMP_OGE: CC = X86::COND_AE; break; case CmpInst::FCMP_UGT: NeedSwap = true; // fall-through case CmpInst::FCMP_ULT: CC = X86::COND_B; break; case CmpInst::FCMP_UGE: NeedSwap = true; // fall-through case CmpInst::FCMP_ULE: CC = X86::COND_BE; break; case CmpInst::FCMP_ONE: CC = X86::COND_NE; break; case CmpInst::FCMP_UNO: CC = X86::COND_P; break; case CmpInst::FCMP_ORD: CC = X86::COND_NP; break; case CmpInst::FCMP_OEQ: // fall-through case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break; // Integer Predicates case CmpInst::ICMP_EQ: CC = X86::COND_E; break; case CmpInst::ICMP_NE: CC = X86::COND_NE; break; case CmpInst::ICMP_UGT: CC = X86::COND_A; break; case CmpInst::ICMP_UGE: CC = X86::COND_AE; break; case CmpInst::ICMP_ULT: CC = X86::COND_B; break; case CmpInst::ICMP_ULE: CC = X86::COND_BE; break; case CmpInst::ICMP_SGT: CC = X86::COND_G; break; case CmpInst::ICMP_SGE: CC = X86::COND_GE; break; case CmpInst::ICMP_SLT: CC = X86::COND_L; break; case CmpInst::ICMP_SLE: CC = X86::COND_LE; break; } return std::make_pair(CC, NeedSwap); } static std::pair getX86SSEConditionCode(CmpInst::Predicate Predicate) { unsigned CC; bool NeedSwap = false; // SSE Condition code mapping: // 0 - EQ // 1 - LT // 2 - LE // 3 - UNORD // 4 - NEQ // 5 - NLT // 6 - NLE // 7 - ORD switch (Predicate) { default: llvm_unreachable("Unexpected predicate"); case CmpInst::FCMP_OEQ: CC = 0; break; case CmpInst::FCMP_OGT: NeedSwap = true; // fall-through case CmpInst::FCMP_OLT: CC = 1; break; case CmpInst::FCMP_OGE: NeedSwap = true; // fall-through case CmpInst::FCMP_OLE: CC = 2; break; 
case CmpInst::FCMP_UNO: CC = 3; break; case CmpInst::FCMP_UNE: CC = 4; break; case CmpInst::FCMP_ULE: NeedSwap = true; // fall-through case CmpInst::FCMP_UGE: CC = 5; break; case CmpInst::FCMP_ULT: NeedSwap = true; // fall-through case CmpInst::FCMP_UGT: CC = 6; break; case CmpInst::FCMP_ORD: CC = 7; break; case CmpInst::FCMP_UEQ: case CmpInst::FCMP_ONE: CC = 8; break; } return std::make_pair(CC, NeedSwap); } /// \brief Check if it is possible to fold the condition from the XALU intrinsic /// into the user. The condition code will only be updated on success. bool X86FastISel::foldX86XALUIntrinsic(X86::CondCode &CC, const Instruction *I, const Value *Cond) { if (!isa(Cond)) return false; const auto *EV = cast(Cond); if (!isa(EV->getAggregateOperand())) return false; const auto *II = cast(EV->getAggregateOperand()); MVT RetVT; const Function *Callee = II->getCalledFunction(); Type *RetTy = cast(Callee->getReturnType())->getTypeAtIndex(0U); if (!isTypeLegal(RetTy, RetVT)) return false; if (RetVT != MVT::i32 && RetVT != MVT::i64) return false; X86::CondCode TmpCC; switch (II->getIntrinsicID()) { default: return false; case Intrinsic::sadd_with_overflow: case Intrinsic::ssub_with_overflow: case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: TmpCC = X86::COND_O; break; case Intrinsic::uadd_with_overflow: case Intrinsic::usub_with_overflow: TmpCC = X86::COND_B; break; } // Check if both instructions are in the same basic block. if (II->getParent() != I->getParent()) return false; // Make sure nothing is in the way BasicBlock::const_iterator Start = I; BasicBlock::const_iterator End = II; for (auto Itr = std::prev(Start); Itr != End; --Itr) { // We only expect extractvalue instructions between the intrinsic and the // instruction to be selected. if (!isa(Itr)) return false; // Check that the extractvalue operand comes from the intrinsic. 
const auto *EVI = cast(Itr); if (EVI->getAggregateOperand() != II) return false; } CC = TmpCC; return true; } bool X86FastISel::isTypeLegal(Type *Ty, MVT &VT, bool AllowI1) { EVT evt = TLI.getValueType(Ty, /*HandleUnknown=*/true); if (evt == MVT::Other || !evt.isSimple()) // Unhandled type. Halt "fast" selection and bail. return false; VT = evt.getSimpleVT(); // For now, require SSE/SSE2 for performing floating-point operations, // since x87 requires additional work. if (VT == MVT::f64 && !X86ScalarSSEf64) return false; if (VT == MVT::f32 && !X86ScalarSSEf32) return false; // Similarly, no f80 support yet. if (VT == MVT::f80) return false; // We only handle legal types. For example, on x86-32 the instruction // selector contains all of the 64-bit instructions from x86-64, // under the assumption that i64 won't be used if the target doesn't // support it. return (AllowI1 && VT == MVT::i1) || TLI.isTypeLegal(VT); } #include "X86GenCallingConv.inc" /// X86FastEmitLoad - Emit a machine instruction to load a value of type VT. /// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV. /// Return true and the result register by reference if it is possible. bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM, MachineMemOperand *MMO, unsigned &ResultReg) { // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: case MVT::i8: Opc = X86::MOV8rm; RC = &X86::GR8RegClass; break; case MVT::i16: Opc = X86::MOV16rm; RC = &X86::GR16RegClass; break; case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; case MVT::i64: // Must be in x86-64 mode. Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; case MVT::f32: if (X86ScalarSSEf32) { Opc = Subtarget->hasAVX() ? 
X86::VMOVSSrm : X86::MOVSSrm; RC = &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; RC = &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; RC = &X86::RFP64RegClass; } break; case MVT::f80: // No f80 support yet. return false; } ResultReg = createResultReg(RC); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); addFullAddress(MIB, AM); if (MMO) MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } /// X86FastEmitStore - Emit a machine instruction to store a value Val of /// type VT. The address is either pre-computed, consisted of a base ptr, Ptr /// and a displacement offset, or a GlobalAddress, /// i.e. V. Return true if it is possible. bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, const X86AddressMode &AM, MachineMemOperand *MMO, bool Aligned) { // Get opcode and regclass of the output for the given store instruction. unsigned Opc = 0; switch (VT.getSimpleVT().SimpleTy) { case MVT::f80: // No f80 support yet. default: return false; case MVT::i1: { // Mask out all but lowest bit. unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::AND8ri), AndResult) .addReg(ValReg, getKillRegState(ValIsKill)).addImm(1); ValReg = AndResult; } // FALLTHROUGH, handling i1 as i8. case MVT::i8: Opc = X86::MOV8mr; break; case MVT::i16: Opc = X86::MOV16mr; break; case MVT::i32: Opc = X86::MOV32mr; break; case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode. case MVT::f32: Opc = X86ScalarSSEf32 ? (Subtarget->hasAVX() ? X86::VMOVSSmr : X86::MOVSSmr) : X86::ST_Fp32m; break; case MVT::f64: Opc = X86ScalarSSEf64 ? (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m; break; case MVT::v4f32: if (Aligned) Opc = Subtarget->hasAVX() ? 
X86::VMOVAPSmr : X86::MOVAPSmr; else Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr; break; case MVT::v2f64: if (Aligned) Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr; else Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr; break; case MVT::v4i32: case MVT::v2i64: case MVT::v8i16: case MVT::v16i8: if (Aligned) Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr; else Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr; break; } MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); addFullAddress(MIB, AM).addReg(ValReg, getKillRegState(ValIsKill)); if (MMO) MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM, MachineMemOperand *MMO, bool Aligned) { // Handle 'null' like i32/i64 0. if (isa(Val)) Val = Constant::getNullValue(DL.getIntPtrType(Val->getContext())); // If this is a store of a simple constant, fold the constant into the store. if (const ConstantInt *CI = dyn_cast(Val)) { unsigned Opc = 0; bool Signed = true; switch (VT.getSimpleVT().SimpleTy) { default: break; case MVT::i1: Signed = false; // FALLTHROUGH to handle as i8. case MVT::i8: Opc = X86::MOV8mi; break; case MVT::i16: Opc = X86::MOV16mi; break; case MVT::i32: Opc = X86::MOV32mi; break; case MVT::i64: // Must be a 32-bit sign extended value. if (isInt<32>(CI->getSExtValue())) Opc = X86::MOV64mi32; break; } if (Opc) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)); addFullAddress(MIB, AM).addImm(Signed ? 
(uint64_t) CI->getSExtValue() : CI->getZExtValue()); if (MMO) MIB->addMemOperand(*FuncInfo.MF, MMO); return true; } } unsigned ValReg = getRegForValue(Val); if (ValReg == 0) return false; bool ValKill = hasTrivialKill(Val); return X86FastEmitStore(VT, ValReg, ValKill, AM, MMO, Aligned); } /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of /// type SrcVT to type DstVT using the specified extension opcode Opc (e.g. /// ISD::SIGN_EXTEND). bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, unsigned &ResultReg) { unsigned RR = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src, /*TODO: Kill=*/false); if (RR == 0) return false; ResultReg = RR; return true; } bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) { // Handle constant address. if (const GlobalValue *GV = dyn_cast(V)) { // Can't handle alternate code models yet. if (TM.getCodeModel() != CodeModel::Small) return false; // Can't handle TLS yet. if (GV->isThreadLocal()) return false; // RIP-relative addresses can't have additional register operands, so if // we've already folded stuff into the addressing mode, just force the // global value into its own register, which we can use as the basereg. if (!Subtarget->isPICStyleRIPRel() || (AM.Base.Reg == 0 && AM.IndexReg == 0)) { // Okay, we've committed to selecting this global. Set up the address. AM.GV = GV; // Allow the subtarget to classify the global. unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM); // If this reference is relative to the pic base, set it now. if (isGlobalRelativeToPICBase(GVFlags)) { // FIXME: How do we know Base.Reg is free?? AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); } // Unless the ABI requires an extra load, return a direct reference to // the global. if (!isGlobalStubReference(GVFlags)) { if (Subtarget->isPICStyleRIPRel()) { // Use rip-relative addressing if we can. 
Above we verified that the // base and index registers are unused. assert(AM.Base.Reg == 0 && AM.IndexReg == 0); AM.Base.Reg = X86::RIP; } AM.GVOpFlags = GVFlags; return true; } // Ok, we need to do a load from a stub. If we've already loaded from // this stub, reuse the loaded pointer, otherwise emit the load now. DenseMap::iterator I = LocalValueMap.find(V); unsigned LoadReg; if (I != LocalValueMap.end() && I->second != 0) { LoadReg = I->second; } else { // Issue load from stub. unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; X86AddressMode StubAM; StubAM.Base.Reg = AM.Base.Reg; StubAM.GV = GV; StubAM.GVOpFlags = GVFlags; // Prepare for inserting code in the local-value area. SavePoint SaveInsertPt = enterLocalValueArea(); if (TLI.getPointerTy() == MVT::i64) { Opc = X86::MOV64rm; RC = &X86::GR64RegClass; if (Subtarget->isPICStyleRIPRel()) StubAM.Base.Reg = X86::RIP; } else { Opc = X86::MOV32rm; RC = &X86::GR32RegClass; } LoadReg = createResultReg(RC); MachineInstrBuilder LoadMI = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), LoadReg); addFullAddress(LoadMI, StubAM); // Ok, back to normal mode. leaveLocalValueArea(SaveInsertPt); // Prevent loading GV stub multiple times in same MBB. LocalValueMap[V] = LoadReg; } // Now construct the final address. Note that the Disp, Scale, // and Index values may already be set here. AM.Base.Reg = LoadReg; AM.GV = nullptr; return true; } } // If all else fails, try to materialize the value in a register. if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { if (AM.Base.Reg == 0) { AM.Base.Reg = getRegForValue(V); return AM.Base.Reg != 0; } if (AM.IndexReg == 0) { assert(AM.Scale == 1 && "Scale with no index!"); AM.IndexReg = getRegForValue(V); return AM.IndexReg != 0; } } return false; } /// X86SelectAddress - Attempt to fill in an address from the given value. 
/// bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { SmallVector GEPs; redo_gep: const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; if (const Instruction *I = dyn_cast(V)) { // Don't walk into other basic blocks; it's possible we haven't // visited them yet, so the instructions may not yet be assigned // virtual registers. if (FuncInfo.StaticAllocaMap.count(static_cast(V)) || FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) { Opcode = I->getOpcode(); U = I; } } else if (const ConstantExpr *C = dyn_cast(V)) { Opcode = C->getOpcode(); U = C; } if (PointerType *Ty = dyn_cast(V->getType())) if (Ty->getAddressSpace() > 255) // Fast instruction selection doesn't support the special // address spaces. return false; switch (Opcode) { default: break; case Instruction::BitCast: // Look past bitcasts. return X86SelectAddress(U->getOperand(0), AM); case Instruction::IntToPtr: // Look past no-op inttoptrs. if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) return X86SelectAddress(U->getOperand(0), AM); break; case Instruction::PtrToInt: // Look past no-op ptrtoints. if (TLI.getValueType(U->getType()) == TLI.getPointerTy()) return X86SelectAddress(U->getOperand(0), AM); break; case Instruction::Alloca: { // Do static allocas. const AllocaInst *A = cast(V); DenseMap::iterator SI = FuncInfo.StaticAllocaMap.find(A); if (SI != FuncInfo.StaticAllocaMap.end()) { AM.BaseType = X86AddressMode::FrameIndexBase; AM.Base.FrameIndex = SI->second; return true; } break; } case Instruction::Add: { // Adds of constants are common and easy enough. if (const ConstantInt *CI = dyn_cast(U->getOperand(1))) { uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue(); // They have to fit in the 32-bit signed displacement field though. if (isInt<32>(Disp)) { AM.Disp = (uint32_t)Disp; return X86SelectAddress(U->getOperand(0), AM); } } break; } case Instruction::GetElementPtr: { X86AddressMode SavedAM = AM; // Pattern-match simple GEPs. 
uint64_t Disp = (int32_t)AM.Disp; unsigned IndexReg = AM.IndexReg; unsigned Scale = AM.Scale; gep_type_iterator GTI = gep_type_begin(U); // Iterate through the indices, folding what we can. Constants can be // folded, and one dynamic index can be handled, if the scale is supported. for (User::const_op_iterator i = U->op_begin() + 1, e = U->op_end(); i != e; ++i, ++GTI) { const Value *Op = *i; if (StructType *STy = dyn_cast(*GTI)) { const StructLayout *SL = DL.getStructLayout(STy); Disp += SL->getElementOffset(cast(Op)->getZExtValue()); continue; } // A array/variable index is always of the form i*S where S is the // constant scale size. See if we can push the scale into immediates. uint64_t S = DL.getTypeAllocSize(GTI.getIndexedType()); for (;;) { if (const ConstantInt *CI = dyn_cast(Op)) { // Constant-offset addressing. Disp += CI->getSExtValue() * S; break; } if (canFoldAddIntoGEP(U, Op)) { // A compatible add with a constant operand. Fold the constant. ConstantInt *CI = cast(cast(Op)->getOperand(1)); Disp += CI->getSExtValue() * S; // Iterate on the other operand. Op = cast(Op)->getOperand(0); continue; } if (IndexReg == 0 && (!AM.GV || !Subtarget->isPICStyleRIPRel()) && (S == 1 || S == 2 || S == 4 || S == 8)) { // Scaled-index addressing. Scale = S; IndexReg = getRegForGEPIndex(Op).first; if (IndexReg == 0) return false; break; } // Unsupported. goto unsupported_gep; } } // Check for displacement overflow. if (!isInt<32>(Disp)) break; AM.IndexReg = IndexReg; AM.Scale = Scale; AM.Disp = (uint32_t)Disp; GEPs.push_back(V); if (const GetElementPtrInst *GEP = dyn_cast(U->getOperand(0))) { // Ok, the GEP indices were covered by constant-offset and scaled-index // addressing. Update the address state and move on to examining the base. V = GEP; goto redo_gep; } else if (X86SelectAddress(U->getOperand(0), AM)) { return true; } // If we couldn't merge the gep value into this addr mode, revert back to // our address and just match the value instead of completely failing. 
AM = SavedAM; for (SmallVectorImpl::reverse_iterator I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I) if (handleConstantAddresses(*I, AM)) return true; return false; unsupported_gep: // Ok, the GEP indices weren't all covered. break; } } return handleConstantAddresses(V, AM); } /// X86SelectCallAddress - Attempt to fill in an address from the given value. /// bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { const User *U = nullptr; unsigned Opcode = Instruction::UserOp1; const Instruction *I = dyn_cast(V); // Record if the value is defined in the same basic block. // // This information is crucial to know whether or not folding an // operand is valid. // Indeed, FastISel generates or reuses a virtual register for all // operands of all instructions it selects. Obviously, the definition and // its uses must use the same virtual register otherwise the produced // code is incorrect. // Before instruction selection, FunctionLoweringInfo::set sets the virtual // registers for values that are alive across basic blocks. This ensures // that the values are consistently set between across basic block, even // if different instruction selection mechanisms are used (e.g., a mix of // SDISel and FastISel). // For values local to a basic block, the instruction selection process // generates these virtual registers with whatever method is appropriate // for its needs. In particular, FastISel and SDISel do not share the way // local virtual registers are set. // Therefore, this is impossible (or at least unsafe) to share values // between basic blocks unless they use the same instruction selection // method, which is not guarantee for X86. // Moreover, things like hasOneUse could not be used accurately, if we // allow to reference values across basic blocks whereas they are not // alive across basic blocks initially. 
bool InMBB = true; if (I) { Opcode = I->getOpcode(); U = I; InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock(); } else if (const ConstantExpr *C = dyn_cast(V)) { Opcode = C->getOpcode(); U = C; } switch (Opcode) { default: break; case Instruction::BitCast: // Look past bitcasts if its operand is in the same BB. if (InMBB) return X86SelectCallAddress(U->getOperand(0), AM); break; case Instruction::IntToPtr: // Look past no-op inttoptrs if its operand is in the same BB. if (InMBB && TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy()) return X86SelectCallAddress(U->getOperand(0), AM); break; case Instruction::PtrToInt: // Look past no-op ptrtoints if its operand is in the same BB. if (InMBB && TLI.getValueType(U->getType()) == TLI.getPointerTy()) return X86SelectCallAddress(U->getOperand(0), AM); break; } // Handle constant address. if (const GlobalValue *GV = dyn_cast(V)) { // Can't handle alternate code models yet. if (TM.getCodeModel() != CodeModel::Small) return false; // RIP-relative addresses can't have additional register operands. if (Subtarget->isPICStyleRIPRel() && (AM.Base.Reg != 0 || AM.IndexReg != 0)) return false; // Can't handle DLL Import. if (GV->hasDLLImportStorageClass()) return false; // Can't handle TLS. if (const GlobalVariable *GVar = dyn_cast(GV)) if (GVar->isThreadLocal()) return false; // Okay, we've committed to selecting this global. Set up the basic address. AM.GV = GV; // No ABI requires an extra load for anything other than DLLImport, which // we rejected above. Return a direct reference to the global. if (Subtarget->isPICStyleRIPRel()) { // Use rip-relative addressing if we can. Above we verified that the // base and index registers are unused. 
assert(AM.Base.Reg == 0 && AM.IndexReg == 0); AM.Base.Reg = X86::RIP; } else if (Subtarget->isPICStyleStubPIC()) { AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET; } else if (Subtarget->isPICStyleGOT()) { AM.GVOpFlags = X86II::MO_GOTOFF; } return true; } // If all else fails, try to materialize the value in a register. if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { if (AM.Base.Reg == 0) { AM.Base.Reg = getRegForValue(V); return AM.Base.Reg != 0; } if (AM.IndexReg == 0) { assert(AM.Scale == 1 && "Scale with no index!"); AM.IndexReg = getRegForValue(V); return AM.IndexReg != 0; } } return false; } /// X86SelectStore - Select and emit code to implement store instructions. bool X86FastISel::X86SelectStore(const Instruction *I) { // Atomic stores need special handling. const StoreInst *S = cast(I); if (S->isAtomic()) return false; const Value *Val = S->getValueOperand(); const Value *Ptr = S->getPointerOperand(); MVT VT; if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true)) return false; unsigned Alignment = S->getAlignment(); unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType()); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = ABIAlignment; bool Aligned = Alignment >= ABIAlignment; X86AddressMode AM; if (!X86SelectAddress(Ptr, AM)) return false; return X86FastEmitStore(VT, Val, AM, createMachineMemOperandFor(I), Aligned); } /// X86SelectRet - Select and emit code to implement ret instructions. bool X86FastISel::X86SelectRet(const Instruction *I) { const ReturnInst *Ret = cast(I); const Function &F = *I->getParent()->getParent(); const X86MachineFunctionInfo *X86MFInfo = FuncInfo.MF->getInfo(); if (!FuncInfo.CanLowerReturn) return false; CallingConv::ID CC = F.getCallingConv(); if (CC != CallingConv::C && CC != CallingConv::Fast && CC != CallingConv::X86_FastCall && CC != CallingConv::X86_64_SysV) return false; if (Subtarget->isCallingConvWin64(CC)) return false; // Don't handle popping bytes on return for now. 
if (X86MFInfo->getBytesToPopOnReturn() != 0) return false; // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) return false; // Let SDISel handle vararg functions. if (F.isVarArg()) return false; // Build a list of return value registers. SmallVector RetRegs; if (Ret->getNumOperands() > 0) { SmallVector Outs; GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI); // Analyze operands of the call, assigning locations to each operand. SmallVector ValLocs; CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, ValLocs, I->getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); const Value *RV = Ret->getOperand(0); unsigned Reg = getRegForValue(RV); if (Reg == 0) return false; // Only handle a single return value for now. if (ValLocs.size() != 1) return false; CCValAssign &VA = ValLocs[0]; // Don't bother handling odd stuff for now. if (VA.getLocInfo() != CCValAssign::Full) return false; // Only handle register returns for now. if (!VA.isRegLoc()) return false; // The calling-convention tables for x87 returns don't tell // the whole story. if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) return false; unsigned SrcReg = Reg + VA.getValNo(); EVT SrcVT = TLI.getValueType(RV->getType()); EVT DstVT = VA.getValVT(); // Special handling for extended integers. if (SrcVT != DstVT) { if (SrcVT != MVT::i1 && SrcVT != MVT::i8 && SrcVT != MVT::i16) return false; if (!Outs[0].Flags.isZExt() && !Outs[0].Flags.isSExt()) return false; assert(DstVT == MVT::i32 && "X86 should always ext to i32"); if (SrcVT == MVT::i1) { if (Outs[0].Flags.isSExt()) return false; SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; } unsigned Op = Outs[0].Flags.isZExt() ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; SrcReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Op, SrcReg, /*TODO: Kill=*/false); } // Make the copy. 
unsigned DstReg = VA.getLocReg(); const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg); // Avoid a cross-class copy. This is very unlikely. if (!SrcRC->contains(DstReg)) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), DstReg).addReg(SrcReg); // Add register to return instruction. RetRegs.push_back(VA.getLocReg()); } // The x86-64 ABI for returning structs by value requires that we copy // the sret argument into %rax for the return. We saved the argument into // a virtual register in the entry block, so now we copy the value out // and into %rax. We also do the same with %eax for Win32. if (F.hasStructRetAttr() && (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { unsigned Reg = X86MFInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); unsigned RetReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), RetReg).addReg(Reg); RetRegs.push_back(RetReg); } // Now emit the RET. MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Subtarget->is64Bit() ? X86::RETQ : X86::RETL)); for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) MIB.addReg(RetRegs[i], RegState::Implicit); return true; } /// X86SelectLoad - Select and emit code to implement load instructions. /// bool X86FastISel::X86SelectLoad(const Instruction *I) { const LoadInst *LI = cast(I); // Atomic loads need special handling. 
if (LI->isAtomic()) return false; MVT VT; if (!isTypeLegal(LI->getType(), VT, /*AllowI1=*/true)) return false; const Value *Ptr = LI->getPointerOperand(); X86AddressMode AM; if (!X86SelectAddress(Ptr, AM)) return false; unsigned ResultReg = 0; if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg)) return false; updateValueMap(I, ResultReg); return true; } static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { bool HasAVX = Subtarget->hasAVX(); bool X86ScalarSSEf32 = Subtarget->hasSSE1(); bool X86ScalarSSEf64 = Subtarget->hasSSE2(); switch (VT.getSimpleVT().SimpleTy) { default: return 0; case MVT::i8: return X86::CMP8rr; case MVT::i16: return X86::CMP16rr; case MVT::i32: return X86::CMP32rr; case MVT::i64: return X86::CMP64rr; case MVT::f32: return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0; case MVT::f64: return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0; } } /// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS /// of the comparison, return an opcode that works for the compare (e.g. /// CMP32ri) otherwise return 0. static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) { switch (VT.getSimpleVT().SimpleTy) { // Otherwise, we can't fold the immediate into this comparison. default: return 0; case MVT::i8: return X86::CMP8ri; case MVT::i16: return X86::CMP16ri; case MVT::i32: return X86::CMP32ri; case MVT::i64: // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext // field. if ((int)RHSC->getSExtValue() == RHSC->getSExtValue()) return X86::CMP64ri32; return 0; } } bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT) { unsigned Op0Reg = getRegForValue(Op0); if (Op0Reg == 0) return false; // Handle 'null' like i32/i64 0. if (isa(Op1)) Op1 = Constant::getNullValue(DL.getIntPtrType(Op0->getContext())); // We have two options: compare with register or immediate. 
If the RHS of // the compare is an immediate that we can fold into this compare, use // CMPri, otherwise use CMPrr. if (const ConstantInt *Op1C = dyn_cast(Op1)) { if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareImmOpc)) .addReg(Op0Reg) .addImm(Op1C->getSExtValue()); return true; } } unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget); if (CompareOpc == 0) return false; unsigned Op1Reg = getRegForValue(Op1); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CompareOpc)) .addReg(Op0Reg) .addReg(Op1Reg); return true; } bool X86FastISel::X86SelectCmp(const Instruction *I) { const CmpInst *CI = cast(I); MVT VT; if (!isTypeLegal(I->getOperand(0)->getType(), VT)) return false; // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); unsigned ResultReg = 0; switch (Predicate) { default: break; case CmpInst::FCMP_FALSE: { ResultReg = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV32r0), ResultReg); ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, X86::sub_8bit); if (!ResultReg) return false; break; } case CmpInst::FCMP_TRUE: { ResultReg = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), ResultReg).addImm(1); break; } } if (ResultReg) { updateValueMap(I, ResultReg); return true; } const Value *LHS = CI->getOperand(0); const Value *RHS = CI->getOperand(1); // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. // We don't have to materialize a zero constant for this case and can just use // %x again on the RHS. if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { const auto *RHSC = dyn_cast(RHS); if (RHSC && RHSC->isNullValue()) RHS = LHS; } // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. 
static unsigned SETFOpcTable[2][3] = { { X86::SETEr, X86::SETNPr, X86::AND8rr }, { X86::SETNEr, X86::SETPr, X86::OR8rr } }; unsigned *SETFOpc = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; break; case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; break; } ResultReg = createResultReg(&X86::GR8RegClass); if (SETFOpc) { if (!X86FastEmitCompare(LHS, RHS, VT)) return false; unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), FlagReg1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), FlagReg2); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[2]), ResultReg).addReg(FlagReg1).addReg(FlagReg2); updateValueMap(I, ResultReg); return true; } X86::CondCode CC; bool SwapArgs; std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); unsigned Opc = X86::getSETFromCond(CC); if (SwapArgs) std::swap(LHS, RHS); // Emit a compare of LHS/RHS. if (!X86FastEmitCompare(LHS, RHS, VT)) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); updateValueMap(I, ResultReg); return true; } bool X86FastISel::X86SelectZExt(const Instruction *I) { EVT DstVT = TLI.getValueType(I->getType()); if (!TLI.isTypeLegal(DstVT)) return false; unsigned ResultReg = getRegForValue(I->getOperand(0)); if (ResultReg == 0) return false; // Handle zero-extension from i1 to i8, which is common. MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType()); if (SrcVT.SimpleTy == MVT::i1) { // Set the high bits to zero. ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; if (ResultReg == 0) return false; } if (DstVT == MVT::i64) { // Handle extension to 64-bits via sub-register shenanigans. 
unsigned MovInst; switch (SrcVT.SimpleTy) { case MVT::i8: MovInst = X86::MOVZX32rr8; break; case MVT::i16: MovInst = X86::MOVZX32rr16; break; case MVT::i32: MovInst = X86::MOV32rr; break; default: llvm_unreachable("Unexpected zext to i64 source type"); } unsigned Result32 = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32) .addReg(ResultReg); ResultReg = createResultReg(&X86::GR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) .addImm(0).addReg(Result32).addImm(X86::sub_32bit); } else if (DstVT != MVT::i8) { ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, ResultReg, /*Kill=*/true); if (ResultReg == 0) return false; } updateValueMap(I, ResultReg); return true; }

/// Select a conditional branch.
///
/// Four strategies are tried, in this order:
///  1. The condition is a single-use compare in the same block: emit the
///     compare and branch on the resulting flags directly.
///  2. The condition is a single-use truncate in the same block with a legal
///     i8/i16/i32/i64 source: TEST bit 0 of the source and branch.
///  3. foldX86XALUIntrinsic() accepts the condition: branch on the condition
///     code it reports.
///  4. Otherwise: TEST8ri bit 0 of the condition register and JNE.
///
/// \returns true if the branch was selected, false otherwise.
bool X86FastISel::X86SelectBranch(const Instruction *I) {
  // Unconditional branches are selected by tablegen-generated code.
  // Handle a conditional branch.
  const BranchInst *BI = cast(I);
  MachineBasicBlock *TrueMBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
  MachineBasicBlock *FalseMBB = FuncInfo.MBBMap[BI->getSuccessor(1)];

  // Fold the common case of a conditional branch with a comparison
  // in the same block (values defined on other blocks may not have
  // initialized registers).
  X86::CondCode CC;
  if (const CmpInst *CI = dyn_cast(BI->getCondition())) {
    if (CI->hasOneUse() && CI->getParent() == I->getParent()) {
      EVT VT = TLI.getValueType(CI->getOperand(0)->getType());

      // Try to optimize or fold the cmp.
      CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
      switch (Predicate) {
      default: break;
      // A constant predicate degenerates into an unconditional branch.
      case CmpInst::FCMP_FALSE: fastEmitBranch(FalseMBB, DbgLoc); return true;
      case CmpInst::FCMP_TRUE:  fastEmitBranch(TrueMBB, DbgLoc); return true;
      }

      const Value *CmpLHS = CI->getOperand(0);
      const Value *CmpRHS = CI->getOperand(1);

      // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x,
      // 0.0.
      // We don't have to materialize a zero constant for this case and can just
      // use %x again on the RHS.
      if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) {
        const auto *CmpRHSC = dyn_cast(CmpRHS);
        if (CmpRHSC && CmpRHSC->isNullValue())
          CmpRHS = CmpLHS;
      }

      // Try to take advantage of fallthrough opportunities.
      if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
        std::swap(TrueMBB, FalseMBB);
        Predicate = CmpInst::getInversePredicate(Predicate);
      }

      // FCMP_OEQ and FCMP_UNE cannot be expressed with a single flag/condition
      // code check. Instead two branch instructions are required to check all
      // the flags. First we change the predicate to a supported condition code,
      // which will be the first branch. Later on we will emit the second
      // branch.
      bool NeedExtraBranch = false;
      switch (Predicate) {
      default: break;
      case CmpInst::FCMP_OEQ:
        std::swap(TrueMBB, FalseMBB); // fall-through
      case CmpInst::FCMP_UNE:
        NeedExtraBranch = true;
        Predicate = CmpInst::FCMP_ONE;
        break;
      }

      bool SwapArgs;
      unsigned BranchOpc;
      std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
      assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");

      BranchOpc = X86::GetCondBranchFromCond(CC);
      if (SwapArgs)
        std::swap(CmpLHS, CmpRHS);

      // Emit a compare of the LHS and RHS, setting the flags.
      if (!X86FastEmitCompare(CmpLHS, CmpRHS, VT))
        return false;

      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
        .addMBB(TrueMBB);

      // X86 requires a second branch to handle UNE (and OEQ, which is mapped
      // to UNE above).
      if (NeedExtraBranch) {
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JP_1))
          .addMBB(TrueMBB);
      }

      // Obtain the branch weight and add the TrueBB to the successor list.
      uint32_t BranchWeight = 0;
      if (FuncInfo.BPI)
        BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
                                                   TrueMBB->getBasicBlock());
      FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);

      // Emits an unconditional branch to the FalseBB, obtains the branch
      // weight, and adds it to the successor list.
      fastEmitBranch(FalseMBB, DbgLoc);

      return true;
    }
    // A compare that is multi-use or lives in another block is not folded; it
    // is handled by the generic TEST/JNE sequence at the end of the function.
  } else if (TruncInst *TI = dyn_cast(BI->getCondition())) {
    // Handle things like "%cond = trunc i32 %X to i1 / br i1 %cond", which
    // typically happen for _Bool and C++ bools.
    MVT SourceVT;
    if (TI->hasOneUse() && TI->getParent() == I->getParent() &&
        isTypeLegal(TI->getOperand(0)->getType(), SourceVT)) {
      // Pick the TEST opcode matching the width of the truncate's source.
      unsigned TestOpc = 0;
      switch (SourceVT.SimpleTy) {
      default: break;
      case MVT::i8:  TestOpc = X86::TEST8ri; break;
      case MVT::i16: TestOpc = X86::TEST16ri; break;
      case MVT::i32: TestOpc = X86::TEST32ri; break;
      case MVT::i64: TestOpc = X86::TEST64ri32; break;
      }
      if (TestOpc) {
        unsigned OpReg = getRegForValue(TI->getOperand(0));
        if (OpReg == 0) return false;
        // Only bit 0 of the source is tested.
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
          .addReg(OpReg).addImm(1);

        // If the true target is the layout successor, branch on the inverted
        // condition to the other target instead.
        unsigned JmpOpc = X86::JNE_1;
        if (FuncInfo.MBB->isLayoutSuccessor(TrueMBB)) {
          std::swap(TrueMBB, FalseMBB);
          JmpOpc = X86::JE_1;
        }

        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(JmpOpc))
          .addMBB(TrueMBB);
        fastEmitBranch(FalseMBB, DbgLoc);
        uint32_t BranchWeight = 0;
        if (FuncInfo.BPI)
          BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
                                                     TrueMBB->getBasicBlock());
        FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
        return true;
      }
    }
  } else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
    // Fake request the condition, otherwise the intrinsic might be completely
    // optimized away.
    unsigned TmpReg = getRegForValue(BI->getCondition());
    if (TmpReg == 0)
      return false;

    // Branch on the condition code reported through CC.
    unsigned BranchOpc = X86::GetCondBranchFromCond(CC);

    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc))
      .addMBB(TrueMBB);
    fastEmitBranch(FalseMBB, DbgLoc);
    uint32_t BranchWeight = 0;
    if (FuncInfo.BPI)
      BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
                                                 TrueMBB->getBasicBlock());
    FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
    return true;
  }

  // Otherwise do a clumsy setcc and re-test it.
  // Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
  // in an explicit cast, so make sure to handle that correctly.
  unsigned OpReg = getRegForValue(BI->getCondition());
  if (OpReg == 0) return false;

  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
    .addReg(OpReg).addImm(1);
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::JNE_1))
    .addMBB(TrueMBB);
  fastEmitBranch(FalseMBB, DbgLoc);
  uint32_t BranchWeight = 0;
  if (FuncInfo.BPI)
    BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(),
                                               TrueMBB->getBasicBlock());
  FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight);
  return true;
}

bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned CReg = 0, OpReg = 0; const TargetRegisterClass *RC = nullptr; if (I->getType()->isIntegerTy(8)) { CReg = X86::CL; RC = &X86::GR8RegClass; switch (I->getOpcode()) { case Instruction::LShr: OpReg = X86::SHR8rCL; break; case Instruction::AShr: OpReg = X86::SAR8rCL; break; case Instruction::Shl: OpReg = X86::SHL8rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(16)) { CReg = X86::CX; RC = &X86::GR16RegClass; switch (I->getOpcode()) { case Instruction::LShr: OpReg = X86::SHR16rCL; break; case Instruction::AShr: OpReg = X86::SAR16rCL; break; case Instruction::Shl: OpReg = X86::SHL16rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(32)) { CReg = X86::ECX; RC = &X86::GR32RegClass; switch (I->getOpcode()) { case Instruction::LShr: OpReg =
X86::SHR32rCL; break; case Instruction::AShr: OpReg = X86::SAR32rCL; break; case Instruction::Shl: OpReg = X86::SHL32rCL; break; default: return false; } } else if (I->getType()->isIntegerTy(64)) { CReg = X86::RCX; RC = &X86::GR64RegClass; switch (I->getOpcode()) { case Instruction::LShr: OpReg = X86::SHR64rCL; break; case Instruction::AShr: OpReg = X86::SAR64rCL; break; case Instruction::Shl: OpReg = X86::SHL64rCL; break; default: return false; } } else { return false; } MVT VT; if (!isTypeLegal(I->getType(), VT)) return false; unsigned Op0Reg = getRegForValue(I->getOperand(0)); if (Op0Reg == 0) return false; unsigned Op1Reg = getRegForValue(I->getOperand(1)); if (Op1Reg == 0) return false; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CReg).addReg(Op1Reg); // The shift instruction uses X86::CL. If we defined a super-register // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. if (CReg != X86::CL) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::KILL), X86::CL) .addReg(CReg, RegState::Kill); unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) .addReg(Op0Reg); updateValueMap(I, ResultReg); return true; }

/// Select an SDiv / SRem / UDiv / URem instruction for i8, i16, i32 or i64.
///
/// The lowering is driven by OpTable, indexed first by value type and then by
/// operation. Each entry names the DIV/IDIV opcode, how the dividend is placed
/// into the fixed input register(s), and which physical register holds the
/// wanted result afterwards.
///
/// \returns true if the instruction was selected; false if the type is not
/// legal or not handled here (including i64 on a non-64-bit subtarget), or if
/// an operand register could not be obtained.
bool X86FastISel::X86SelectDivRem(const Instruction *I) {
  const static unsigned NumTypes = 4; // i8, i16, i32, i64
  const static unsigned NumOps   = 4; // SDiv, SRem, UDiv, URem
  const static bool S = true;  // IsSigned
  const static bool U = false; // !IsSigned
  const static unsigned Copy = TargetOpcode::COPY;
  // For the X86 DIV/IDIV instruction, in most cases the dividend
  // (numerator) must be in a specific register pair highreg:lowreg,
  // producing the quotient in lowreg and the remainder in highreg.
  // For most data types, to set up the instruction, the dividend is
  // copied into lowreg, and lowreg is sign-extended or zero-extended
  // into highreg.  The exception is i8, where the dividend is defined
  // as a single register rather than a register pair, and we
  // therefore directly sign-extend or zero-extend the dividend into
  // lowreg, instead of copying, and ignore the highreg.
  const static struct DivRemEntry {
    // The following portion depends only on the data type.
    const TargetRegisterClass *RC;
    unsigned LowInReg;  // low part of the register pair
    unsigned HighInReg; // high part of the register pair
    // The following portion depends on both the data type and the operation.
    struct DivRemResult {
    unsigned OpDivRem;        // The specific DIV/IDIV opcode to use.
    unsigned OpSignExtend;    // Opcode for sign-extending lowreg into
                              // highreg, or copying a zero into highreg.
    unsigned OpCopy;          // Opcode for copying dividend into lowreg, or
                              // zero/sign-extending into lowreg for i8.
    unsigned DivRemResultReg; // Register containing the desired result.
    bool IsOpSigned;          // Whether to use signed or unsigned form.
    } ResultTable[NumOps];
  } OpTable[NumTypes] = {
    { &X86::GR8RegClass,  X86::AX,  0, {
        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AL,  S }, // SDiv
        { X86::IDIV8r,  0,            X86::MOVSX16rr8, X86::AH,  S }, // SRem
        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AL,  U }, // UDiv
        { X86::DIV8r,   0,            X86::MOVZX16rr8, X86::AH,  U }, // URem
      }
    }, // i8
    { &X86::GR16RegClass, X86::AX,  X86::DX, {
        { X86::IDIV16r, X86::CWD,     Copy,            X86::AX,  S }, // SDiv
        { X86::IDIV16r, X86::CWD,     Copy,            X86::DX,  S }, // SRem
        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::AX,  U }, // UDiv
        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::DX,  U }, // URem
      }
    }, // i16
    { &X86::GR32RegClass, X86::EAX, X86::EDX, {
        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EAX, S }, // SDiv
        { X86::IDIV32r, X86::CDQ,     Copy,            X86::EDX, S }, // SRem
        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EAX, U }, // UDiv
        { X86::DIV32r,  X86::MOV32r0, Copy,            X86::EDX, U }, // URem
      }
    }, // i32
    { &X86::GR64RegClass, X86::RAX, X86::RDX, {
        { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
        { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
      }
    }, // i64
  };

  MVT VT;
  if (!isTypeLegal(I->getType(), VT))
    return false;

  // Map the value type and the IR opcode onto row/column indices of OpTable.
  unsigned TypeIndex, OpIndex;
  switch (VT.SimpleTy) {
  default: return false;
  case MVT::i8:  TypeIndex = 0; break;
  case MVT::i16: TypeIndex = 1; break;
  case MVT::i32: TypeIndex = 2; break;
  case MVT::i64: TypeIndex = 3;
    if (!Subtarget->is64Bit())
      return false;
    break;
  }

  switch (I->getOpcode()) {
  default: llvm_unreachable("Unexpected div/rem opcode");
  case Instruction::SDiv: OpIndex = 0; break;
  case Instruction::SRem: OpIndex = 1; break;
  case Instruction::UDiv: OpIndex = 2; break;
  case Instruction::URem: OpIndex = 3; break;
  }

  const DivRemEntry &TypeEntry = OpTable[TypeIndex];
  const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
  unsigned Op0Reg = getRegForValue(I->getOperand(0));
  if (Op0Reg == 0)
    return false;
  unsigned Op1Reg = getRegForValue(I->getOperand(1));
  if (Op1Reg == 0)
    return false;

  // Move op0 into low-order input register.
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
          TII.get(OpEntry.OpCopy), TypeEntry.LowInReg).addReg(Op0Reg);
  // Zero-extend or sign-extend into high-order input register.
  // (OpSignExtend is 0 for the i8 rows, which skip this step entirely.)
  if (OpEntry.OpSignExtend) {
    if (OpEntry.IsOpSigned)
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(OpEntry.OpSignExtend));
    else {
      unsigned Zero32 = createResultReg(&X86::GR32RegClass);
      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
              TII.get(X86::MOV32r0), Zero32);

      // Copy the zero into the appropriate sub/super/identical physical
      // register. Unfortunately the operations needed are not uniform enough
      // to fit neatly into the table above.
      if (VT.SimpleTy == MVT::i16) {
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                TII.get(Copy), TypeEntry.HighInReg)
          .addReg(Zero32, 0, X86::sub_16bit);
      } else if (VT.SimpleTy == MVT::i32) {
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                TII.get(Copy), TypeEntry.HighInReg)
            .addReg(Zero32);
      } else if (VT.SimpleTy == MVT::i64) {
        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
                TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
            .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
      }
    }
  }
  // Generate the DIV/IDIV instruction.
  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
          TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
  // For i8 remainder, we can't reference AH directly, as we'll end
  // up with bogus copies like %R9B = COPY %AH. Reference AX
  // instead to prevent AH references in a REX instruction.
  //
  // The current assumption of the fast register allocator is that isel
  // won't generate explicit references to the GPR8_NOREX registers. If
  // the allocator and/or the backend get enhanced to be more robust in
  // that regard, this can be, and should be, removed.
  unsigned ResultReg = 0;
  if ((I->getOpcode() == Instruction::SRem ||
       I->getOpcode() == Instruction::URem) &&
      OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
    unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
    unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(Copy), SourceSuperReg).addReg(X86::AX);

    // Shift AX right by 8 bits instead of using AH.
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SHR16ri),
            ResultSuperReg).addReg(SourceSuperReg).addImm(8);

    // Now reference the 8-bit subreg of the result.
    ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
                                           /*Kill=*/true, X86::sub_8bit);
  }
  // Copy the result out of the physreg if we haven't already.
  if (!ResultReg) {
    ResultReg = createResultReg(TypeEntry.RC);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), ResultReg)
        .addReg(OpEntry.DivRemResultReg);
  }
  updateValueMap(I, ResultReg);

  return true;
}

/// \brief Emit a conditional move instruction (if the are supported) to lower /// the select. bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) { // Check if the subtarget supports these instructions. if (!Subtarget->hasCMov()) return false; // FIXME: Add support for i8. if (RetVT < MVT::i16 || RetVT > MVT::i64) return false; const Value *Cond = I->getOperand(0); const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); bool NeedTest = true; X86::CondCode CC = X86::COND_NE; // Optimize conditions coming from a compare if both instructions are in the // same basic block (values defined in other basic blocks may not have // initialized registers). const auto *CI = dyn_cast(Cond); if (CI && (CI->getParent() == I->getParent())) { CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); // FCMP_OEQ and FCMP_UNE cannot be checked with a single instruction. static unsigned SETFOpcTable[2][3] = { { X86::SETNPr, X86::SETEr , X86::TEST8rr }, { X86::SETPr, X86::SETNEr, X86::OR8rr } }; unsigned *SETFOpc = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_OEQ: SETFOpc = &SETFOpcTable[0][0]; Predicate = CmpInst::ICMP_NE; break; case CmpInst::FCMP_UNE: SETFOpc = &SETFOpcTable[1][0]; Predicate = CmpInst::ICMP_NE; break; } bool NeedSwap; std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate); assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code."); const Value *CmpLHS = CI->getOperand(0); const Value *CmpRHS = CI->getOperand(1); if (NeedSwap) std::swap(CmpLHS, CmpRHS); EVT CmpVT = TLI.getValueType(CmpLHS->getType()); // Emit a compare of the LHS and RHS, setting the flags.
if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) return false; if (SETFOpc) { unsigned FlagReg1 = createResultReg(&X86::GR8RegClass); unsigned FlagReg2 = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[0]), FlagReg1); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(SETFOpc[1]), FlagReg2); auto const &II = TII.get(SETFOpc[2]); if (II.getNumDefs()) { unsigned TmpReg = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg) .addReg(FlagReg2).addReg(FlagReg1); } else { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II) .addReg(FlagReg2).addReg(FlagReg1); } } NeedTest = false; } else if (foldX86XALUIntrinsic(CC, I, Cond)) { // Fake request the condition, otherwise the intrinsic might be completely // optimized away. unsigned TmpReg = getRegForValue(Cond); if (TmpReg == 0) return false; NeedTest = false; } if (NeedTest) { // Selects operate on i1, however, CondReg is 8 bits width and may contain // garbage. Indeed, only the less significant bit is supposed to be // accurate. If we read more than the lsb, we may see non-zero values // whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for // the select. This is achieved by performing TEST against 1. 
unsigned CondReg = getRegForValue(Cond); if (CondReg == 0) return false; bool CondIsKill = hasTrivialKill(Cond); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); } const Value *LHS = I->getOperand(1); const Value *RHS = I->getOperand(2); unsigned RHSReg = getRegForValue(RHS); bool RHSIsKill = hasTrivialKill(RHS); unsigned LHSReg = getRegForValue(LHS); bool LHSIsKill = hasTrivialKill(LHS); if (!LHSReg || !RHSReg) return false; unsigned Opc = X86::getCMovFromCond(CC, RC->getSize()); unsigned ResultReg = fastEmitInst_rr(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill); updateValueMap(I, ResultReg); return true; } /// \brief Emit SSE instructions to lower the select. /// /// Try to use SSE1/SSE2 instructions to simulate a select without branches. /// This lowers fp selects into a CMP/AND/ANDN/OR sequence when the necessary /// SSE instructions are available. bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { // Optimize conditions coming from a compare if both instructions are in the // same basic block (values defined in other basic blocks may not have // initialized registers). const auto *CI = dyn_cast(I->getOperand(0)); if (!CI || (CI->getParent() != I->getParent())) return false; if (I->getType() != CI->getOperand(0)->getType() || !((Subtarget->hasSSE1() && RetVT == MVT::f32) || (Subtarget->hasSSE2() && RetVT == MVT::f64))) return false; const Value *CmpLHS = CI->getOperand(0); const Value *CmpRHS = CI->getOperand(1); CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); // The optimizer might have replaced fcmp oeq %x, %x with fcmp ord %x, 0.0. // We don't have to materialize a zero constant for this case and can just use // %x again on the RHS. 
if (Predicate == CmpInst::FCMP_ORD || Predicate == CmpInst::FCMP_UNO) { const auto *CmpRHSC = dyn_cast(CmpRHS); if (CmpRHSC && CmpRHSC->isNullValue()) CmpRHS = CmpLHS; } unsigned CC; bool NeedSwap; std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate); if (CC > 7) return false; if (NeedSwap) std::swap(CmpLHS, CmpRHS); static unsigned OpcTable[2][2][4] = { { { X86::CMPSSrr, X86::FsANDPSrr, X86::FsANDNPSrr, X86::FsORPSrr }, { X86::VCMPSSrr, X86::VFsANDPSrr, X86::VFsANDNPSrr, X86::VFsORPSrr } }, { { X86::CMPSDrr, X86::FsANDPDrr, X86::FsANDNPDrr, X86::FsORPDrr }, { X86::VCMPSDrr, X86::VFsANDPDrr, X86::VFsANDNPDrr, X86::VFsORPDrr } } }; bool HasAVX = Subtarget->hasAVX(); unsigned *Opc = nullptr; switch (RetVT.SimpleTy) { default: return false; case MVT::f32: Opc = &OpcTable[0][HasAVX][0]; break; case MVT::f64: Opc = &OpcTable[1][HasAVX][0]; break; } const Value *LHS = I->getOperand(1); const Value *RHS = I->getOperand(2); unsigned LHSReg = getRegForValue(LHS); bool LHSIsKill = hasTrivialKill(LHS); unsigned RHSReg = getRegForValue(RHS); bool RHSIsKill = hasTrivialKill(RHS); unsigned CmpLHSReg = getRegForValue(CmpLHS); bool CmpLHSIsKill = hasTrivialKill(CmpLHS); unsigned CmpRHSReg = getRegForValue(CmpRHS); bool CmpRHSIsKill = hasTrivialKill(CmpRHS); if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS) return false; const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill, CmpRHSReg, CmpRHSIsKill, CC); unsigned AndReg = fastEmitInst_rr(Opc[1], RC, CmpReg, /*IsKill=*/false, LHSReg, LHSIsKill); unsigned AndNReg = fastEmitInst_rr(Opc[2], RC, CmpReg, /*IsKill=*/true, RHSReg, RHSIsKill); unsigned ResultReg = fastEmitInst_rr(Opc[3], RC, AndNReg, /*IsKill=*/true, AndReg, /*IsKill=*/true); updateValueMap(I, ResultReg); return true; } bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) { // These are pseudo CMOV instructions and will be later expanded into control- // flow. 
unsigned Opc; switch (RetVT.SimpleTy) { default: return false; case MVT::i8: Opc = X86::CMOV_GR8; break; case MVT::i16: Opc = X86::CMOV_GR16; break; case MVT::i32: Opc = X86::CMOV_GR32; break; case MVT::f32: Opc = X86::CMOV_FR32; break; case MVT::f64: Opc = X86::CMOV_FR64; break; } const Value *Cond = I->getOperand(0); X86::CondCode CC = X86::COND_NE; // Optimize conditions coming from a compare if both instructions are in the // same basic block (values defined in other basic blocks may not have // initialized registers). const auto *CI = dyn_cast(Cond); if (CI && (CI->getParent() == I->getParent())) { bool NeedSwap; std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate()); if (CC > X86::LAST_VALID_COND) return false; const Value *CmpLHS = CI->getOperand(0); const Value *CmpRHS = CI->getOperand(1); if (NeedSwap) std::swap(CmpLHS, CmpRHS); EVT CmpVT = TLI.getValueType(CmpLHS->getType()); if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT)) return false; } else { unsigned CondReg = getRegForValue(Cond); if (CondReg == 0) return false; bool CondIsKill = hasTrivialKill(Cond); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri)) .addReg(CondReg, getKillRegState(CondIsKill)).addImm(1); } const Value *LHS = I->getOperand(1); const Value *RHS = I->getOperand(2); unsigned LHSReg = getRegForValue(LHS); bool LHSIsKill = hasTrivialKill(LHS); unsigned RHSReg = getRegForValue(RHS); bool RHSIsKill = hasTrivialKill(RHS); if (!LHSReg || !RHSReg) return false; const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); unsigned ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC); updateValueMap(I, ResultReg); return true; } bool X86FastISel::X86SelectSelect(const Instruction *I) { MVT RetVT; if (!isTypeLegal(I->getType(), RetVT)) return false; // Check if we can fold the select. 
if (const auto *CI = dyn_cast(I->getOperand(0))) { CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); const Value *Opnd = nullptr; switch (Predicate) { default: break; case CmpInst::FCMP_FALSE: Opnd = I->getOperand(2); break; case CmpInst::FCMP_TRUE: Opnd = I->getOperand(1); break; } // No need for a select anymore - this is an unconditional move. if (Opnd) { unsigned OpReg = getRegForValue(Opnd); if (OpReg == 0) return false; bool OpIsKill = hasTrivialKill(Opnd); const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT); unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(OpReg, getKillRegState(OpIsKill)); updateValueMap(I, ResultReg); return true; } } // First try to use real conditional move instructions. if (X86FastEmitCMoveSelect(RetVT, I)) return true; // Try to use a sequence of SSE instructions to simulate a conditional move. if (X86FastEmitSSESelect(RetVT, I)) return true; // Fall-back to pseudo conditional move instructions, which will be later // converted to control-flow. if (X86FastEmitPseudoSelect(RetVT, I)) return true; return false; } bool X86FastISel::X86SelectFPExt(const Instruction *I) { // fpext from float to double. 
if (X86ScalarSSEf64 && I->getType()->isDoubleTy()) { const Value *V = I->getOperand(0); if (V->getType()->isFloatTy()) { unsigned OpReg = getRegForValue(V); if (OpReg == 0) return false; unsigned ResultReg = createResultReg(&X86::FR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::CVTSS2SDrr), ResultReg) .addReg(OpReg); updateValueMap(I, ResultReg); return true; } } return false; } bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { if (X86ScalarSSEf64) { if (I->getType()->isFloatTy()) { const Value *V = I->getOperand(0); if (V->getType()->isDoubleTy()) { unsigned OpReg = getRegForValue(V); if (OpReg == 0) return false; unsigned ResultReg = createResultReg(&X86::FR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::CVTSD2SSrr), ResultReg) .addReg(OpReg); updateValueMap(I, ResultReg); return true; } } } return false; } bool X86FastISel::X86SelectTrunc(const Instruction *I) { EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(I->getType()); // This code only handles truncation to byte. if (DstVT != MVT::i8 && DstVT != MVT::i1) return false; if (!TLI.isTypeLegal(SrcVT)) return false; unsigned InputReg = getRegForValue(I->getOperand(0)); if (!InputReg) // Unhandled operand. Halt "fast" selection and bail. return false; if (SrcVT == MVT::i8) { // Truncate from i8 to i1; no code needed. updateValueMap(I, InputReg); return true; } if (!Subtarget->is64Bit()) { // If we're on x86-32; we can't extract an i8 from a general register. // First issue a copy to GR16_ABCD or GR32_ABCD. const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass; unsigned CopyReg = createResultReg(CopyRC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg); InputReg = CopyReg; } // Issue an extract_subreg. 
unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, InputReg, /*Kill=*/true, X86::sub_8bit); if (!ResultReg) return false; updateValueMap(I, ResultReg); return true; } bool X86FastISel::IsMemcpySmall(uint64_t Len) { return Len <= (Subtarget->is64Bit() ? 32 : 16); } bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM, X86AddressMode SrcAM, uint64_t Len) { // Make sure we don't bloat code by inlining very large memcpy's. if (!IsMemcpySmall(Len)) return false; bool i64Legal = Subtarget->is64Bit(); // We don't care about alignment here since we just emit integer accesses. while (Len) { MVT VT; if (Len >= 8 && i64Legal) VT = MVT::i64; else if (Len >= 4) VT = MVT::i32; else if (Len >= 2) VT = MVT::i16; else VT = MVT::i8; unsigned Reg; bool RV = X86FastEmitLoad(VT, SrcAM, nullptr, Reg); RV &= X86FastEmitStore(VT, Reg, /*Kill=*/true, DestAM); assert(RV && "Failed to emit load or store??"); unsigned Size = VT.getSizeInBits()/8; Len -= Size; DestAM.Disp += Size; SrcAM.Disp += Size; } return true; } bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) { // FIXME: Handle more intrinsics. switch (II->getIntrinsicID()) { default: return false; case Intrinsic::frameaddress: { Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; unsigned Opc; const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: llvm_unreachable("Invalid result type for frameaddress."); case MVT::i32: Opc = X86::MOV32rm; RC = &X86::GR32RegClass; break; case MVT::i64: Opc = X86::MOV64rm; RC = &X86::GR64RegClass; break; } // This needs to be set before we call getPtrSizedFrameRegister, otherwise // we get the wrong frame register. 
MachineFrameInfo *MFI = FuncInfo.MF->getFrameInfo(); MFI->setFrameAddressIsTaken(true); const X86RegisterInfo *RegInfo = static_cast( TM.getSubtargetImpl()->getRegisterInfo()); unsigned FrameReg = RegInfo->getPtrSizedFrameRegister(*(FuncInfo.MF)); assert(((FrameReg == X86::RBP && VT == MVT::i64) || (FrameReg == X86::EBP && VT == MVT::i32)) && "Invalid Frame Register!"); // Always make a copy of the frame register to to a vreg first, so that we // never directly reference the frame register (the TwoAddressInstruction- // Pass doesn't like that). unsigned SrcReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg); // Now recursively load from the frame address. // movq (%rbp), %rax // movq (%rax), %rax // movq (%rax), %rax // ... unsigned DestReg; unsigned Depth = cast(II->getOperand(0))->getZExtValue(); while (Depth--) { DestReg = createResultReg(RC); addDirectMem(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), DestReg), SrcReg); SrcReg = DestReg; } updateValueMap(II, SrcReg); return true; } case Intrinsic::memcpy: { const MemCpyInst *MCI = cast(II); // Don't handle volatile or variable length memcpys. if (MCI->isVolatile()) return false; if (isa(MCI->getLength())) { // Small memcpy's are common enough that we want to do them // without a call if possible. uint64_t Len = cast(MCI->getLength())->getZExtValue(); if (IsMemcpySmall(Len)) { X86AddressMode DestAM, SrcAM; if (!X86SelectAddress(MCI->getRawDest(), DestAM) || !X86SelectAddress(MCI->getRawSource(), SrcAM)) return false; TryEmitSmallMemcpy(DestAM, SrcAM, Len); return true; } } unsigned SizeWidth = Subtarget->is64Bit() ? 
64 : 32; if (!MCI->getLength()->getType()->isIntegerTy(SizeWidth)) return false; if (MCI->getSourceAddressSpace() > 255 || MCI->getDestAddressSpace() > 255) return false; return lowerCallTo(II, "memcpy", II->getNumArgOperands() - 2); } case Intrinsic::memset: { const MemSetInst *MSI = cast(II); if (MSI->isVolatile()) return false; unsigned SizeWidth = Subtarget->is64Bit() ? 64 : 32; if (!MSI->getLength()->getType()->isIntegerTy(SizeWidth)) return false; if (MSI->getDestAddressSpace() > 255) return false; return lowerCallTo(II, "memset", II->getNumArgOperands() - 2); } case Intrinsic::stackprotector: { // Emit code to store the stack guard onto the stack. EVT PtrTy = TLI.getPointerTy(); const Value *Op1 = II->getArgOperand(0); // The guard's value. const AllocaInst *Slot = cast(II->getArgOperand(1)); MFI.setStackProtectorIndex(FuncInfo.StaticAllocaMap[Slot]); // Grab the frame index. X86AddressMode AM; if (!X86SelectAddress(Slot, AM)) return false; if (!X86FastEmitStore(PtrTy, Op1, AM)) return false; return true; } case Intrinsic::dbg_declare: { const DbgDeclareInst *DI = cast(II); X86AddressMode AM; assert(DI->getAddress() && "Null address should be checked earlier!"); if (!X86SelectAddress(DI->getAddress(), AM)) return false; const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE); // FIXME may need to add RegState::Debug to any registers produced, // although ESP/EBP should be the only ones at the moment. addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II), AM) .addImm(0) .addMetadata(DI->getVariable()) .addMetadata(DI->getExpression()); return true; } case Intrinsic::trap: { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TRAP)); return true; } case Intrinsic::sqrt: { if (!Subtarget->hasSSE1()) return false; Type *RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; // Unfortunately we can't use fastEmit_r, because the AVX version of FSQRT // is not generated by FastISel yet. 
// FIXME: Update this code once tablegen can handle it. static const unsigned SqrtOpc[2][2] = { {X86::SQRTSSr, X86::VSQRTSSr}, {X86::SQRTSDr, X86::VSQRTSDr} }; bool HasAVX = Subtarget->hasAVX(); unsigned Opc; const TargetRegisterClass *RC; switch (VT.SimpleTy) { default: return false; case MVT::f32: Opc = SqrtOpc[0][HasAVX]; RC = &X86::FR32RegClass; break; case MVT::f64: Opc = SqrtOpc[1][HasAVX]; RC = &X86::FR64RegClass; break; } const Value *SrcVal = II->getArgOperand(0); unsigned SrcReg = getRegForValue(SrcVal); if (SrcReg == 0) return false; unsigned ImplicitDefReg = 0; if (HasAVX) { ImplicitDefReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg); } unsigned ResultReg = createResultReg(RC); MachineInstrBuilder MIB; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); if (ImplicitDefReg) MIB.addReg(ImplicitDefReg); MIB.addReg(SrcReg); updateValueMap(II, ResultReg); return true; } case Intrinsic::sadd_with_overflow: case Intrinsic::uadd_with_overflow: case Intrinsic::ssub_with_overflow: case Intrinsic::usub_with_overflow: case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: { // This implements the basic lowering of the xalu with overflow intrinsics // into add/sub/mul followed by either seto or setb. const Function *Callee = II->getCalledFunction(); auto *Ty = cast(Callee->getReturnType()); Type *RetTy = Ty->getTypeAtIndex(0U); Type *CondTy = Ty->getTypeAtIndex(1); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; if (VT < MVT::i8 || VT > MVT::i64) return false; const Value *LHS = II->getArgOperand(0); const Value *RHS = II->getArgOperand(1); // Canonicalize immediate to the RHS. 
if (isa(LHS) && !isa(RHS) && isCommutativeIntrinsic(II)) std::swap(LHS, RHS); bool UseIncDec = false; if (isa(RHS) && cast(RHS)->isOne()) UseIncDec = true; unsigned BaseOpc, CondOpc; switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::sadd_with_overflow: BaseOpc = UseIncDec ? unsigned(X86ISD::INC) : unsigned(ISD::ADD); CondOpc = X86::SETOr; break; case Intrinsic::uadd_with_overflow: BaseOpc = ISD::ADD; CondOpc = X86::SETBr; break; case Intrinsic::ssub_with_overflow: BaseOpc = UseIncDec ? unsigned(X86ISD::DEC) : unsigned(ISD::SUB); CondOpc = X86::SETOr; break; case Intrinsic::usub_with_overflow: BaseOpc = ISD::SUB; CondOpc = X86::SETBr; break; case Intrinsic::smul_with_overflow: BaseOpc = X86ISD::SMUL; CondOpc = X86::SETOr; break; case Intrinsic::umul_with_overflow: BaseOpc = X86ISD::UMUL; CondOpc = X86::SETOr; break; } unsigned LHSReg = getRegForValue(LHS); if (LHSReg == 0) return false; bool LHSIsKill = hasTrivialKill(LHS); unsigned ResultReg = 0; // Check if we have an immediate version. if (const auto *CI = dyn_cast(RHS)) { static const unsigned Opc[2][4] = { { X86::INC8r, X86::INC16r, X86::INC32r, X86::INC64r }, { X86::DEC8r, X86::DEC16r, X86::DEC32r, X86::DEC64r } }; if (BaseOpc == X86ISD::INC || BaseOpc == X86ISD::DEC) { ResultReg = createResultReg(TLI.getRegClassFor(VT)); bool IsDec = BaseOpc == X86ISD::DEC; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc[IsDec][VT.SimpleTy-MVT::i8]), ResultReg) .addReg(LHSReg, getKillRegState(LHSIsKill)); } else ResultReg = fastEmit_ri(VT, VT, BaseOpc, LHSReg, LHSIsKill, CI->getZExtValue()); } unsigned RHSReg; bool RHSIsKill; if (!ResultReg) { RHSReg = getRegForValue(RHS); if (RHSReg == 0) return false; RHSIsKill = hasTrivialKill(RHS); ResultReg = fastEmit_rr(VT, VT, BaseOpc, LHSReg, LHSIsKill, RHSReg, RHSIsKill); } // FastISel doesn't have a pattern for all X86::MUL*r and X86::IMUL*r. Emit // it manually. 
if (BaseOpc == X86ISD::UMUL && !ResultReg) { static const unsigned MULOpc[] = { X86::MUL8r, X86::MUL16r, X86::MUL32r, X86::MUL64r }; static const unsigned Reg[] = { X86::AL, X86::AX, X86::EAX, X86::RAX }; // First copy the first operand into RAX, which is an implicit input to // the X86::MUL*r instruction. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), Reg[VT.SimpleTy-MVT::i8]) .addReg(LHSReg, getKillRegState(LHSIsKill)); ResultReg = fastEmitInst_r(MULOpc[VT.SimpleTy-MVT::i8], TLI.getRegClassFor(VT), RHSReg, RHSIsKill); } else if (BaseOpc == X86ISD::SMUL && !ResultReg) { static const unsigned MULOpc[] = { X86::IMUL8r, X86::IMUL16rr, X86::IMUL32rr, X86::IMUL64rr }; if (VT == MVT::i8) { // Copy the first operand into AL, which is an implicit input to the // X86::IMUL8r instruction. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), X86::AL) .addReg(LHSReg, getKillRegState(LHSIsKill)); ResultReg = fastEmitInst_r(MULOpc[0], TLI.getRegClassFor(VT), RHSReg, RHSIsKill); } else ResultReg = fastEmitInst_rr(MULOpc[VT.SimpleTy-MVT::i8], TLI.getRegClassFor(VT), LHSReg, LHSIsKill, RHSReg, RHSIsKill); } if (!ResultReg) return false; unsigned ResultReg2 = FuncInfo.CreateRegs(CondTy); assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers."); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CondOpc), ResultReg2); updateValueMap(II, ResultReg, 2); return true; } case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: case Intrinsic::x86_sse2_cvttsd2si: case Intrinsic::x86_sse2_cvttsd2si64: { bool IsInputDouble; switch (II->getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic."); case Intrinsic::x86_sse_cvttss2si: case Intrinsic::x86_sse_cvttss2si64: if (!Subtarget->hasSSE1()) return false; IsInputDouble = false; break; case Intrinsic::x86_sse2_cvttsd2si: case Intrinsic::x86_sse2_cvttsd2si64: if (!Subtarget->hasSSE2()) return false; IsInputDouble = true; break; } Type 
*RetTy = II->getCalledFunction()->getReturnType(); MVT VT; if (!isTypeLegal(RetTy, VT)) return false; static const unsigned CvtOpc[2][2][2] = { { { X86::CVTTSS2SIrr, X86::VCVTTSS2SIrr }, { X86::CVTTSS2SI64rr, X86::VCVTTSS2SI64rr } }, { { X86::CVTTSD2SIrr, X86::VCVTTSD2SIrr }, { X86::CVTTSD2SI64rr, X86::VCVTTSD2SI64rr } } }; bool HasAVX = Subtarget->hasAVX(); unsigned Opc; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected result type."); case MVT::i32: Opc = CvtOpc[IsInputDouble][0][HasAVX]; break; case MVT::i64: Opc = CvtOpc[IsInputDouble][1][HasAVX]; break; } // Check if we can fold insertelement instructions into the convert. const Value *Op = II->getArgOperand(0); while (auto *IE = dyn_cast(Op)) { const Value *Index = IE->getOperand(2); if (!isa(Index)) break; unsigned Idx = cast(Index)->getZExtValue(); if (Idx == 0) { Op = IE->getOperand(1); break; } Op = IE->getOperand(0); } unsigned Reg = getRegForValue(Op); if (Reg == 0) return false; unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg) .addReg(Reg); updateValueMap(II, ResultReg); return true; } } } bool X86FastISel::fastLowerArguments() { if (!FuncInfo.CanLowerReturn) return false; const Function *F = FuncInfo.Fn; if (F->isVarArg()) return false; CallingConv::ID CC = F->getCallingConv(); if (CC != CallingConv::C) return false; if (Subtarget->isCallingConvWin64(CC)) return false; if (!Subtarget->is64Bit()) return false; // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. unsigned GPRCnt = 0; unsigned FPRCnt = 0; unsigned Idx = 0; for (auto const &Arg : F->args()) { // The first argument is at index 1. 
++Idx; if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) || F->getAttributes().hasAttribute(Idx, Attribute::InReg) || F->getAttributes().hasAttribute(Idx, Attribute::StructRet) || F->getAttributes().hasAttribute(Idx, Attribute::Nest)) return false; Type *ArgTy = Arg.getType(); if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy()) return false; EVT ArgVT = TLI.getValueType(ArgTy); if (!ArgVT.isSimple()) return false; switch (ArgVT.getSimpleVT().SimpleTy) { default: return false; case MVT::i32: case MVT::i64: ++GPRCnt; break; case MVT::f32: case MVT::f64: if (!Subtarget->hasSSE1()) return false; ++FPRCnt; break; } if (GPRCnt > 6) return false; if (FPRCnt > 8) return false; } static const MCPhysReg GPR32ArgRegs[] = { X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D }; static const MCPhysReg GPR64ArgRegs[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9 }; static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned GPRIdx = 0; unsigned FPRIdx = 0; for (auto const &Arg : F->args()) { MVT VT = TLI.getSimpleValueType(Arg.getType()); const TargetRegisterClass *RC = TLI.getRegClassFor(VT); unsigned SrcReg; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type."); case MVT::i32: SrcReg = GPR32ArgRegs[GPRIdx++]; break; case MVT::i64: SrcReg = GPR64ArgRegs[GPRIdx++]; break; case MVT::f32: // fall-through case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break; } unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. // Without this, EmitLiveInCopies may eliminate the livein if its only // use is a bitcast (which isn't turned into an instruction). 
unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg) .addReg(DstReg, getKillRegState(true)); updateValueMap(&Arg, ResultReg); } return true; } static unsigned computeBytesPoppedByCallee(const X86Subtarget *Subtarget, CallingConv::ID CC, ImmutableCallSite *CS) { if (Subtarget->is64Bit()) return 0; if (Subtarget->getTargetTriple().isOSMSVCRT()) return 0; if (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::HiPE) return 0; if (CS && !CS->paramHasAttr(1, Attribute::StructRet)) return 0; if (CS && CS->paramHasAttr(1, Attribute::InReg)) return 0; return 4; } bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { auto &OutVals = CLI.OutVals; auto &OutFlags = CLI.OutFlags; auto &OutRegs = CLI.OutRegs; auto &Ins = CLI.Ins; auto &InRegs = CLI.InRegs; CallingConv::ID CC = CLI.CallConv; bool &IsTailCall = CLI.IsTailCall; bool IsVarArg = CLI.IsVarArg; const Value *Callee = CLI.Callee; const char *SymName = CLI.SymName; bool Is64Bit = Subtarget->is64Bit(); bool IsWin64 = Subtarget->isCallingConvWin64(CC); // Handle only C, fastcc, and webkit_js calling conventions for now. switch (CC) { default: return false; case CallingConv::C: case CallingConv::Fast: case CallingConv::WebKit_JS: case CallingConv::X86_FastCall: case CallingConv::X86_64_Win64: case CallingConv::X86_64_SysV: break; } // Allow SelectionDAG isel to handle tail calls. if (IsTailCall) return false; // fastcc with -tailcallopt is intended to provide a guaranteed // tail call optimization. Fastisel doesn't know how to do that. if (CC == CallingConv::Fast && TM.Options.GuaranteedTailCallOpt) return false; // Don't know how to handle Win64 varargs yet. Nothing special needed for // x86-32. Special handling for x86-64 is implemented. if (IsVarArg && IsWin64) return false; // Don't know about inalloca yet. if (CLI.CS && CLI.CS->hasInAllocaArgument()) return false; // Fast-isel doesn't know about callee-pop yet. 
if (X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg, TM.Options.GuaranteedTailCallOpt)) return false; SmallVector OutVTs; SmallVector ArgRegs; // If this is a constant i1/i8/i16 argument, promote to i32 to avoid an extra // instruction. This is safe because it is common to all FastISel supported // calling conventions on x86. for (int i = 0, e = OutVals.size(); i != e; ++i) { Value *&Val = OutVals[i]; ISD::ArgFlagsTy Flags = OutFlags[i]; if (auto *CI = dyn_cast(Val)) { if (CI->getBitWidth() < 32) { if (Flags.isSExt()) Val = ConstantExpr::getSExt(CI, Type::getInt32Ty(CI->getContext())); else Val = ConstantExpr::getZExt(CI, Type::getInt32Ty(CI->getContext())); } } // Passing bools around ends up doing a trunc to i1 and passing it. // Codegen this as an argument + "and 1". MVT VT; auto *TI = dyn_cast(Val); unsigned ResultReg; if (TI && TI->getType()->isIntegerTy(1) && CLI.CS && (TI->getParent() == CLI.CS->getInstruction()->getParent()) && TI->hasOneUse()) { Value *PrevVal = TI->getOperand(0); ResultReg = getRegForValue(PrevVal); if (!ResultReg) return false; if (!isTypeLegal(PrevVal->getType(), VT)) return false; ResultReg = fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1); } else { if (!isTypeLegal(Val->getType(), VT)) return false; ResultReg = getRegForValue(Val); } if (!ResultReg) return false; ArgRegs.push_back(ResultReg); OutVTs.push_back(VT); } // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, CLI.RetTy->getContext()); // Allocate shadow area for Win64 if (IsWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86); // Get a count of how many bytes are to be pushed on the stack. 
unsigned NumBytes = CCInfo.getNextStackOffset(); // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes); + .addImm(NumBytes).addImm(0); // Walk the register/memloc assignments, inserting copies/loads. const X86RegisterInfo *RegInfo = static_cast( TM.getSubtargetImpl()->getRegisterInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign const &VA = ArgLocs[i]; const Value *ArgVal = OutVals[VA.getValNo()]; MVT ArgVT = OutVTs[VA.getValNo()]; if (ArgVT == MVT::x86mmx) return false; unsigned ArgReg = ArgRegs[VA.getValNo()]; // Promote the value if needed. switch (VA.getLocInfo()) { case CCValAssign::Full: break; case CCValAssign::SExt: { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, ArgVT, ArgReg); assert(Emitted && "Failed to emit a sext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; } case CCValAssign::ZExt: { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, ArgVT, ArgReg); assert(Emitted && "Failed to emit a zext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; } case CCValAssign::AExt: { assert(VA.getLocVT().isInteger() && !VA.getLocVT().isVector() && "Unexpected extend"); bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(), ArgReg, ArgVT, ArgReg); if (!Emitted) Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(), ArgReg, ArgVT, ArgReg); if (!Emitted) Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(), ArgReg, ArgVT, ArgReg); assert(Emitted && "Failed to emit a aext!"); (void)Emitted; ArgVT = VA.getLocVT(); break; } case CCValAssign::BCvt: { ArgReg = fastEmit_r(ArgVT, VA.getLocVT(), ISD::BITCAST, ArgReg, /*TODO: Kill=*/false); assert(ArgReg && "Failed to emit a bitcast!"); ArgVT 
= VA.getLocVT(); break; } case CCValAssign::VExt: // VExt has not been implemented, so this should be impossible to reach // for now. However, fallback to Selection DAG isel once implemented. return false; case CCValAssign::AExtUpper: case CCValAssign::SExtUpper: case CCValAssign::ZExtUpper: case CCValAssign::FPExt: llvm_unreachable("Unexpected loc info!"); case CCValAssign::Indirect: // FIXME: Indirect doesn't need extending, but fast-isel doesn't fully // support this. return false; } if (VA.isRegLoc()) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg); OutRegs.push_back(VA.getLocReg()); } else { assert(VA.isMemLoc()); // Don't emit stores for undef values. if (isa(ArgVal)) continue; unsigned LocMemOffset = VA.getLocMemOffset(); X86AddressMode AM; AM.Base.Reg = RegInfo->getStackRegister(); AM.Disp = LocMemOffset; ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()]; unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType()); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getStack(LocMemOffset), MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment); if (Flags.isByVal()) { X86AddressMode SrcAM; SrcAM.Base.Reg = ArgReg; if (!TryEmitSmallMemcpy(AM, SrcAM, Flags.getByValSize())) return false; } else if (isa(ArgVal) || isa(ArgVal)) { // If this is a really simple value, emit this with the Value* version // of X86FastEmitStore. If it isn't simple, we don't want to do this, // as it can cause us to reevaluate the argument. if (!X86FastEmitStore(ArgVT, ArgVal, AM, MMO)) return false; } else { bool ValIsKill = hasTrivialKill(ArgVal); if (!X86FastEmitStore(ArgVT, ArgReg, ValIsKill, AM, MMO)) return false; } } } // ELF / PIC requires GOT in the EBX register before function calls via PLT // GOT pointer. 
if (Subtarget->isPICStyleGOT()) { unsigned Base = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), X86::EBX).addReg(Base); } if (Is64Bit && IsVarArg && !IsWin64) { // From AMD64 ABI document: // For calls that may call functions that use varargs or stdargs // (prototype-less calls or calls to functions containing ellipsis (...) in // the declaration) %al is used as hidden argument to specify the number // of SSE registers used. The contents of %al do not need to match exactly // the number of registers, but must be an ubound on the number of SSE // registers used and is in the range 0 - 8 inclusive. // Count the number of XMM registers allocated. static const MCPhysReg XMMArgRegs[] = { X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 }; unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); assert((Subtarget->hasSSE1() || !NumXMMRegs) && "SSE registers cannot be used when SSE is disabled"); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV8ri), X86::AL).addImm(NumXMMRegs); } // Materialize callee address in a register. FIXME: GV address can be // handled with a CALLpcrel32 instead. X86AddressMode CalleeAM; if (!X86SelectCallAddress(Callee, CalleeAM)) return false; unsigned CalleeOp = 0; const GlobalValue *GV = nullptr; if (CalleeAM.GV != nullptr) { GV = CalleeAM.GV; } else if (CalleeAM.Base.Reg != 0) { CalleeOp = CalleeAM.Base.Reg; } else return false; // Issue the call. MachineInstrBuilder MIB; if (CalleeOp) { // Register-indirect call. unsigned CallOpc = Is64Bit ? X86::CALL64r : X86::CALL32r; MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)) .addReg(CalleeOp); } else { // Direct call. assert(GV && "Not a direct call"); unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; // See if we need any target-specific flags on the GV operand. 
unsigned char OpFlags = 0; // On ELF targets, in both X86-64 and X86-32 mode, direct calls to // external symbols most go through the PLT in PIC mode. If the symbol // has hidden or protected visibility, or if it is static or local, then // we don't need to use the PLT - we can directly call it. if (Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_ && GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) { OpFlags = X86II::MO_PLT; } else if (Subtarget->isPICStyleStubAny() && (GV->isDeclaration() || GV->isWeakForLinker()) && (!Subtarget->getTargetTriple().isMacOSX() || Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) { // PC-relative references to external symbols should go through $stub, // unless we're building with the leopard linker or later, which // automatically synthesizes these stubs. OpFlags = X86II::MO_DARWIN_STUB; } MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); if (SymName) MIB.addExternalSymbol(SymName, OpFlags); else MIB.addGlobalAddress(GV, 0, OpFlags); } // Add a register mask operand representing the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). MIB.addRegMask(TRI.getCallPreservedMask(CC)); // Add an implicit use GOT pointer in EBX. if (Subtarget->isPICStyleGOT()) MIB.addReg(X86::EBX, RegState::Implicit); if (Is64Bit && IsVarArg && !IsWin64) MIB.addReg(X86::AL, RegState::Implicit); // Add implicit physical register uses to the call. for (auto Reg : OutRegs) MIB.addReg(Reg, RegState::Implicit); // Issue CALLSEQ_END unsigned NumBytesForCalleeToPop = computeBytesPoppedByCallee(Subtarget, CC, CLI.CS); unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp)) .addImm(NumBytes).addImm(NumBytesForCalleeToPop); // Now handle call return values. 
SmallVector RVLocs; CCState CCRetInfo(CC, IsVarArg, *FuncInfo.MF, RVLocs, CLI.RetTy->getContext()); CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy); for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getValVT(); unsigned CopyReg = ResultReg + i; // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && isScalarFPTypeInSSEReg(VA.getValVT())) { CopyVT = MVT::f80; CopyReg = createResultReg(&X86::RFP80RegClass); } // Copy out the result. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg()); InRegs.push_back(VA.getLocReg()); // Round the f80 to the right size, which also moves it to the appropriate // xmm register. This is accomplished by storing the f80 value in memory // and then loading it back. if (CopyVT != VA.getValVT()) { EVT ResVT = VA.getValVT(); unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64; unsigned MemSize = ResVT.getSizeInBits()/8; int FI = MFI.CreateStackObject(MemSize, MemSize, false); addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc)), FI) .addReg(CopyReg); Opc = ResVT == MVT::f32 ? 
X86::MOVSSrm : X86::MOVSDrm; addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg + i), FI); } } CLI.ResultReg = ResultReg; CLI.NumResultRegs = RVLocs.size(); CLI.Call = MIB; return true; } bool X86FastISel::fastSelectInstruction(const Instruction *I) { switch (I->getOpcode()) { default: break; case Instruction::Load: return X86SelectLoad(I); case Instruction::Store: return X86SelectStore(I); case Instruction::Ret: return X86SelectRet(I); case Instruction::ICmp: case Instruction::FCmp: return X86SelectCmp(I); case Instruction::ZExt: return X86SelectZExt(I); case Instruction::Br: return X86SelectBranch(I); case Instruction::LShr: case Instruction::AShr: case Instruction::Shl: return X86SelectShift(I); case Instruction::SDiv: case Instruction::UDiv: case Instruction::SRem: case Instruction::URem: return X86SelectDivRem(I); case Instruction::Select: return X86SelectSelect(I); case Instruction::Trunc: return X86SelectTrunc(I); case Instruction::FPExt: return X86SelectFPExt(I); case Instruction::FPTrunc: return X86SelectFPTrunc(I); case Instruction::IntToPtr: // Deliberate fall-through. 
case Instruction::PtrToInt: { EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); EVT DstVT = TLI.getValueType(I->getType()); if (DstVT.bitsGT(SrcVT)) return X86SelectZExt(I); if (DstVT.bitsLT(SrcVT)) return X86SelectTrunc(I); unsigned Reg = getRegForValue(I->getOperand(0)); if (Reg == 0) return false; updateValueMap(I, Reg); return true; } } return false; } unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { if (VT > MVT::i64) return 0; uint64_t Imm = CI->getZExtValue(); if (Imm == 0) { unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass); switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); case MVT::i1: case MVT::i8: return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, X86::sub_8bit); case MVT::i16: return fastEmitInst_extractsubreg(MVT::i16, SrcReg, /*Kill=*/true, X86::sub_16bit); case MVT::i32: return SrcReg; case MVT::i64: { unsigned ResultReg = createResultReg(&X86::GR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); return ResultReg; } } } unsigned Opc = 0; switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); case MVT::i1: VT = MVT::i8; // fall-through case MVT::i8: Opc = X86::MOV8ri; break; case MVT::i16: Opc = X86::MOV16ri; break; case MVT::i32: Opc = X86::MOV32ri; break; case MVT::i64: { if (isUInt<32>(Imm)) Opc = X86::MOV32ri; else if (isInt<32>(Imm)) Opc = X86::MOV64ri32; else Opc = X86::MOV64ri; break; } } if (VT == MVT::i64 && Opc == X86::MOV32ri) { unsigned SrcReg = fastEmitInst_i(Opc, &X86::GR32RegClass, Imm); unsigned ResultReg = createResultReg(&X86::GR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); return ResultReg; } return fastEmitInst_i(Opc, TLI.getRegClassFor(VT), Imm); } unsigned X86FastISel::X86MaterializeFP(const 
ConstantFP *CFP, MVT VT) { if (CFP->isNullValue()) return fastMaterializeFloatZero(CFP); // Can't handle alternate code models yet. CodeModel::Model CM = TM.getCodeModel(); if (CM != CodeModel::Small && CM != CodeModel::Large) return 0; // Get opcode and regclass of the output for the given load instruction. unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; case MVT::f32: if (X86ScalarSSEf32) { Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; RC = &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; RC = &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; RC = &X86::RFP64RegClass; } break; case MVT::f80: // No f80 support yet. return 0; } // MachineConstantPool wants an explicit alignment. unsigned Align = DL.getPrefTypeAlignment(CFP->getType()); if (Align == 0) { // Alignment of vector types. FIXME! Align = DL.getTypeAllocSize(CFP->getType()); } // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; unsigned char OpFlag = 0; if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic OpFlag = X86II::MO_PIC_BASE_OFFSET; PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); } else if (Subtarget->isPICStyleGOT()) { OpFlag = X86II::MO_GOTOFF; PICBase = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF); } else if (Subtarget->isPICStyleRIPRel() && TM.getCodeModel() == CodeModel::Small) { PICBase = X86::RIP; } // Create the load from the constant pool. 
unsigned CPI = MCP.getConstantPoolIndex(CFP, Align); unsigned ResultReg = createResultReg(RC); if (CM == CodeModel::Large) { unsigned AddrReg = createResultReg(&X86::GR64RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), AddrReg) .addConstantPoolIndex(CPI, 0, OpFlag); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); addDirectMem(MIB, AddrReg); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad, TM.getSubtargetImpl()->getDataLayout()->getPointerSize(), Align); MIB->addMemOperand(*FuncInfo.MF, MMO); return ResultReg; } addConstantPoolReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), CPI, PICBase, OpFlag); return ResultReg; } unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) { // Can't handle alternate code models yet. if (TM.getCodeModel() != CodeModel::Small) return 0; // Materialize addresses with LEA/MOV instructions. X86AddressMode AM; if (X86SelectAddress(GV, AM)) { // If the expression is just a basereg, then we're done, otherwise we need // to emit an LEA. if (AM.BaseType == X86AddressMode::RegBase && AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr) return AM.Base.Reg; unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); if (TM.getRelocationModel() == Reloc::Static && TLI.getPointerTy() == MVT::i64) { // The displacement code could be more than 32 bits away so we need to use // an instruction with a 64 bit immediate BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri), ResultReg) .addGlobalAddress(GV); } else { unsigned Opc = TLI.getPointerTy() == MVT::i32 ? 
X86::LEA32r : X86::LEA64r; addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); } return ResultReg; } return 0; } unsigned X86FastISel::fastMaterializeConstant(const Constant *C) { EVT CEVT = TLI.getValueType(C->getType(), true); // Only handle simple types. if (!CEVT.isSimple()) return 0; MVT VT = CEVT.getSimpleVT(); if (const auto *CI = dyn_cast(C)) return X86MaterializeInt(CI, VT); else if (const ConstantFP *CFP = dyn_cast(C)) return X86MaterializeFP(CFP, VT); else if (const GlobalValue *GV = dyn_cast(C)) return X86MaterializeGV(GV, VT); return 0; } unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) { // Fail on dynamic allocas. At this point, getRegForValue has already // checked its CSE maps, so if we're here trying to handle a dynamic // alloca, we're not going to succeed. X86SelectAddress has a // check for dynamic allocas, because it's called directly from // various places, but targetMaterializeAlloca also needs a check // in order to avoid recursion between getRegForValue, // X86SelectAddrss, and targetMaterializeAlloca. if (!FuncInfo.StaticAllocaMap.count(C)) return 0; assert(C->isStaticAlloca() && "dynamic alloca in the static alloca map?"); X86AddressMode AM; if (!X86SelectAddress(C, AM)) return 0; unsigned Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r; const TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy()); unsigned ResultReg = createResultReg(RC); addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg), AM); return ResultReg; } unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { MVT VT; if (!isTypeLegal(CF->getType(), VT)) return 0; // Get opcode and regclass for the given zero. 
unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; case MVT::f32: if (X86ScalarSSEf32) { Opc = X86::FsFLD0SS; RC = &X86::FR32RegClass; } else { Opc = X86::LD_Fp032; RC = &X86::RFP32RegClass; } break; case MVT::f64: if (X86ScalarSSEf64) { Opc = X86::FsFLD0SD; RC = &X86::FR64RegClass; } else { Opc = X86::LD_Fp064; RC = &X86::RFP64RegClass; } break; case MVT::f80: // No f80 support yet. return 0; } unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg); return ResultReg; } bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const LoadInst *LI) { const Value *Ptr = LI->getPointerOperand(); X86AddressMode AM; if (!X86SelectAddress(Ptr, AM)) return false; const X86InstrInfo &XII = (const X86InstrInfo &)TII; unsigned Size = DL.getTypeAllocSize(LI->getType()); unsigned Alignment = LI->getAlignment(); if (Alignment == 0) // Ensure that codegen never sees alignment 0 Alignment = DL.getABITypeAlignment(LI->getType()); SmallVector AddrOps; AM.getFullAddress(AddrOps); MachineInstr *Result = XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment, /*AllowCommute=*/true); if (!Result) return false; Result->addMemOperand(*FuncInfo.MF, createMachineMemOperandFor(LI)); FuncInfo.MBB->insert(FuncInfo.InsertPt, Result); MI->eraseFromParent(); return true; } namespace llvm { FastISel *X86::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) { return new X86FastISel(funcInfo, libInfo); } } diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp index 16aab16d63ee..f2eb6a8ea73e 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -1,2034 +1,1951 @@ //===-- X86FrameLowering.cpp - X86 Frame Information ----------------------===// // // The LLVM Compiler Infrastructure // // This file is 
distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains the X86 implementation of TargetFrameLowering class. // //===----------------------------------------------------------------------===// #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/Debug.h" #include using namespace llvm; // FIXME: completely move here. extern cl::opt ForceStackAlign; bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { - return !MF.getFrameInfo()->hasVarSizedObjects(); + return !MF.getFrameInfo()->hasVarSizedObjects() && + !MF.getInfo()->getHasPushSequences(); +} + +/// canSimplifyCallFramePseudos - If there is a reserved call frame, the +/// call frame pseudos can be simplified. Having a FP, as in the default +/// implementation, is not sufficient here since we can't always use it. +/// Use a more nuanced condition. +bool +X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { + const X86RegisterInfo *TRI = static_cast + (MF.getSubtarget().getRegisterInfo()); + return hasReservedCallFrame(MF) || + (hasFP(MF) && !TRI->needsStackRealignment(MF)) + || TRI->hasBasePointer(MF); +} + +// needsFrameIndexResolution - Do we need to perform FI resolution for +// this function. 
Normally, this is required only when the function +// has any stack objects. However, FI resolution actually has another job, +// not apparent from the title - it resolves callframesetup/destroy +// that were not simplified earlier. +// So, this is required for x86 functions that have push sequences even +// when there are no stack objects. +bool +X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const { + return MF.getFrameInfo()->hasStackObjects() || + MF.getInfo()->getHasPushSequences(); } /// hasFP - Return true if the specified function should have a dedicated frame /// pointer register. This is true if the function has variable sized allocas /// or if frame pointer elimination is disabled. bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineModuleInfo &MMI = MF.getMMI(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken() || MFI->hasInlineAsmWithSPAdjust() || MF.getInfo()->getForceFramePointer() || MMI.callsUnwindInit() || MMI.callsEHReturn() || MFI->hasStackMap() || MFI->hasPatchPoint()); } static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) return X86::SUB64ri8; return X86::SUB64ri32; } else { if (isInt<8>(Imm)) return X86::SUB32ri8; return X86::SUB32ri; } } static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) return X86::ADD64ri8; return X86::ADD64ri32; } else { if (isInt<8>(Imm)) return X86::ADD32ri8; return X86::ADD32ri; } } static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) { if (IsLP64) { if (isInt<8>(Imm)) return X86::AND64ri8; return X86::AND64ri32; } if (isInt<8>(Imm)) return X86::AND32ri8; return X86::AND32ri; } -static unsigned getPUSHiOpcode(bool IsLP64, MachineOperand MO) { - // We don't 
support LP64 for now. - assert(!IsLP64); - - if (MO.isImm() && isInt<8>(MO.getImm())) - return X86::PUSH32i8; - - return X86::PUSHi32;; -} - static unsigned getLEArOpcode(unsigned IsLP64) { return IsLP64 ? X86::LEA64r : X86::LEA32r; } /// findDeadCallerSavedReg - Return a caller-saved register that isn't live /// when it reaches the "return" instruction. We can then pop a stack object /// to this register without worry about clobbering it. static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, const TargetRegisterInfo &TRI, bool Is64Bit) { const MachineFunction *MF = MBB.getParent(); const Function *F = MF->getFunction(); if (!F || MF->getMMI().callsEHReturn()) return 0; static const uint16_t CallerSavedRegs32Bit[] = { X86::EAX, X86::EDX, X86::ECX, 0 }; static const uint16_t CallerSavedRegs64Bit[] = { X86::RAX, X86::RDX, X86::RCX, X86::RSI, X86::RDI, X86::R8, X86::R9, X86::R10, X86::R11, 0 }; unsigned Opc = MBBI->getOpcode(); switch (Opc) { default: return 0; case X86::RETL: case X86::RETQ: case X86::RETIL: case X86::RETIQ: case X86::TCRETURNdi: case X86::TCRETURNri: case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: case X86::EH_RETURN: case X86::EH_RETURN64: { SmallSet Uses; for (unsigned i = 0, e = MBBI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MBBI->getOperand(i); if (!MO.isReg() || MO.isDef()) continue; unsigned Reg = MO.getReg(); if (!Reg) continue; for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI) Uses.insert(*AI); } const uint16_t *CS = Is64Bit ? CallerSavedRegs64Bit : CallerSavedRegs32Bit; for (; *CS; ++CS) if (!Uses.count(*CS)) return *CS; } } return 0; } /// emitSPUpdate - Emit a series of instructions to increment / decrement the /// stack pointer by a constant value. 
static void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, int64_t NumBytes, bool Is64BitTarget, bool Is64BitStackPtr, bool UseLEA, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) { bool isSub = NumBytes < 0; uint64_t Offset = isSub ? -NumBytes : NumBytes; unsigned Opc; if (UseLEA) Opc = getLEArOpcode(Is64BitStackPtr); else Opc = isSub ? getSUBriOpcode(Is64BitStackPtr, Offset) : getADDriOpcode(Is64BitStackPtr, Offset); uint64_t Chunk = (1LL << 31) - 1; DebugLoc DL = MBB.findDebugLoc(MBBI); while (Offset) { uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset; if (ThisVal == (Is64BitTarget ? 8 : 4)) { // Use push / pop instead. unsigned Reg = isSub ? (unsigned)(Is64BitTarget ? X86::RAX : X86::EAX) : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64BitTarget); if (Reg) { Opc = isSub ? (Is64BitTarget ? X86::PUSH64r : X86::PUSH32r) : (Is64BitTarget ? X86::POP64r : X86::POP32r); MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc)) .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub)); if (isSub) MI->setFlag(MachineInstr::FrameSetup); Offset -= ThisVal; continue; } } MachineInstr *MI = nullptr; if (UseLEA) { MI = addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), StackPtr, false, isSub ? -ThisVal : ThisVal); } else { MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(StackPtr) .addImm(ThisVal); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. } if (isSub) MI->setFlag(MachineInstr::FrameSetup); Offset -= ThisVal; } } /// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator. 
static void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, uint64_t *NumBytes = nullptr) { if (MBBI == MBB.begin()) return; MachineBasicBlock::iterator PI = std::prev(MBBI); unsigned Opc = PI->getOpcode(); if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || Opc == X86::LEA32r || Opc == X86::LEA64_32r) && PI->getOperand(0).getReg() == StackPtr) { if (NumBytes) *NumBytes += PI->getOperand(2).getImm(); MBB.erase(PI); } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && PI->getOperand(0).getReg() == StackPtr) { if (NumBytes) *NumBytes -= PI->getOperand(2).getImm(); MBB.erase(PI); } } /// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower /// iterator. static void mergeSPUpdatesDown(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, uint64_t *NumBytes = nullptr) { // FIXME: THIS ISN'T RUN!!! return; if (MBBI == MBB.end()) return; MachineBasicBlock::iterator NI = std::next(MBBI); if (NI == MBB.end()) return; unsigned Opc = NI->getOpcode(); if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || Opc == X86::ADD32ri || Opc == X86::ADD32ri8) && NI->getOperand(0).getReg() == StackPtr) { if (NumBytes) *NumBytes -= NI->getOperand(2).getImm(); MBB.erase(NI); MBBI = NI; } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && NI->getOperand(0).getReg() == StackPtr) { if (NumBytes) *NumBytes += NI->getOperand(2).getImm(); MBB.erase(NI); MBBI = NI; } } /// mergeSPUpdates - Checks the instruction before/after the passed /// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and /// the stack adjustment is returned as a positive value for ADD/LEA and a /// negative for SUB. 
static int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, unsigned StackPtr, bool doMergeWithPrevious) { if ((doMergeWithPrevious && MBBI == MBB.begin()) || (!doMergeWithPrevious && MBBI == MBB.end())) return 0; MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI; MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr : std::next(MBBI); unsigned Opc = PI->getOpcode(); int Offset = 0; if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 || Opc == X86::ADD32ri || Opc == X86::ADD32ri8 || Opc == X86::LEA32r || Opc == X86::LEA64_32r) && PI->getOperand(0).getReg() == StackPtr){ Offset += PI->getOperand(2).getImm(); MBB.erase(PI); if (!doMergeWithPrevious) MBBI = NI; } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 || Opc == X86::SUB32ri || Opc == X86::SUB32ri8) && PI->getOperand(0).getReg() == StackPtr) { Offset -= PI->getOperand(2).getImm(); MBB.erase(PI); if (!doMergeWithPrevious) MBBI = NI; } return Offset; } static bool isEAXLiveIn(MachineFunction &MF) { for (MachineRegisterInfo::livein_iterator II = MF.getRegInfo().livein_begin(), EE = MF.getRegInfo().livein_end(); II != EE; ++II) { unsigned Reg = II->first; if (Reg == X86::EAX || Reg == X86::AX || Reg == X86::AH || Reg == X86::AL) return true; } return false; } void X86FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo *MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); // Add callee saved registers to move list. const std::vector &CSI = MFI->getCalleeSavedInfo(); if (CSI.empty()) return; // Calculate offsets. 
for (std::vector::const_iterator I = CSI.begin(), E = CSI.end(); I != E; ++I) { int64_t Offset = MFI->getObjectOffset(I->getFrameIdx()); unsigned Reg = I->getReg(); unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } } /// usesTheStack - This function checks if any of the users of EFLAGS /// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has /// to use the stack, and if we don't adjust the stack we clobber the first /// frame index. /// See X86InstrInfo::copyPhysReg. static bool usesTheStack(const MachineFunction &MF) { const MachineRegisterInfo &MRI = MF.getRegInfo(); for (MachineRegisterInfo::reg_instr_iterator ri = MRI.reg_instr_begin(X86::EFLAGS), re = MRI.reg_instr_end(); ri != re; ++ri) if (ri->isCopy()) return true; return false; } void X86FrameLowering::getStackProbeFunction(const X86Subtarget &STI, unsigned &CallOp, const char *&Symbol) { CallOp = STI.is64Bit() ? X86::W64ALLOCA : X86::CALLpcrel32; if (STI.is64Bit()) { if (STI.isTargetCygMing()) { Symbol = "___chkstk_ms"; } else { Symbol = "__chkstk"; } } else if (STI.isTargetCygMing()) Symbol = "_alloca"; else Symbol = "_chkstk"; } /// emitPrologue - Push callee-saved registers onto the stack, which /// automatically adjust the stack pointer. Adjust the stack pointer to allocate /// space for local variables. Also emit labels used by the exception handler to /// generate the exception handling frames. 
/*
  Here's a gist of what gets emitted:

  ; Establish frame pointer, if needed
  [if needs FP]
      push  %rbp
      .cfi_def_cfa_offset 16
      .cfi_offset %rbp, -16
      .seh_pushreg %rbp
      mov  %rsp, %rbp
      .cfi_def_cfa_register %rbp

  ; Spill general-purpose registers
  [for all callee-saved GPRs]
      pushq %<reg>
      [if not needs FP]
         .cfi_def_cfa_offset (offset from RETADDR)
      .seh_pushreg %<reg>

  ; If the required stack alignment > default stack alignment
  ; rsp needs to be re-aligned.  This creates a "re-alignment gap"
  ; of unknown size in the stack frame.
  [if stack needs re-alignment]
      and  $MASK, %rsp

  ; Allocate space for locals
  [if target is Windows and allocated space > 4096 bytes]
      ; Windows needs special care for allocations larger
      ; than one page.
      mov $NNN, %rax
      call ___chkstk_ms/___chkstk
      sub  %rax, %rsp
  [else]
      sub  $NNN, %rsp

  [if needs FP]
      .seh_stackalloc (size of XMM spill slots)
      .seh_setframe %rbp, SEHFrameOffset ; = size of all spill slots
  [else]
      .seh_stackalloc NNN

  ; Spill XMMs
  ; Note, that while only Windows 64 ABI specifies XMMs as callee-preserved,
  ; they may get spilled on any platform, if the current function
  ; calls @llvm.eh.unwind.init
  [if needs FP]
      [for all callee-saved XMM registers]
          movaps  %<xmm reg>, -MMM(%rbp)
      [for all callee-saved XMM registers]
          .seh_savexmm %<xmm reg>, (-MMM + SEHFrameOffset)
              ; i.e. the offset relative to (%rbp - SEHFrameOffset)
  [else]
      [for all callee-saved XMM registers]
          movaps  %<xmm reg>, KKK(%rsp)
      [for all callee-saved XMM registers]
          .seh_savexmm %<xmm reg>, KKK

  .seh_endprologue

  [if needs base pointer]
      mov  %rsp, %rbx
      [if needs to restore base pointer]
          mov %rsp, -MMM(%rbp)

  ; Emit CFI info
  [if needs FP]
      [for all callee-saved registers]
          .cfi_offset %<reg>, (offset from %rbp)
  [else]
       .cfi_def_cfa_offset (offset from RETADDR)
      [for all callee-saved registers]
          .cfi_offset %<reg>, (offset from %rsp)

  Notes:
  - .seh directives are emitted only for Windows 64 ABI
  - .cfi directives are emitted for all other ABIs
  - for 32-bit code, substitute %e?? registers for %r??
*/ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB. MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *Fn = MF.getFunction(); const X86RegisterInfo *RegInfo = static_cast(MF.getSubtarget().getRegisterInfo()); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo(); uint64_t MaxAlign = MFI->getMaxAlignment(); // Desired stack alignment. uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate. bool HasFP = hasFP(MF); const X86Subtarget &STI = MF.getTarget().getSubtarget(); bool Is64Bit = STI.is64Bit(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); bool IsWin64 = STI.isTargetWin64(); // Not necessarily synonymous with IsWin64. bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinEH = IsWinEH && Fn->needsUnwindTableEntry(); bool NeedsDwarfCFI = !IsWinEH && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); bool UseLEA = STI.useLeaForSP(); unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); const unsigned MachineFramePtr = STI.isTarget64BitILP32() ? getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); unsigned BasePtr = RegInfo->getBaseRegister(); DebugLoc DL; // If we're forcing a stack realignment we can't rely on just the frame // info, we need to know the ABI stack alignment as well in case we // have a call out. Otherwise just make sure we have some alignment - we'll // go with the minimum SlotSize. if (ForceStackAlign) { if (MFI->hasCalls()) MaxAlign = (StackAlign > MaxAlign) ? 
StackAlign : MaxAlign; else if (MaxAlign < SlotSize) MaxAlign = SlotSize; } // Add RETADDR move area to callee saved frame size. int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); bool UseStackProbe = (STI.isOSWindows() && !STI.isTargetMachO()); // The default stack probe size is 4096 if the function has no stackprobesize // attribute. unsigned StackProbeSize = 4096; if (Fn->hasFnAttribute("stack-probe-size")) Fn->getFnAttribute("stack-probe-size") .getValueAsString() .getAsInteger(0, StackProbeSize); // If this is x86-64 and the Red Zone is not disabled, if we are a leaf // function, and use up to 128 bytes of stack space, don't have a frame // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::NoRedZone) && !RegInfo->needsStackRealignment(MF) && !MFI->hasVarSizedObjects() && // No dynamic alloca. !MFI->adjustsStack() && // No calls. !IsWin64 && // Win64 has no Red Zone !usesTheStack(MF) && // Don't push and pop. !MF.shouldSplitStack()) { // Regular stack uint64_t MinSize = X86FI->getCalleeSavedFrameSize(); if (HasFP) MinSize += SlotSize; StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); MFI->setStackSize(StackSize); } // Insert stack pointer adjustment for later moving of return addr. Only // applies to tail call optimized functions where the callee argument stack // size is bigger than the callers. 
if (TailCallReturnAddrDelta < 0) { MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(getSUBriOpcode(Uses64BitFramePtr, -TailCallReturnAddrDelta)), StackPtr) .addReg(StackPtr) .addImm(-TailCallReturnAddrDelta) .setMIFlag(MachineInstr::FrameSetup); MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead. } // Mapping for machine moves: // // DST: VirtualFP AND // SRC: VirtualFP => DW_CFA_def_cfa_offset // ELSE => DW_CFA_def_cfa // // SRC: VirtualFP AND // DST: Register => DW_CFA_def_cfa_register // // ELSE // OFFSET < 0 => DW_CFA_offset_extended_sf // REG < 64 => DW_CFA_offset + Reg // ELSE => DW_CFA_offset_extended uint64_t NumBytes = 0; int stackGrowth = -SlotSize; if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; // If required, include space for extra hidden slot for stashing base pointer. if (X86FI->getRestoreBasePointer()) FrameSize += SlotSize; if (RegInfo->needsStackRealignment(MF)) { // Callee-saved registers are pushed on stack before the stack // is realigned. FrameSize -= X86FI->getCalleeSavedFrameSize(); NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; } else { NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize(); } // Get the offset of the stack slot for the EBP register, which is // guaranteed to be the last slot by processFunctionBeforeFrameFinalized. // Update the frame offset adjustment. MFI->setOffsetAdjustment(-NumBytes); // Save EBP/RBP into the appropriate stack slot. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r)) .addReg(MachineFramePtr, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); if (NeedsDwarfCFI) { // Mark the place where EBP/RBP was saved. // Define the current CFA rule to use the provided offset. 
assert(StackSize); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); // Change the rule for the FramePtr to be an "offset" rule. unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); CFIIndex = MMI.addFrameInst( MCCFIInstruction::createOffset(nullptr, DwarfFramePtr, 2 * stackGrowth)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } if (NeedsWinEH) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) .addImm(FramePtr) .setMIFlag(MachineInstr::FrameSetup); } // Update EBP with the new base value. BuildMI(MBB, MBBI, DL, TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), FramePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); if (NeedsDwarfCFI) { // Mark effective beginning of when frame pointer becomes valid. // Define the current CFA to use the EBP/RBP register. unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(MachineFramePtr, true); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } // Mark the FramePtr as live-in in every block. for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) I->addLiveIn(MachineFramePtr); } else { NumBytes = StackSize - X86FI->getCalleeSavedFrameSize(); } // Skip the callee-saved push instructions. bool PushedRegs = false; int StackOffset = 2 * stackGrowth; while (MBBI != MBB.end() && (MBBI->getOpcode() == X86::PUSH32r || MBBI->getOpcode() == X86::PUSH64r)) { PushedRegs = true; unsigned Reg = MBBI->getOperand(0).getReg(); ++MBBI; if (!HasFP && NeedsDwarfCFI) { // Mark callee-saved push instruction. // Define the current CFA rule to use the provided offset. 
assert(StackSize); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); StackOffset += stackGrowth; } if (NeedsWinEH) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( MachineInstr::FrameSetup); } } // Realign stack after we pushed callee-saved registers (so that we'll be // able to calculate their offsets from the frame pointer). if (RegInfo->needsStackRealignment(MF)) { assert(HasFP && "There should be a frame pointer if stack is realigned."); uint64_t Val = -MaxAlign; MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(getANDriOpcode(Uses64BitFramePtr, Val)), StackPtr) .addReg(StackPtr) .addImm(Val) .setMIFlag(MachineInstr::FrameSetup); // The EFLAGS implicit def is dead. MI->getOperand(3).setIsDead(); } // If there is an SUB32ri of ESP immediately before this instruction, merge // the two. This can be the case when tail call elimination is enabled and // the callee has more arguments then the caller. NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true); // If there is an ADD32ri or SUB32ri of ESP immediately after this // instruction, merge the two instructions. mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes); // Adjust stack pointer: ESP -= numbytes. // Windows and cygwin/mingw require a prologue helper routine when allocating // more than 4K bytes on the stack. Windows uses __chkstk and cygwin/mingw // uses __alloca. __alloca and the 32-bit version of __chkstk will probe the // stack and adjust the stack pointer in one go. The 64-bit version of // __chkstk is only responsible for probing the stack. The 64-bit prologue is // responsible for adjusting the stack pointer. Touching the stack at 4K // increments is necessary to ensure that the guard pages used by the OS // virtual memory manager are allocated in correct sequence. 
if (NumBytes >= StackProbeSize && UseStackProbe) { const char *StackProbeSymbol; unsigned CallOp; getStackProbeFunction(STI, CallOp, StackProbeSymbol); // Check whether EAX is livein for this function. bool isEAXAlive = isEAXLiveIn(MF); if (isEAXAlive) { // Sanity check that EAX is not livein for this function. // It should not be, so throw an assert. assert(!Is64Bit && "EAX is livein in x64 case!"); // Save EAX BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r)) .addReg(X86::EAX, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); } if (Is64Bit) { // Handle the 64-bit Windows ABI case where we need to call __chkstk. // Function prologue is responsible for adjusting the stack pointer. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64ri), X86::RAX) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); } else { // Allocate NumBytes-4 bytes on stack in case of isEAXAlive. // We'll also use 4 already allocated bytes for EAX. BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) .addImm(isEAXAlive ? NumBytes - 4 : NumBytes) .setMIFlag(MachineInstr::FrameSetup); } BuildMI(MBB, MBBI, DL, TII.get(CallOp)) .addExternalSymbol(StackProbeSymbol) .addReg(StackPtr, RegState::Define | RegState::Implicit) .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit) .setMIFlag(MachineInstr::FrameSetup); if (Is64Bit) { // MSVC x64's __chkstk and cygwin/mingw's ___chkstk_ms do not adjust %rsp // themself. It also does not clobber %rax so we can reuse it when // adjusting %rsp. 
BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr) .addReg(StackPtr) .addReg(X86::RAX) .setMIFlag(MachineInstr::FrameSetup); } if (isEAXAlive) { // Restore EAX MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm), X86::EAX), StackPtr, false, NumBytes - 4); MI->setFlag(MachineInstr::FrameSetup); MBB.insert(MBBI, MI); } } else if (NumBytes) { emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); } int SEHFrameOffset = 0; if (NeedsWinEH) { if (HasFP) { // We need to set frame base offset low enough such that all saved // register offsets would be positive relative to it, but we can't // just use NumBytes, because .seh_setframe offset must be <=240. // So we pretend to have only allocated enough space to spill the // non-volatile registers. // We don't care about the rest of stack allocation, because unwinder // will restore SP to (BP - SEHFrameOffset) for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { int offset = MFI->getObjectOffset(Info.getFrameIdx()); SEHFrameOffset = std::max(SEHFrameOffset, std::abs(offset)); } SEHFrameOffset += SEHFrameOffset % 16; // ensure alignmant // This only needs to account for XMM spill slots, GPR slots // are covered by the .seh_pushreg's emitted above. 
unsigned Size = SEHFrameOffset - X86FI->getCalleeSavedFrameSize(); if (Size) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) .addImm(Size) .setMIFlag(MachineInstr::FrameSetup); } BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) .addImm(SEHFrameOffset) .setMIFlag(MachineInstr::FrameSetup); } else { // SP will be the base register for restoring XMMs if (NumBytes) { BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_StackAlloc)) .addImm(NumBytes) .setMIFlag(MachineInstr::FrameSetup); } } } // Skip the rest of register spilling code while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup)) ++MBBI; // Emit SEH info for non-GPRs if (NeedsWinEH) { for (const CalleeSavedInfo &Info : MFI->getCalleeSavedInfo()) { unsigned Reg = Info.getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; assert(X86::FR64RegClass.contains(Reg) && "Unexpected register class"); int Offset = getFrameIndexOffset(MF, Info.getFrameIdx()); Offset += SEHFrameOffset; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) .addImm(Reg) .addImm(Offset) .setMIFlag(MachineInstr::FrameSetup); } BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_EndPrologue)) .setMIFlag(MachineInstr::FrameSetup); } // If we need a base pointer, set it up here. It's whatever the value // of the stack pointer is at this point. Any variable size objects // will be allocated after this, so we can still use the base pointer // to reference locals. if (RegInfo->hasBasePointer(MF)) { // Update the base pointer with the current stack pointer. unsigned Opc = Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr; BuildMI(MBB, MBBI, DL, TII.get(Opc), BasePtr) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); if (X86FI->getRestoreBasePointer()) { // Stash value of base pointer. Saving RSP instead of EBP shortens dependence chain. unsigned Opm = Uses64BitFramePtr ? 
X86::MOV64mr : X86::MOV32mr; addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opm)), FramePtr, true, X86FI->getRestoreBasePointerOffset()) .addReg(StackPtr) .setMIFlag(MachineInstr::FrameSetup); } } if (((!HasFP && NumBytes) || PushedRegs) && NeedsDwarfCFI) { // Mark end of stack pointer adjustment. if (!HasFP && NumBytes) { // Define the current CFA rule to use the provided offset. assert(StackSize); unsigned CFIIndex = MMI.addFrameInst( MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize + stackGrowth)); BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex); } // Emit DWARF info specifying the offsets of the callee-saved registers. if (PushedRegs) emitCalleeSavedFrameMoves(MBB, MBBI, DL); } } void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo(); const X86RegisterInfo *RegInfo = static_cast(MF.getSubtarget().getRegisterInfo()); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); assert(MBBI != MBB.end() && "Returning block has no instructions"); unsigned RetOpcode = MBBI->getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); const X86Subtarget &STI = MF.getTarget().getSubtarget(); bool Is64Bit = STI.is64Bit(); // standard x86_64 and NaCl use 64-bit frame/stack pointers, x32 - 32-bit. const bool Uses64BitFramePtr = STI.isTarget64BitLP64() || STI.isTargetNaCl64(); const bool Is64BitILP32 = STI.isTarget64BitILP32(); bool UseLEA = STI.useLeaForSP(); unsigned StackAlign = getStackAlignment(); unsigned SlotSize = RegInfo->getSlotSize(); unsigned FramePtr = RegInfo->getFrameRegister(MF); unsigned MachineFramePtr = Is64BitILP32 ? 
getX86SubSuperRegister(FramePtr, MVT::i64, false) : FramePtr; unsigned StackPtr = RegInfo->getStackRegister(); bool IsWinEH = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsWinEH = IsWinEH && MF.getFunction()->needsUnwindTableEntry(); switch (RetOpcode) { default: llvm_unreachable("Can only insert epilog into returning blocks"); case X86::RETQ: case X86::RETL: case X86::RETIL: case X86::RETIQ: case X86::TCRETURNdi: case X86::TCRETURNri: case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: case X86::EH_RETURN: case X86::EH_RETURN64: break; // These are ok } // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI->getStackSize(); uint64_t MaxAlign = MFI->getMaxAlignment(); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); uint64_t NumBytes = 0; // If we're forcing a stack realignment we can't rely on just the frame // info, we need to know the ABI stack alignment as well in case we // have a call out. Otherwise just make sure we have some alignment - we'll // go with the minimum. if (ForceStackAlign) { if (MFI->hasCalls()) MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; else MaxAlign = MaxAlign ? MaxAlign : 4; } if (hasFP(MF)) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; if (RegInfo->needsStackRealignment(MF)) { // Callee-saved registers were pushed on stack before the stack // was realigned. FrameSize -= CSSize; NumBytes = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign; } else { NumBytes = FrameSize - CSSize; } // Pop EBP. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr); } else { NumBytes = StackSize - CSSize; } // Skip the callee-saved pop instructions. 
while (MBBI != MBB.begin()) { MachineBasicBlock::iterator PI = std::prev(MBBI); unsigned Opc = PI->getOpcode(); if (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::DBG_VALUE && !PI->isTerminator()) break; --MBBI; } MachineBasicBlock::iterator FirstCSPop = MBBI; DL = MBBI->getDebugLoc(); // If there is an ADD32ri or SUB32ri of ESP immediately before this // instruction, merge the two instructions. if (NumBytes || MFI->hasVarSizedObjects()) mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes); // If dynamic alloca is used, then reset esp to point to the last callee-saved // slot before popping them off! Same applies for the case, when stack was // realigned. if (RegInfo->needsStackRealignment(MF) || MFI->hasVarSizedObjects()) { if (RegInfo->needsStackRealignment(MF)) MBBI = FirstCSPop; if (CSSize != 0) { unsigned Opc = getLEArOpcode(Uses64BitFramePtr); addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr), FramePtr, false, -CSSize); --MBBI; } else { unsigned Opc = (Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr); BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr) .addReg(FramePtr); --MBBI; } } else if (NumBytes) { // Adjust stack pointer back: ESP += numbytes. emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); --MBBI; } // Windows unwinder will not invoke function's exception handler if IP is // either in prologue or in epilogue. This behavior causes a problem when a // call immediately precedes an epilogue, because the return address points // into the epilogue. To cope with that, we insert an epilogue marker here, // then replace it with a 'nop' if it ends up immediately after a CALL in the // final emitted code. if (NeedsWinEH) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); // We're returning from function via eh_return. 
if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) { MBBI = MBB.getLastNonDebugInstr(); MachineOperand &DestAddr = MBBI->getOperand(0); assert(DestAddr.isReg() && "Offset should be in register!"); BuildMI(MBB, MBBI, DL, TII.get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr).addReg(DestAddr.getReg()); } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 || RetOpcode == X86::TCRETURNmi64) { bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64; // Tail call return: adjust the stack pointer and jump to callee. MBBI = MBB.getLastNonDebugInstr(); MachineOperand &JumpTarget = MBBI->getOperand(0); MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1); assert(StackAdjust.isImm() && "Expecting immediate value."); // Adjust stack pointer. int StackAdj = StackAdjust.getImm(); int MaxTCDelta = X86FI->getTCReturnAddrDelta(); int Offset = 0; assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive"); // Incoporate the retaddr area. Offset = StackAdj-MaxTCDelta; assert(Offset >= 0 && "Offset should never be negative"); if (Offset) { // Check for possible merge with preceding ADD instruction. Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true); emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); } // Jump to label or value in register. if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi) ? 
X86::TAILJMPd : X86::TAILJMPd64)); if (JumpTarget.isGlobal()) MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), JumpTarget.getTargetFlags()); else { assert(JumpTarget.isSymbol()); MIB.addExternalSymbol(JumpTarget.getSymbolName(), JumpTarget.getTargetFlags()); } } else if (RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64) { MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNmi) ? X86::TAILJMPm : X86::TAILJMPm64)); for (unsigned i = 0; i != 5; ++i) MIB.addOperand(MBBI->getOperand(i)); } else if (RetOpcode == X86::TCRETURNri64) { BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)). addReg(JumpTarget.getReg(), RegState::Kill); } else { BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)). addReg(JumpTarget.getReg(), RegState::Kill); } MachineInstr *NewMI = std::prev(MBBI); NewMI->copyImplicitOps(MF, MBBI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); } else if ((RetOpcode == X86::RETQ || RetOpcode == X86::RETL || RetOpcode == X86::RETIQ || RetOpcode == X86::RETIL) && (X86FI->getTCReturnAddrDelta() < 0)) { // Add the return addr area delta back since we are not tail calling. int delta = -1*X86FI->getTCReturnAddrDelta(); MBBI = MBB.getLastNonDebugInstr(); // Check for possible merge with preceding ADD instruction. delta += mergeSPUpdates(MBB, MBBI, StackPtr, true); emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, Uses64BitFramePtr, UseLEA, TII, *RegInfo); } } int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const { const X86RegisterInfo *RegInfo = static_cast(MF.getSubtarget().getRegisterInfo()); const MachineFrameInfo *MFI = MF.getFrameInfo(); int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); uint64_t StackSize = MFI->getStackSize(); if (RegInfo->hasBasePointer(MF)) { assert (hasFP(MF) && "VLAs and dynamic stack realign, but no FP?!"); if (FI < 0) { // Skip the saved EBP. 
return Offset + RegInfo->getSlotSize(); } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; } } else if (RegInfo->needsStackRealignment(MF)) { if (FI < 0) { // Skip the saved EBP. return Offset + RegInfo->getSlotSize(); } else { assert((-(Offset + StackSize)) % MFI->getObjectAlignment(FI) == 0); return Offset + StackSize; } // FIXME: Support tail calls } else { if (!hasFP(MF)) return Offset + StackSize; // Skip the saved EBP. Offset += RegInfo->getSlotSize(); // Skip the RETADDR move area const X86MachineFunctionInfo *X86FI = MF.getInfo(); int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) Offset -= TailCallReturnAddrDelta; } return Offset; } int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const X86RegisterInfo *RegInfo = static_cast(MF.getSubtarget().getRegisterInfo()); // We can't calculate offset from frame pointer if the stack is realigned, // so enforce usage of stack/base pointer. The base pointer is used when we // have dynamic allocas in addition to dynamic realignment. if (RegInfo->hasBasePointer(MF)) FrameReg = RegInfo->getBaseRegister(); else if (RegInfo->needsStackRealignment(MF)) FrameReg = RegInfo->getStackRegister(); else FrameReg = RegInfo->getFrameRegister(MF); return getFrameIndexOffset(MF, FI); } // Simplified from getFrameIndexOffset keeping only StackPointer cases int X86FrameLowering::getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); // Does not include any dynamic realign. const uint64_t StackSize = MFI->getStackSize(); { #ifndef NDEBUG const X86RegisterInfo *RegInfo = static_cast(MF.getSubtarget().getRegisterInfo()); // Note: LLVM arranges the stack as: // Args > Saved RetPC (<--FP) > CSRs > dynamic alignment (<--BP) // > "Stack Slots" (<--SP) // We can always address StackSlots from RSP. 
We can usually (unless // needsStackRealignment) address CSRs from RSP, but sometimes need to // address them from RBP. FixedObjects can be placed anywhere in the stack // frame depending on their specific requirements (i.e. we can actually // refer to arguments to the function which are stored in the *callers* // frame). As a result, THE RESULT OF THIS CALL IS MEANINGLESS FOR CSRs // AND FixedObjects IFF needsStackRealignment or hasVarSizedObject. assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); // We don't handle tail calls, and shouldn't be seeing them // either. int TailCallReturnAddrDelta = MF.getInfo()->getTCReturnAddrDelta(); assert(!(TailCallReturnAddrDelta < 0) && "we don't handle this case!"); #endif } // This is how the math works out: // // %rsp grows (i.e. gets lower) left to right. Each box below is // one word (eight bytes). Obj0 is the stack slot we're trying to // get to. // // ---------------------------------- // | BP | Obj0 | Obj1 | ... | ObjN | // ---------------------------------- // ^ ^ ^ ^ // A B C E // // A is the incoming stack pointer. // (B - A) is the local area offset (-8 for x86-64) [1] // (C - A) is the Offset returned by MFI->getObjectOffset for Obj0 [2] // // |(E - B)| is the StackSize (absolute value, positive). For a // stack that grown down, this works out to be (B - E). [3] // // E is also the value of %rsp after stack has been set up, and we // want (C - E) -- the value we can add to %rsp to get to Obj0. 
Now // (C - E) == (C - A) - (B - A) + (B - E) // { Using [1], [2] and [3] above } // == getObjectOffset - LocalAreaOffset + StackSize // // Get the Offset from the StackPointer int Offset = MFI->getObjectOffset(FI) - getOffsetOfLocalArea(); return Offset + StackSize; } // Simplified from getFrameIndexReference keeping only StackPointer cases int X86FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, unsigned &FrameReg) const { const X86RegisterInfo *RegInfo = static_cast(MF.getSubtarget().getRegisterInfo()); assert(!RegInfo->hasBasePointer(MF) && "we don't handle this case"); FrameReg = RegInfo->getStackRegister(); return getFrameIndexOffsetFromSP(MF, FI); } bool X86FrameLowering::assignCalleeSavedSpillSlots( MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector &CSI) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const X86RegisterInfo *RegInfo = static_cast(MF.getSubtarget().getRegisterInfo()); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo(); unsigned CalleeSavedFrameSize = 0; int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta(); if (hasFP(MF)) { // emitPrologue always spills frame register the first thing. SpillSlotOffset -= SlotSize; MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); // Since emitPrologue and emitEpilogue will handle spilling and restoring of // the frame register, we can delete it from CSI list and not have to worry // about avoiding it later. unsigned FPReg = RegInfo->getFrameRegister(MF); for (unsigned i = 0; i < CSI.size(); ++i) { if (TRI->regsOverlap(CSI[i].getReg(),FPReg)) { CSI.erase(CSI.begin() + i); break; } } } // Assign slots for GPRs. It increases frame size. 
for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i - 1].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; SpillSlotOffset -= SlotSize; CalleeSavedFrameSize += SlotSize; int SlotIndex = MFI->CreateFixedSpillStackObject(SlotSize, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); } X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize); // Assign slots for XMMs. for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i - 1].getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg); // ensure alignment SpillSlotOffset -= std::abs(SpillSlotOffset) % RC->getAlignment(); // spill into slot SpillSlotOffset -= RC->getSize(); int SlotIndex = MFI->CreateFixedSpillStackObject(RC->getSize(), SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); MFI->ensureMaxAlignment(RC->getAlignment()); } return true; } bool X86FrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const { DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const X86Subtarget &STI = MF.getTarget().getSubtarget(); // Push GPRs. It increases frame size. unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r; for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i - 1].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); BuildMI(MBB, MI, DL, TII.get(Opc)).addReg(Reg, RegState::Kill) .setMIFlag(MachineInstr::FrameSetup); } // Make XMM regs spilled. X86 does not have ability of push/pop XMM. // It can be done by spilling XMMs to stack frame. 
for (unsigned i = CSI.size(); i != 0; --i) { unsigned Reg = CSI[i-1].getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; // Add the callee-saved register as live-in. It's killed at the spill. MBB.addLiveIn(Reg); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.storeRegToStackSlot(MBB, MI, Reg, true, CSI[i - 1].getFrameIdx(), RC, TRI); --MI; MI->setFlag(MachineInstr::FrameSetup); ++MI; } return true; } bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; DebugLoc DL = MBB.findDebugLoc(MI); MachineFunction &MF = *MBB.getParent(); const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const X86Subtarget &STI = MF.getTarget().getSubtarget(); // Reload XMMs from stack frame. for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg)) continue; const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); TII.loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RC, TRI); } // POP GPRs. unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r; for (unsigned i = 0, e = CSI.size(); i != e; ++i) { unsigned Reg = CSI[i].getReg(); if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg)) continue; BuildMI(MBB, MI, DL, TII.get(Opc), Reg); } return true; } void X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo *MFI = MF.getFrameInfo(); const X86RegisterInfo *RegInfo = static_cast(MF.getSubtarget().getRegisterInfo()); unsigned SlotSize = RegInfo->getSlotSize(); X86MachineFunctionInfo *X86FI = MF.getInfo(); int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta(); if (TailCallReturnAddrDelta < 0) { // create RETURNADDR area // arg // arg // RETADDR // { ... 
// RETADDR area // ... // } // [EBP] MFI->CreateFixedObject(-TailCallReturnAddrDelta, TailCallReturnAddrDelta - SlotSize, true); } // Spill the BasePtr if it's used. if (RegInfo->hasBasePointer(MF)) MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister()); } static bool HasNestArgument(const MachineFunction *MF) { const Function *F = MF->getFunction(); for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; I++) { if (I->hasNestAttr()) return true; } return false; } /// GetScratchRegister - Get a temp register for performing work in the /// segmented stack and the Erlang/HiPE stack prologue. Depending on platform /// and the properties of the function either one or two registers will be /// needed. Set primary to true for the first register, false for the second. static unsigned GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); // Erlang stuff. if (CallingConvention == CallingConv::HiPE) { if (Is64Bit) return Primary ? X86::R14 : X86::R13; else return Primary ? X86::EBX : X86::EDI; } if (Is64Bit) { if (IsLP64) return Primary ? X86::R11 : X86::R12; else return Primary ? X86::R11D : X86::R12D; } bool IsNested = HasNestArgument(&MF); if (CallingConvention == CallingConv::X86_FastCall || CallingConvention == CallingConv::Fast) { if (IsNested) report_fatal_error("Segmented stacks does not support fastcall with " "nested function."); return Primary ? X86::EAX : X86::ECX; } if (IsNested) return Primary ? X86::EDX : X86::EAX; return Primary ? X86::ECX : X86::EAX; } // The stack limit in the TCB is set to this many bytes above the actual stack // limit. 
static const uint64_t kSplitStackAvailable = 256;

/// adjustForSegmentedStacks - Prepend a segmented-stack check to \p MF.
///
/// Two blocks are created in front of the existing entry block:
///   checkMBB - compares the stack pointer (or SP - StackSize for frames of
///              kSplitStackAvailable bytes or more) with the stacklet limit
///              read from a per-platform TLS slot and jumps to the original
///              prologue when there is enough room;
///   allocMBB - passes the frame size and argument size to __morestack,
///              calls it and returns through a MORESTACK_RET pseudo.
/// Nothing is emitted for functions whose stack size is zero.  Vararg
/// functions and unsupported platforms are reported as fatal errors.
void
X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
  MachineBasicBlock &prologueMBB = MF.front();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
  uint64_t StackSize;
  const X86Subtarget &STI = MF.getTarget().getSubtarget<X86Subtarget>();
  bool Is64Bit = STI.is64Bit();
  const bool IsLP64 = STI.isTarget64BitLP64();
  unsigned TlsReg, TlsOffset;
  DebugLoc DL;

  unsigned ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true);
  assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
         "Scratch register is live-in");

  if (MF.getFunction()->isVarArg())
    report_fatal_error("Segmented stacks do not support vararg functions.");
  if (!STI.isTargetLinux() && !STI.isTargetDarwin() &&
      !STI.isTargetWin32() && !STI.isTargetWin64() &&
      !STI.isTargetFreeBSD() && !STI.isTargetDragonFly())
    report_fatal_error("Segmented stacks not supported on this platform.");

  // Eventually StackSize will be calculated by a link-time pass; which will
  // also decide whether checking code needs to be injected into this particular
  // prologue.
  StackSize = MFI->getStackSize();

  // Do not generate a prologue for functions with a stack of size zero
  if (StackSize == 0)
    return;

  MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
  MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  bool IsNested = false;

  // We need to know if the function has a nest argument only in 64 bit mode.
  if (Is64Bit)
    IsNested = HasNestArgument(&MF);

  // The MOV R10, RAX needs to be in a different block, since the RET we emit in
  // allocMBB needs to be last (terminating) instruction.

  for (MachineBasicBlock::livein_iterator i = prologueMBB.livein_begin(),
         e = prologueMBB.livein_end(); i != e; ++i) {
    allocMBB->addLiveIn(*i);
    checkMBB->addLiveIn(*i);
  }

  if (IsNested)
    allocMBB->addLiveIn(IsLP64 ? X86::R10 : X86::R10D);

  // checkMBB is pushed last so that it becomes the new entry block, followed
  // by allocMBB and then the original prologue.
  MF.push_front(allocMBB);
  MF.push_front(checkMBB);

  // When the frame size is less than 256 we just compare the stack
  // boundary directly to the value of the stack pointer, per gcc.
  bool CompareStackPointer = StackSize < kSplitStackAvailable;

  // Read the limit off the current stacklet off the stack_guard location.
  if (Is64Bit) {
    if (STI.isTargetLinux()) {
      TlsReg = X86::FS;
      TlsOffset = IsLP64 ? 0x70 : 0x40;
    } else if (STI.isTargetDarwin()) {
      TlsReg = X86::GS;
      TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
    } else if (STI.isTargetWin64()) {
      TlsReg = X86::GS;
      TlsOffset = 0x28; // pvArbitrary, reserved for application use
    } else if (STI.isTargetFreeBSD()) {
      TlsReg = X86::FS;
      TlsOffset = 0x18;
    } else if (STI.isTargetDragonFly()) {
      TlsReg = X86::FS;
      TlsOffset = 0x20; // use tls_tcb.tcb_segstack
    } else {
      report_fatal_error("Segmented stacks not supported on this platform.");
    }

    if (CompareStackPointer)
      ScratchReg = IsLP64 ? X86::RSP : X86::ESP;
    else
      BuildMI(checkMBB, DL, TII.get(IsLP64 ? X86::LEA64r : X86::LEA64_32r),
              ScratchReg).addReg(X86::RSP)
        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);

    BuildMI(checkMBB, DL,
            TII.get(IsLP64 ? X86::CMP64rm : X86::CMP32rm)).addReg(ScratchReg)
      .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
  } else {
    if (STI.isTargetLinux()) {
      TlsReg = X86::GS;
      TlsOffset = 0x30;
    } else if (STI.isTargetDarwin()) {
      TlsReg = X86::GS;
      TlsOffset = 0x48 + 90*4;
    } else if (STI.isTargetWin32()) {
      TlsReg = X86::FS;
      TlsOffset = 0x14; // pvArbitrary, reserved for application use
    } else if (STI.isTargetDragonFly()) {
      TlsReg = X86::FS;
      TlsOffset = 0x10; // use tls_tcb.tcb_segstack
    } else if (STI.isTargetFreeBSD()) {
      report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
    } else {
      report_fatal_error("Segmented stacks not supported on this platform.");
    }

    if (CompareStackPointer)
      ScratchReg = X86::ESP;
    else
      BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
        .addImm(1).addReg(0).addImm(-StackSize).addReg(0);

    if (STI.isTargetLinux() || STI.isTargetWin32() || STI.isTargetWin64() ||
        STI.isTargetDragonFly()) {
      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
        .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
    } else if (STI.isTargetDarwin()) {

      // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
      unsigned ScratchReg2;
      bool SaveScratch2;
      if (CompareStackPointer) {
        // The primary scratch register is available for holding the TLS offset.
        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, true);
        SaveScratch2 = false;
      } else {
        // Need to use a second register to hold the TLS offset
        ScratchReg2 = GetScratchRegister(Is64Bit, IsLP64, MF, false);

        // Unfortunately, with fastcc the second scratch register may hold an
        // argument.
        SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
      }

      // If Scratch2 is live-in then it needs to be saved.
      assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
             "Scratch register is live-in and not saved");

      if (SaveScratch2)
        BuildMI(checkMBB, DL, TII.get(X86::PUSH32r))
          .addReg(ScratchReg2, RegState::Kill);

      BuildMI(checkMBB, DL, TII.get(X86::MOV32ri), ScratchReg2)
        .addImm(TlsOffset);
      BuildMI(checkMBB, DL, TII.get(X86::CMP32rm))
        .addReg(ScratchReg)
        .addReg(ScratchReg2).addImm(1).addReg(0)
        .addImm(0)
        .addReg(TlsReg);

      if (SaveScratch2)
        BuildMI(checkMBB, DL, TII.get(X86::POP32r), ScratchReg2);
    }
  }

  // This jump is taken if SP >= (Stacklet Limit + Stack Space required).
  // It jumps to normal execution of the function body.
  BuildMI(checkMBB, DL, TII.get(X86::JA_1)).addMBB(&prologueMBB);

  // On 32 bit we first push the arguments size and then the frame size. On 64
  // bit, we pass the stack frame size in r10 and the argument size in r11.
  if (Is64Bit) {
    // Functions with nested arguments use R10, so it needs to be saved across
    // the call to _morestack

    const unsigned RegAX = IsLP64 ? X86::RAX : X86::EAX;
    const unsigned Reg10 = IsLP64 ? X86::R10 : X86::R10D;
    const unsigned Reg11 = IsLP64 ? X86::R11 : X86::R11D;
    const unsigned MOVrr = IsLP64 ? X86::MOV64rr : X86::MOV32rr;
    const unsigned MOVri = IsLP64 ? X86::MOV64ri : X86::MOV32ri;

    if (IsNested)
      BuildMI(allocMBB, DL, TII.get(MOVrr), RegAX).addReg(Reg10);

    BuildMI(allocMBB, DL, TII.get(MOVri), Reg10)
      .addImm(StackSize);
    BuildMI(allocMBB, DL, TII.get(MOVri), Reg11)
      .addImm(X86FI->getArgumentStackSize());
    MF.getRegInfo().setPhysRegUsed(Reg10);
    MF.getRegInfo().setPhysRegUsed(Reg11);
  } else {
    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
      .addImm(X86FI->getArgumentStackSize());
    BuildMI(allocMBB, DL, TII.get(X86::PUSHi32))
      .addImm(StackSize);
  }

  // __morestack is in libgcc
  if (Is64Bit && MF.getTarget().getCodeModel() == CodeModel::Large) {
    // Under the large code model, we cannot assume that __morestack lives
    // within 2^31 bytes of the call site, so we cannot use pc-relative
    // addressing. We cannot perform the call via a temporary register,
    // as the rax register may be used to store the static chain, and all
    // other suitable registers may be either callee-save or used for
    // parameter passing. We cannot use the stack at this point either
    // because __morestack manipulates the stack directly.
    //
    // To avoid these issues, perform an indirect call via a read-only memory
    // location containing the address.
    //
    // This solution is not perfect, as it assumes that the .rodata section
    // is laid out within 2^31 bytes of each function body, but this seems
    // to be sufficient for JIT.
    BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
        .addReg(X86::RIP)
        .addImm(0)
        .addReg(0)
        .addExternalSymbol("__morestack_addr")
        .addReg(0);
    MF.getMMI().setUsesMorestackAddr(true);
  } else {
    if (Is64Bit)
      BuildMI(allocMBB, DL, TII.get(X86::CALL64pcrel32))
        .addExternalSymbol("__morestack");
    else
      BuildMI(allocMBB, DL, TII.get(X86::CALLpcrel32))
        .addExternalSymbol("__morestack");
  }

  if (IsNested)
    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET_RESTORE_R10));
  else
    BuildMI(allocMBB, DL, TII.get(X86::MORESTACK_RET));

  allocMBB->addSuccessor(&prologueMBB);

  checkMBB->addSuccessor(allocMBB);
  checkMBB->addSuccessor(&prologueMBB);

#ifdef XDEBUG
  MF.verify();
#endif
}

/// Erlang programs may need a special prologue to handle the stack size they
/// might need at runtime. That is because Erlang/OTP does not implement a C
/// stack but uses a custom implementation of hybrid stack/heap architecture.
/// (for more information see Eric Stenman's Ph.D. thesis:
/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
///
/// CheckStack:
///       temp0 = sp - MaxStack
///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
/// OldStart:
///       ...
/// IncStack: /// call inc_stack # doubles the stack space /// temp0 = sp - MaxStack /// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const { const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); MachineFrameInfo *MFI = MF.getFrameInfo(); const unsigned SlotSize = static_cast(MF.getSubtarget().getRegisterInfo()) ->getSlotSize(); const X86Subtarget &STI = MF.getTarget().getSubtarget(); const bool Is64Bit = STI.is64Bit(); const bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL; // HiPE-specific values const unsigned HipeLeafWords = 24; const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; const unsigned Guaranteed = HipeLeafWords * SlotSize; unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ? MF.getFunction()->arg_size() - CCRegisteredArgs : 0; unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize; assert(STI.isTargetLinux() && "HiPE prologue is only supported on Linux operating systems."); // Compute the largest caller's frame that is needed to fit the callees' // frames. This 'MaxStack' is computed from: // // a) the fixed frame size, which is the space needed for all spilled temps, // b) outgoing on-stack parameter areas, and // c) the minimum stack space this function needs to make available for the // functions it calls (a tunable ABI property). if (MFI->hasCalls()) { unsigned MoreStackForCalls = 0; for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end(); MBBI != MBBE; ++MBBI) for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end(); MI != ME; ++MI) { if (!MI->isCall()) continue; // Get callee operand. const MachineOperand &MO = MI->getOperand(0); // Only take account of global function calls (no closures etc.). 
if (!MO.isGlobal()) continue; const Function *F = dyn_cast(MO.getGlobal()); if (!F) continue; // Do not update 'MaxStack' for primitive and built-in functions // (encoded with names either starting with "erlang."/"bif_" or not // having a ".", such as a simple .., or an // "_", such as the BIF "suspend_0") as they are executed on another // stack. if (F->getName().find("erlang.") != StringRef::npos || F->getName().find("bif_") != StringRef::npos || F->getName().find_first_of("._") == StringRef::npos) continue; unsigned CalleeStkArity = F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0; if (HipeLeafWords - 1 > CalleeStkArity) MoreStackForCalls = std::max(MoreStackForCalls, (HipeLeafWords - 1 - CalleeStkArity) * SlotSize); } MaxStack += MoreStackForCalls; } // If the stack frame needed is larger than the guaranteed then runtime checks // and calls to "inc_stack_0" BIF should be inserted in the assembly prologue. if (MaxStack > Guaranteed) { MachineBasicBlock &prologueMBB = MF.front(); MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock(); MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock(); for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(), E = prologueMBB.livein_end(); I != E; I++) { stackCheckMBB->addLiveIn(*I); incStackMBB->addLiveIn(*I); } MF.push_front(incStackMBB); MF.push_front(stackCheckMBB); unsigned ScratchReg, SPReg, PReg, SPLimitOffset; unsigned LEAop, CMPop, CALLop; if (Is64Bit) { SPReg = X86::RSP; PReg = X86::RBP; LEAop = X86::LEA64r; CMPop = X86::CMP64rm; CALLop = X86::CALL64pcrel32; SPLimitOffset = 0x90; } else { SPReg = X86::ESP; PReg = X86::EBP; LEAop = X86::LEA32r; CMPop = X86::CMP32rm; CALLop = X86::CALLpcrel32; SPLimitOffset = 0x4c; } ScratchReg = GetScratchRegister(Is64Bit, IsLP64, MF, true); assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "HiPE prologue scratch register is live-in"); // Create new MBB for StackCheck: addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg), 
SPReg, false, -MaxStack); // SPLimitOffset is in a fixed heap location (pointed by BP). addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_1)).addMBB(&prologueMBB); // Create new MBB for IncStack: BuildMI(incStackMBB, DL, TII.get(CALLop)). addExternalSymbol("inc_stack_0"); addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg), SPReg, false, -MaxStack); addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop)) .addReg(ScratchReg), PReg, false, SPLimitOffset); BuildMI(incStackMBB, DL, TII.get(X86::JLE_1)).addMBB(incStackMBB); stackCheckMBB->addSuccessor(&prologueMBB, 99); stackCheckMBB->addSuccessor(incStackMBB, 1); incStackMBB->addSuccessor(&prologueMBB, 99); incStackMBB->addSuccessor(incStackMBB, 1); } #ifdef XDEBUG MF.verify(); #endif } -bool X86FrameLowering:: -convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, uint64_t Amount) const { - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - const X86RegisterInfo &RegInfo = *static_cast( - MF.getSubtarget().getRegisterInfo()); - unsigned StackPtr = RegInfo.getStackRegister(); - - // Scan the call setup sequence for the pattern we're looking for. - // We only handle a simple case now - a sequence of MOV32mi or MOV32mr - // instructions, that push a sequence of 32-bit values onto the stack, with - // no gaps. - std::map MovMap; - do { - int Opcode = I->getOpcode(); - if (Opcode != X86::MOV32mi && Opcode != X86::MOV32mr) - break; - - // We only want movs of the form: - // movl imm/r32, k(%ecx) - // If we run into something else, bail - // Note that AddrBaseReg may, counterintuitively, not be a register... 
- if (!I->getOperand(X86::AddrBaseReg).isReg() || - (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) || - !I->getOperand(X86::AddrScaleAmt).isImm() || - (I->getOperand(X86::AddrScaleAmt).getImm() != 1) || - (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) || - (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister) || - !I->getOperand(X86::AddrDisp).isImm()) - return false; - - int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm(); - - // We don't want to consider the unaligned case. - if (StackDisp % 4) - return false; - - // If the same stack slot is being filled twice, something's fishy. - if (!MovMap.insert(std::pair(StackDisp, I)).second) - return false; - - ++I; - } while (I != MBB.end()); - - // We now expect the end of the sequence - a call and a stack adjust. - if (I == MBB.end()) - return false; - if (!I->isCall()) - return false; - MachineBasicBlock::iterator Call = I; - if ((++I)->getOpcode() != TII.getCallFrameDestroyOpcode()) - return false; - - // Now, go through the map, and see that we don't have any gaps, - // but only a series of 32-bit MOVs. - // Since std::map provides ordered iteration, the original order - // of the MOVs doesn't matter. - int64_t ExpectedDist = 0; - for (auto MMI = MovMap.begin(), MME = MovMap.end(); MMI != MME; - ++MMI, ExpectedDist += 4) - if (MMI->first != ExpectedDist) - return false; - - // Ok, everything looks fine. Do the transformation. - DebugLoc DL = I->getDebugLoc(); - - // It's possible the original stack adjustment amount was larger than - // that done by the pushes. If so, we still need a SUB. - Amount -= ExpectedDist; - if (Amount) { - MachineInstr* Sub = BuildMI(MBB, Call, DL, - TII.get(getSUBriOpcode(false, Amount)), StackPtr) - .addReg(StackPtr).addImm(Amount); - Sub->getOperand(3).setIsDead(); - } - - // Now, iterate through the map in reverse order, and replace the movs - // with pushes. MOVmi/MOVmr doesn't have any defs, so need to replace uses. 
- for (auto MMI = MovMap.rbegin(), MME = MovMap.rend(); MMI != MME; ++MMI) { - MachineBasicBlock::iterator MOV = MMI->second; - MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); - - // Replace MOVmr with PUSH32r, and MOVmi with PUSHi of appropriate size - int PushOpcode = X86::PUSH32r; - if (MOV->getOpcode() == X86::MOV32mi) - PushOpcode = getPUSHiOpcode(false, PushOp); - - BuildMI(MBB, Call, DL, TII.get(PushOpcode)).addOperand(PushOp); - MBB.erase(MOV); - } - - return true; -} - void X86FrameLowering:: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); const X86RegisterInfo &RegInfo = *static_cast( MF.getSubtarget().getRegisterInfo()); unsigned StackPtr = RegInfo.getStackRegister(); bool reserveCallFrame = hasReservedCallFrame(MF); int Opcode = I->getOpcode(); bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode(); const X86Subtarget &STI = MF.getTarget().getSubtarget(); bool IsLP64 = STI.isTarget64BitLP64(); DebugLoc DL = I->getDebugLoc(); uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0; - uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0; + uint64_t InternalAmt = (isDestroy || Amount) ? I->getOperand(1).getImm() : 0; I = MBB.erase(I); if (!reserveCallFrame) { // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, ' and the // adjcallstackdown instruction into 'add ESP, ' if (Amount == 0) return; // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. 
unsigned StackAlign = MF.getTarget() .getSubtargetImpl() ->getFrameLowering() ->getStackAlignment(); Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign; MachineInstr *New = nullptr; - if (Opcode == TII.getCallFrameSetupOpcode()) { - // Try to convert movs to the stack into pushes. - // We currently only look for a pattern that appears in 32-bit - // calling conventions. - if (!IsLP64 && convertArgMovsToPushes(MF, MBB, I, Amount)) - return; - - New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), - StackPtr) - .addReg(StackPtr) - .addImm(Amount); - } else { - assert(Opcode == TII.getCallFrameDestroyOpcode()); - // Factor out the amount the callee already popped. - Amount -= CalleeAmt; + // Factor out the amount that gets handled inside the sequence + // (Pushes of argument for frame setup, callee pops for frame destroy) + Amount -= InternalAmt; + + if (Amount) { + if (Opcode == TII.getCallFrameSetupOpcode()) { + New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)), StackPtr) + .addReg(StackPtr).addImm(Amount); + } else { + assert(Opcode == TII.getCallFrameDestroyOpcode()); - if (Amount) { unsigned Opc = getADDriOpcode(IsLP64, Amount); New = BuildMI(MF, DL, TII.get(Opc), StackPtr) .addReg(StackPtr).addImm(Amount); } } if (New) { // The EFLAGS implicit def is dead. New->getOperand(3).setIsDead(); // Replace the pseudo instruction with a new instruction. MBB.insert(I, New); } return; } - if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) { + if (Opcode == TII.getCallFrameDestroyOpcode() && InternalAmt) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. We do this until we have // more advanced stack pointer tracking ability. 
- unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt); + unsigned Opc = getSUBriOpcode(IsLP64, InternalAmt); MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr) - .addReg(StackPtr).addImm(CalleeAmt); + .addReg(StackPtr).addImm(InternalAmt); // The EFLAGS implicit def is dead. New->getOperand(3).setIsDead(); // We are not tracking the stack pointer adjustment by the callee, so make // sure we restore the stack pointer immediately after the call, there may // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. MachineBasicBlock::iterator B = MBB.begin(); while (I != B && !std::prev(I)->isCall()) --I; MBB.insert(I, New); } } diff --git a/contrib/llvm/lib/Target/X86/X86FrameLowering.h b/contrib/llvm/lib/Target/X86/X86FrameLowering.h index ee0ee227cad8..9cb887ac112d 100644 --- a/contrib/llvm/lib/Target/X86/X86FrameLowering.h +++ b/contrib/llvm/lib/Target/X86/X86FrameLowering.h @@ -1,93 +1,95 @@ //===-- X86TargetFrameLowering.h - Define frame lowering for X86 -*- C++ -*-==// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This class implements X86-specific bits of TargetFrameLowering class. 
// //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H #define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H #include "llvm/Target/TargetFrameLowering.h" namespace llvm { class MCSymbol; class X86TargetMachine; class X86Subtarget; class X86FrameLowering : public TargetFrameLowering { public: explicit X86FrameLowering(StackDirection D, unsigned StackAl, int LAO) : TargetFrameLowering(StackGrowsDown, StackAl, LAO) {} static void getStackProbeFunction(const X86Subtarget &STI, unsigned &CallOp, const char *&Symbol); void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc DL) const; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. void emitPrologue(MachineFunction &MF) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void adjustForSegmentedStacks(MachineFunction &MF) const override; void adjustForHiPEPrologue(MachineFunction &MF) const override; void processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS = nullptr) const override; bool assignCalleeSavedSpillSlots(MachineFunction &MF, const TargetRegisterInfo *TRI, std::vector &CSI) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const override; bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const std::vector &CSI, const TargetRegisterInfo *TRI) const override; bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; + bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; + bool needsFrameIndexResolution(const MachineFunction &MF) const override; int getFrameIndexOffset(const MachineFunction &MF, int FI) const override; int getFrameIndexReference(const 
MachineFunction &MF, int FI, unsigned &FrameReg) const override; int getFrameIndexOffsetFromSP(const MachineFunction &MF, int FI) const; int getFrameIndexReferenceFromSP(const MachineFunction &MF, int FI, unsigned &FrameReg) const override; void eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; private: /// convertArgMovsToPushes - This method tries to convert a call sequence /// that uses sub and mov instructions to put the argument onto the stack /// into a series of pushes. /// Returns true if the transformation succeeded, false if not. bool convertArgMovsToPushes(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, uint64_t Amount) const; }; } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td index ed0a6346929b..880c982982a1 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/contrib/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1,1848 +1,1852 @@ //===- X86InstrCompiler.td - Compiler Pseudos and Patterns -*- tablegen -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file describes the various pseudo instructions used by the compiler, // as well as Pat patterns used during instruction selection. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // Pattern Matching Support def GetLo32XForm : SDNodeXFormgetZExtValue()); }]>; def GetLo8XForm : SDNodeXFormgetZExtValue()); }]>; //===----------------------------------------------------------------------===// // Random Pseudo Instructions. // PIC base construction. 
This expands to code that looks like this: // call $next_inst // popl %destreg" let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label), "", []>; // ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into // a stack adjustment and the codegen must know that they may modify the stack // pointer before prolog-epilog rewriting occurs. // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. let Defs = [ESP, EFLAGS], Uses = [ESP] in { -def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt), +def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKDOWN", - [(X86callseq_start timm:$amt)]>, + []>, Requires<[NotLP64]>; def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP", [(X86callseq_end timm:$amt1, timm:$amt2)]>, Requires<[NotLP64]>; } +def : Pat<(X86callseq_start timm:$amt1), + (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>; + // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into // a stack adjustment and the codegen must know that they may modify the stack // pointer before prolog-epilog rewriting occurs. // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. let Defs = [RSP, EFLAGS], Uses = [RSP] in { -def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt), +def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKDOWN", - [(X86callseq_start timm:$amt)]>, + []>, Requires<[IsLP64]>; def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP", [(X86callseq_end timm:$amt1, timm:$amt2)]>, Requires<[IsLP64]>; } - +def : Pat<(X86callseq_start timm:$amt1), + (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>; // x86-64 va_start lowering magic. 
let usesCustomInserter = 1, Defs = [EFLAGS] in { def VASTART_SAVE_XMM_REGS : I<0, Pseudo, (outs), (ins GR8:$al, i64imm:$regsavefi, i64imm:$offset, variable_ops), "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset", [(X86vastart_save_xmm_regs GR8:$al, imm:$regsavefi, imm:$offset), (implicit EFLAGS)]>; // The VAARG_64 pseudo-instruction takes the address of the va_list, // and places the address of the next argument into a register. let Defs = [EFLAGS] in def VAARG_64 : I<0, Pseudo, (outs GR64:$dst), (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), "#VAARG_64 $dst, $ap, $size, $mode, $align", [(set GR64:$dst, (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)), (implicit EFLAGS)]>; // Dynamic stack allocation yields a _chkstk or _alloca call for all Windows // targets. These calls are needed to probe the stack when allocating more than // 4k bytes in one go. Touching the stack at 4K increments is necessary to // ensure that the guard pages used by the OS virtual memory manager are // allocated in correct sequence. // The main point of having separate instruction are extra unmodelled effects // (compared to ordinary calls) like stack pointer change. let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in def WIN_ALLOCA : I<0, Pseudo, (outs), (ins), "# dynamic stack allocation", [(X86WinAlloca)]>; // When using segmented stacks these are lowered into instructions which first // check if the current stacklet has enough free memory. If it does, memory is // allocated by bumping the stack pointer. Otherwise memory is allocated from // the heap. 
let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in def SEG_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size), "# variable sized alloca for segmented stacks", [(set GR32:$dst, (X86SegAlloca GR32:$size))]>, Requires<[NotLP64]>; let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size), "# variable sized alloca for segmented stacks", [(set GR64:$dst, (X86SegAlloca GR64:$size))]>, Requires<[In64BitMode]>; } // The MSVC runtime contains an _ftol2 routine for converting floating-point // to integer values. It has a strange calling convention: the input is // popped from the x87 stack, and the return value is given in EDX:EAX. ECX is // used as a temporary register. No other registers (aside from flags) are // touched. // Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80 // variant is unnecessary. let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in { def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src), "# win32 fptoui", [(X86WinFTOL RFP32:$src)]>, Requires<[Not64BitMode]>; def WIN_FTOL_64 : I<0, Pseudo, (outs), (ins RFP64:$src), "# win32 fptoui", [(X86WinFTOL RFP64:$src)]>, Requires<[Not64BitMode]>; } //===----------------------------------------------------------------------===// // EH Pseudo Instructions // let SchedRW = [WriteSystem] in { let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in { def EH_RETURN : I<0xC3, RawFrm, (outs), (ins GR32:$addr), "ret\t#eh_return, addr: $addr", [(X86ehret GR32:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; } let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in { def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr), "ret\t#eh_return, addr: $addr", [(X86ehret GR64:$addr)], IIC_RET>, Sched<[WriteJumpLd]>; } let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in { def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf), 
"#EH_SJLJ_SETJMP32", [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, Requires<[Not64BitMode]>; def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf), "#EH_SJLJ_SETJMP64", [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>, Requires<[In64BitMode]>; let isTerminator = 1 in { def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf), "#EH_SJLJ_LONGJMP32", [(X86eh_sjlj_longjmp addr:$buf)]>, Requires<[Not64BitMode]>; def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf), "#EH_SJLJ_LONGJMP64", [(X86eh_sjlj_longjmp addr:$buf)]>, Requires<[In64BitMode]>; } } } // SchedRW let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst), "#EH_SjLj_Setup\t$dst", []>; } //===----------------------------------------------------------------------===// // Pseudo instructions used by unwind info. // let isPseudo = 1 in { def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg), "#SEH_PushReg $reg", []>; def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), "#SEH_SaveReg $reg, $dst", []>; def SEH_SaveXMM : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), "#SEH_SaveXMM $reg, $dst", []>; def SEH_StackAlloc : I<0, Pseudo, (outs), (ins i32imm:$size), "#SEH_StackAlloc $size", []>; def SEH_SetFrame : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$offset), "#SEH_SetFrame $reg, $offset", []>; def SEH_PushFrame : I<0, Pseudo, (outs), (ins i1imm:$mode), "#SEH_PushFrame $mode", []>; def SEH_EndPrologue : I<0, Pseudo, (outs), (ins), "#SEH_EndPrologue", []>; def SEH_Epilogue : I<0, Pseudo, (outs), (ins), "#SEH_Epilogue", []>; } //===----------------------------------------------------------------------===// // Pseudo instructions used by segmented stacks. // // This is lowered into a RET instruction by MCInstLower. We need // this so that we don't have to have a MachineBasicBlock which ends // with a RET and also has successors. 
let isPseudo = 1 in { def MORESTACK_RET: I<0, Pseudo, (outs), (ins), "", []>; // This instruction is lowered to a RET followed by a MOV. The two // instructions are not generated on a higher level since then the // verifier sees a MachineBasicBlock ending with a non-terminator. def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>; } //===----------------------------------------------------------------------===// // Alias Instructions //===----------------------------------------------------------------------===// // Alias instruction mapping movr0 to xor. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>; // Other widths can also make use of the 32-bit xor, which may have a smaller // encoding and avoid partial register updates. def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>; def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>; def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> { let AddedComplexity = 20; } // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1, hasSideEffects = 0 in def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src), "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>; // This 64-bit pseudo-move can be used for both a 64-bit constant that is // actually the zero-extension of a 32-bit constant, and for labels in the // x86-64 small code model. 
def mov64imm32 : ComplexPattern; let AddedComplexity = 1 in def : Pat<(i64 mov64imm32:$src), (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>; // Use sbb to materialize carry bit. let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in { // FIXME: These are pseudo ops that should be replaced with Pat<> patterns. // However, Pat<> can't replicate the destination reg into the inputs of the // result. def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "", [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "", [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>; } // isCodeGenOnly def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C16r)>; def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C32r)>; def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C64r)>; def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C16r)>; def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C32r)>; def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))), (SETB_C64r)>; // We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and // will be eliminated and that the sbb can be extended up to a wider type. When // this happens, it is great. However, if we are left with an 8-bit sbb and an // and, we might as well just match it as a setb. 
def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), (SETBr)>; // (add OP, SETB) -> (adc OP, 0) def : Pat<(add (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR8:$op), (ADC8ri GR8:$op, 0)>; def : Pat<(add (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR32:$op), (ADC32ri8 GR32:$op, 0)>; def : Pat<(add (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1), GR64:$op), (ADC64ri8 GR64:$op, 0)>; // (sub OP, SETB) -> (sbb OP, 0) def : Pat<(sub GR8:$op, (and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1)), (SBB8ri GR8:$op, 0)>; def : Pat<(sub GR32:$op, (and (i32 (X86setcc_c X86_COND_B, EFLAGS)), 1)), (SBB32ri8 GR32:$op, 0)>; def : Pat<(sub GR64:$op, (and (i64 (X86setcc_c X86_COND_B, EFLAGS)), 1)), (SBB64ri8 GR64:$op, 0)>; // (sub OP, SETCC_CARRY) -> (adc OP, 0) def : Pat<(sub GR8:$op, (i8 (X86setcc_c X86_COND_B, EFLAGS))), (ADC8ri GR8:$op, 0)>; def : Pat<(sub GR32:$op, (i32 (X86setcc_c X86_COND_B, EFLAGS))), (ADC32ri8 GR32:$op, 0)>; def : Pat<(sub GR64:$op, (i64 (X86setcc_c X86_COND_B, EFLAGS))), (ADC64ri8 GR64:$op, 0)>; //===----------------------------------------------------------------------===// // String Pseudo Instructions // let SchedRW = [WriteMicrocoded] in { let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in { def REP_MOVSB_32 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", [(X86rep_movs i8)], IIC_REP_MOVS>, REP, Requires<[Not64BitMode]>; def REP_MOVSW_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", [(X86rep_movs i16)], IIC_REP_MOVS>, REP, OpSize16, Requires<[Not64BitMode]>; def REP_MOVSD_32 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32, Requires<[Not64BitMode]>; } let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI], isCodeGenOnly = 1 in { def REP_MOVSB_64 : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}", [(X86rep_movs i8)], IIC_REP_MOVS>, REP, Requires<[In64BitMode]>; def REP_MOVSW_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}", [(X86rep_movs i16)], 
IIC_REP_MOVS>, REP, OpSize16, Requires<[In64BitMode]>; def REP_MOVSD_64 : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}", [(X86rep_movs i32)], IIC_REP_MOVS>, REP, OpSize32, Requires<[In64BitMode]>; def REP_MOVSQ_64 : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}", [(X86rep_movs i64)], IIC_REP_MOVS>, REP, Requires<[In64BitMode]>; } // FIXME: Should use "(X86rep_stos AL)" as the pattern. let Defs = [ECX,EDI], isCodeGenOnly = 1 in { let Uses = [AL,ECX,EDI] in def REP_STOSB_32 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", [(X86rep_stos i8)], IIC_REP_STOS>, REP, Requires<[Not64BitMode]>; let Uses = [AX,ECX,EDI] in def REP_STOSW_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16, Requires<[Not64BitMode]>; let Uses = [EAX,ECX,EDI] in def REP_STOSD_32 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, Requires<[Not64BitMode]>; } let Defs = [RCX,RDI], isCodeGenOnly = 1 in { let Uses = [AL,RCX,RDI] in def REP_STOSB_64 : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}", [(X86rep_stos i8)], IIC_REP_STOS>, REP, Requires<[In64BitMode]>; let Uses = [AX,RCX,RDI] in def REP_STOSW_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}", [(X86rep_stos i16)], IIC_REP_STOS>, REP, OpSize16, Requires<[In64BitMode]>; let Uses = [RAX,RCX,RDI] in def REP_STOSD_64 : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}", [(X86rep_stos i32)], IIC_REP_STOS>, REP, OpSize32, Requires<[In64BitMode]>; let Uses = [RAX,RCX,RDI] in def REP_STOSQ_64 : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}", [(X86rep_stos i64)], IIC_REP_STOS>, REP, Requires<[In64BitMode]>; } } // SchedRW //===----------------------------------------------------------------------===// // Thread Local Storage Instructions // // ELF TLS Support // All calls clobber the non-callee saved registers. 
ESP is marked as // a use to prevent stack-pointer assignments that appear immediately // before calls from potentially appearing dead. let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], Uses = [ESP] in { def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_addr32", [(X86tlsaddr tls32addr:$sym)]>, Requires<[Not64BitMode]>; def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_base_addr32", [(X86tlsbaseaddr tls32baseaddr:$sym)]>, Requires<[Not64BitMode]>; } // All calls clobber the non-callee saved registers. RSP is marked as // a use to prevent stack-pointer assignments that appear immediately // before calls from potentially appearing dead. let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], Uses = [RSP] in { def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLS_addr64", [(X86tlsaddr tls64addr:$sym)]>, Requires<[In64BitMode]>; def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLS_base_addr64", [(X86tlsbaseaddr tls64baseaddr:$sym)]>, Requires<[In64BitMode]>; } // Darwin TLS Support // For i386, the address of the thunk is passed on the stack, on return the // address of the variable is in %eax. %ecx is trashed during the function // call. All other registers are preserved. let Defs = [EAX, ECX, EFLAGS], Uses = [ESP], usesCustomInserter = 1 in def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLSCall_32", [(X86TLSCall addr:$sym)]>, Requires<[Not64BitMode]>; // For x86_64, the address of the thunk is passed in %rdi, on return // the address of the variable is in %rax. 
// All other registers are preserved.
let Defs = [RAX, EFLAGS],
    Uses = [RSP, RDI],
    usesCustomInserter = 1 in
def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
                  "# TLSCall_64",
                  [(X86TLSCall addr:$sym)]>,
                  Requires<[In64BitMode]>;

//===----------------------------------------------------------------------===//
// Conditional Move Pseudo Instructions

// X86 doesn't have 8-bit conditional moves. Use a customInserter to
// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
// however that requires promoting the operands, and can induce additional
// i8 register pressure.
//
// Every pseudo in this group reads EFLAGS, takes the two candidate values plus
// an i8 condition-code immediate, and is selected for the X86cmov node.
let usesCustomInserter = 1, Uses = [EFLAGS] in {
def CMOV_GR8 : I<0, Pseudo,
                 (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
                 "#CMOV_GR8 PSEUDO!",
                 [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
                                          imm:$cond, EFLAGS))]>;

// 16/32-bit fallbacks, only available under the NoCMov predicate.
let Predicates = [NoCMov] in {
def CMOV_GR32 : I<0, Pseudo,
                    (outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cond),
                    "#CMOV_GR32* PSEUDO!",
                    [(set GR32:$dst,
                      (X86cmov GR32:$src1, GR32:$src2, imm:$cond, EFLAGS))]>;
def CMOV_GR16 : I<0, Pseudo,
                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cond),
                    "#CMOV_GR16* PSEUDO!",
                    [(set GR16:$dst,
                      (X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
} // Predicates = [NoCMov]

// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
// SSE1.
let Predicates = [FPStackf32] in
def CMOV_RFP32 : I<0, Pseudo,
                    (outs RFP32:$dst),
                    (ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
                    "#CMOV_RFP32 PSEUDO!",
                    [(set RFP32:$dst,
                      (X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
                                                  EFLAGS))]>;
// fcmov doesn't handle all possible EFLAGS, provide a fallback if there is no
// SSE2.
let Predicates = [FPStackf64] in
def CMOV_RFP64 : I<0, Pseudo,
                    (outs RFP64:$dst),
                    (ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
                    "#CMOV_RFP64 PSEUDO!",
                    [(set RFP64:$dst,
                      (X86cmov RFP64:$src1, RFP64:$src2, imm:$cond,
                                                  EFLAGS))]>;
// The 80-bit variant carries no extra predicate.
def CMOV_RFP80 : I<0, Pseudo,
                    (outs RFP80:$dst),
                    (ins RFP80:$src1, RFP80:$src2, i8imm:$cond),
                    "#CMOV_RFP80 PSEUDO!",
                    [(set RFP80:$dst,
                      (X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
                                                  EFLAGS))]>;
} // UsesCustomInserter = 1, Uses = [EFLAGS]


//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
//===----------------------------------------------------------------------===//

// FIXME: Use normal instructions and add lock prefix dynamically.

// Memory barriers

// TODO: Get this to fold the constant into the instruction.
// Locked `or` of a GR32 into memory (opcode 0x09, MRMDestMem); it has no
// selection pattern and is restricted to Not64BitMode.
let isCodeGenOnly = 1, Defs = [EFLAGS] in
def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
                      "or{l}\t{$zero, $dst|$dst, $zero}",
                      [], IIC_ALU_MEM>, Requires<[Not64BitMode]>, LOCK,
                    Sched<[WriteALULd, WriteRMW]>;

let hasSideEffects = 1 in
def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
                     "#MEMBARRIER",
                     [(X86MemBarrier)]>, Sched<[WriteLoad]>;

// RegOpc corresponds to the mr version of the instruction
// ImmOpc corresponds to the mi version of the instruction
// ImmOpc8 corresponds to the mi8 version of the instruction
// ImmMod corresponds to the instruction format of the mi and mi8 versions
//
// In every def the low opcode bit is spelled out explicitly: 0 for the 8-bit
// form, 1 for the 16/32/64-bit forms; the upper seven bits come from the
// template argument.
// NOTE(review): the start of the parameter list below appears to have been
// lost (presumably `<bits<8> RegOpc, ...`) -- confirm against the pristine file.
multiclass LOCK_ArithBinOp RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
                           Format ImmMod, string mnemonic> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
    SchedRW = [WriteALULd, WriteRMW] in {

// Register-source forms.
def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                  RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
                  MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
                  !strconcat(mnemonic, "{b}\t",
                             "{$src2, $dst|$dst, $src2}"),
                  [], IIC_ALU_NONMEM>, LOCK;
def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                   MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
                   !strconcat(mnemonic, "{w}\t",
                              "{$src2, $dst|$dst, $src2}"),
                   [], IIC_ALU_NONMEM>, OpSize16, LOCK;
def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                   RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                   MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
                   !strconcat(mnemonic, "{l}\t",
                              "{$src2, $dst|$dst, $src2}"),
                   [], IIC_ALU_NONMEM>, OpSize32, LOCK;
def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
                    RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
                    MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
                    !strconcat(mnemonic, "{q}\t",
                               "{$src2, $dst|$dst, $src2}"),
                    [], IIC_ALU_NONMEM>, LOCK;

// Full-width immediate forms.
def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                    ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
                    ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
                    !strconcat(mnemonic, "{b}\t",
                               "{$src2, $dst|$dst, $src2}"),
                    [], IIC_ALU_MEM>, LOCK;

def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                      ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
                      !strconcat(mnemonic, "{w}\t",
                                 "{$src2, $dst|$dst, $src2}"),
                      [], IIC_ALU_MEM>, OpSize16, LOCK;

def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                      ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                      ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
                      !strconcat(mnemonic, "{l}\t",
                                 "{$src2, $dst|$dst, $src2}"),
                      [], IIC_ALU_MEM>, OpSize32, LOCK;

def NAME#64mi32 : RIi32S<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
                          ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
                          ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
                          !strconcat(mnemonic, "{q}\t",
                                     "{$src2, $dst|$dst, $src2}"),
                          [], IIC_ALU_MEM>, LOCK;

// 8-bit immediate forms for the wider operand sizes (use ImmOpc8).
def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                      ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
                      !strconcat(mnemonic, "{w}\t",
                                 "{$src2, $dst|$dst, $src2}"),
                      [], IIC_ALU_MEM>, OpSize16, LOCK;
def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                      ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                      ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
                      !strconcat(mnemonic, "{l}\t",
                                 "{$src2, $dst|$dst, $src2}"),
                      [], IIC_ALU_MEM>, OpSize32, LOCK;
def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
                       ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
                       ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
                       !strconcat(mnemonic, "{q}\t",
                                  "{$src2, $dst|$dst, $src2}"),
                       [], IIC_ALU_MEM>, LOCK;

}

}

// Arguments: register-form opcode, imm opcode, imm8 opcode, ModRM format for
// the immediate forms, mnemonic.
defm LOCK_ADD : LOCK_ArithBinOp<0x00, 0x80, 0x83, MRM0m, "add">;
defm LOCK_SUB : LOCK_ArithBinOp<0x28, 0x80, 0x83, MRM5m, "sub">;
defm LOCK_OR : LOCK_ArithBinOp<0x08, 0x80, 0x83, MRM1m, "or">;
defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;

// Optimized codegen when the non-memory output is not used.
// NOTE(review): the start of this parameter list and the argument lists of the
// four defs below appear to have been stripped; what remains does not use
// Opc8, Opc, Form or mnemonic at all -- restore from the pristine file.
multiclass LOCK_ArithUnOp Opc8, bits<8> Opc, Format Form,
                          string mnemonic> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
    SchedRW = [WriteALULd, WriteRMW] in {

def NAME#8m : I, LOCK;
def NAME#16m : I, OpSize16, LOCK;
def NAME#32m : I, OpSize32, LOCK;
def NAME#64m : RI, LOCK;
}
}

defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;

// Atomic compare and swap.
// NOTE(review): in the multiclasses of this section the start of each
// parameter list and most def argument lists appear to have been stripped
// (e.g. `def NAME : I, TB, LOCK;`) -- restore them from the pristine file.
multiclass LCMPXCHG_UnOp Opc, Format Form, string mnemonic,
                         SDPatternOperator frag, X86MemOperand x86memop,
                         InstrItinClass itin> {
let isCodeGenOnly = 1 in {
  def NAME : I, TB, LOCK;
}
}

// One def per operand width; each width both reads and writes its own
// accumulator sub-register (AL/AX/EAX/RAX) and defines EFLAGS.
multiclass LCMPXCHG_BinOp Opc8, bits<8> Opc, Format Form,
                          string mnemonic, SDPatternOperator frag,
                          InstrItinClass itin8, InstrItinClass itin> {
let isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in {
  let Defs = [AL, EFLAGS], Uses = [AL] in
  def NAME#8 : I, TB, LOCK;
  let Defs = [AX, EFLAGS], Uses = [AX] in
  def NAME#16 : I, TB, OpSize16, LOCK;
  let Defs = [EAX, EFLAGS], Uses = [EAX] in
  def NAME#32 : I, TB, OpSize32, LOCK;
  let Defs = [RAX, EFLAGS], Uses = [RAX] in
  def NAME#64 : RI, TB, LOCK;
}
}

let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
    SchedRW = [WriteALULd, WriteRMW] in {
defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
                                X86cas8, i64mem,
                                IIC_CMPX_LOCK_8B>;
}

// The 16-byte form is additionally gated on HasCmpxchg16b and carries REX_W.
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
    Predicates = [HasCmpxchg16b], SchedRW = [WriteALULd, WriteRMW] in {
defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
                                 X86cas16, i128mem,
                                 IIC_CMPX_LOCK_16B>, REX_W;
}

defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
                               X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;

// Atomic exchange and add
// `frag` is a string; the per-width fragment name is formed by appending
// "_8" / "_16" / "_32" / "_64" to it.
multiclass ATOMIC_LOAD_BINOP opc8, bits<8> opc, string mnemonic,
                             string frag,
                             InstrItinClass itin8, InstrItinClass itin> {
  let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1,
      SchedRW = [WriteALULd, WriteRMW] in {
    def NAME#8 : I(frag # "_8") addr:$ptr, GR8:$val))],
                    itin8>;
    def NAME#16 : I(frag # "_16") addr:$ptr, GR16:$val))],
                    itin>, OpSize16;
    def NAME#32 : I(frag # "_32") addr:$ptr, GR32:$val))],
                    itin>, OpSize32;
    def NAME#64 : RI(frag # "_64") addr:$ptr, GR64:$val))],
                    itin>;
  }
}

defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
                               IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
             TB, LOCK;

/* The following multiclass tries to make sure that in code like
 *    x.store (immediate op x.load(acquire), release)
 * an operation directly on memory is generated instead of wasting a register.
 * It is not automatic as atomic_store/load are only lowered to MOV instructions
 * extremely late to prevent them from being accidentally reordered in the backend
 * (see below the RELEASE_MOV* / ACQUIRE_MOV* pseudo-instructions)
 */
// NOTE(review): the parameter list (the body uses `op`) and the type argument
// of each `!cast` below appear to have been stripped -- confirm.
multiclass RELEASE_BINOP_MI {
    def NAME#8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
        "#RELEASE_BINOP PSEUDO!",
        [(atomic_store_8 addr:$dst, (!cast(op)
            (atomic_load_8 addr:$dst), (i8 imm:$src)))]>;
    // NAME#16 is not generated as 16-bit arithmetic instructions are considered
    // costly and avoided as far as possible by this backend anyway
    def NAME#32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
        "#RELEASE_BINOP PSEUDO!",
        [(atomic_store_32 addr:$dst, (!cast(op)
            (atomic_load_32 addr:$dst), (i32 imm:$src)))]>;
    def NAME#64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
        "#RELEASE_BINOP PSEUDO!",
        [(atomic_store_64 addr:$dst, (!cast(op)
            (atomic_load_64 addr:$dst), (i64immSExt32:$src)))]>;
}
defm RELEASE_ADD : RELEASE_BINOP_MI<"add">;
defm RELEASE_AND : RELEASE_BINOP_MI<"and">;
defm RELEASE_OR : RELEASE_BINOP_MI<"or">;
defm RELEASE_XOR : RELEASE_BINOP_MI<"xor">;
// Note: we don't deal with sub, because subtractions of constants are
// optimized into additions before this code can run

// Each def stores the result of one width-specific dag back to $dst.
// NOTE(review): the parameter list (the body uses dag8/dag16/dag32/dag64)
// appears to have been stripped -- confirm.
multiclass RELEASE_UNOP {
    def NAME#8m : I<0, Pseudo, (outs), (ins i8mem:$dst),
        "#RELEASE_UNOP PSEUDO!",
        [(atomic_store_8 addr:$dst, dag8)]>;
    def NAME#16m : I<0, Pseudo, (outs), (ins i16mem:$dst),
        "#RELEASE_UNOP PSEUDO!",
        [(atomic_store_16 addr:$dst, dag16)]>;
    def NAME#32m : I<0, Pseudo, (outs), (ins i32mem:$dst),
        "#RELEASE_UNOP PSEUDO!",
        [(atomic_store_32 addr:$dst, dag32)]>;
    def NAME#64m : I<0, Pseudo, (outs), (ins i64mem:$dst),
        "#RELEASE_UNOP PSEUDO!",
        [(atomic_store_64 addr:$dst, dag64)]>;
}

// INC / DEC are expressed as an add of +1 / -1 and require NotSlowIncDec.
defm RELEASE_INC : RELEASE_UNOP<
    (add (atomic_load_8 addr:$dst), (i8 1)),
    (add (atomic_load_16 addr:$dst), (i16 1)),
    (add (atomic_load_32 addr:$dst), (i32 1)),
    (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>;
defm RELEASE_DEC : RELEASE_UNOP<
    (add (atomic_load_8 addr:$dst), (i8 -1)),
    (add (atomic_load_16 addr:$dst), (i16 -1)),
    (add (atomic_load_32 addr:$dst), (i32 -1)),
    (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>;
/*
TODO: These don't work because the type inference of TableGen fails.
TODO: find a way to fix it.
defm RELEASE_NEG : RELEASE_UNOP<
    (ineg (atomic_load_8 addr:$dst)),
    (ineg (atomic_load_16 addr:$dst)),
    (ineg (atomic_load_32 addr:$dst)),
    (ineg (atomic_load_64 addr:$dst))>;
defm RELEASE_NOT : RELEASE_UNOP<
    (not (atomic_load_8 addr:$dst)),
    (not (atomic_load_16 addr:$dst)),
    (not (atomic_load_32 addr:$dst)),
    (not (atomic_load_64 addr:$dst))>;
*/

// Atomic stores of an immediate.
def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
            "#RELEASE_MOV PSEUDO !",
            [(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
            "#RELEASE_MOV PSEUDO !",
            [(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
            "#RELEASE_MOV PSEUDO !",
            [(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
            "#RELEASE_MOV PSEUDO !",
            [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;

// Atomic stores of a register.
def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
                        "#RELEASE_MOV PSEUDO!",
                        [(atomic_store_8 addr:$dst, GR8 :$src)]>;
def RELEASE_MOV16mr : I<0, Pseudo, (outs), (ins i16mem:$dst, GR16:$src),
                        "#RELEASE_MOV PSEUDO!",
                        [(atomic_store_16 addr:$dst, GR16:$src)]>;
def RELEASE_MOV32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, GR32:$src),
                        "#RELEASE_MOV PSEUDO!",
                        [(atomic_store_32 addr:$dst, GR32:$src)]>;
def RELEASE_MOV64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, GR64:$src),
                        "#RELEASE_MOV PSEUDO!",
                        [(atomic_store_64 addr:$dst, GR64:$src)]>;

// Atomic loads into a register.
def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
                      "#ACQUIRE_MOV PSEUDO!",
                      [(set GR8:$dst, (atomic_load_8 addr:$src))]>;
def ACQUIRE_MOV16rm : I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$src),
                      "#ACQUIRE_MOV PSEUDO!",
                      [(set GR16:$dst, (atomic_load_16 addr:$src))]>;
def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
                      "#ACQUIRE_MOV PSEUDO!",
                      [(set GR32:$dst, (atomic_load_32 addr:$src))]>;
def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
                      "#ACQUIRE_MOV PSEUDO!",
                      [(set GR64:$dst, (atomic_load_64 addr:$src))]>;

//===----------------------------------------------------------------------===//
// Conditional Move Pseudo Instructions.
//===----------------------------------------------------------------------===//

// CMOV* - Used to implement the SSE SELECT DAG operation. Expanded after
// instruction selection into a branch sequence.
// Operands are ($t, $f, $cond); the vector variants pin the result type with
// an explicit value-type cast around X86cmov.
let Uses = [EFLAGS], usesCustomInserter = 1 in {
  def CMOV_FR32 : I<0, Pseudo,
                    (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
                    "#CMOV_FR32 PSEUDO!",
                    [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
                                                  EFLAGS))]>;
  def CMOV_FR64 : I<0, Pseudo,
                    (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
                    "#CMOV_FR64 PSEUDO!",
                    [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
                                                  EFLAGS))]>;
  def CMOV_V4F32 : I<0, Pseudo,
                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
                    "#CMOV_V4F32 PSEUDO!",
                    [(set VR128:$dst,
                      (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
                                          EFLAGS)))]>;
  def CMOV_V2F64 : I<0, Pseudo,
                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
                    "#CMOV_V2F64 PSEUDO!",
                    [(set VR128:$dst,
                      (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
                                          EFLAGS)))]>;
  def CMOV_V2I64 : I<0, Pseudo,
                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
                    "#CMOV_V2I64 PSEUDO!",
                    [(set VR128:$dst,
                      (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
                                          EFLAGS)))]>;
  def CMOV_V8F32 : I<0, Pseudo,
                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
                    "#CMOV_V8F32 PSEUDO!",
                    [(set VR256:$dst,
                      (v8f32 (X86cmov VR256:$t, VR256:$f, imm:$cond,
                                          EFLAGS)))]>;
  def CMOV_V4F64 : I<0, Pseudo,
                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
                    "#CMOV_V4F64 PSEUDO!",
                    [(set VR256:$dst,
                      (v4f64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
                                          EFLAGS)))]>;
  def CMOV_V4I64 : I<0, Pseudo,
                    (outs VR256:$dst), (ins VR256:$t, VR256:$f, i8imm:$cond),
                    "#CMOV_V4I64 PSEUDO!",
                    [(set VR256:$dst,
                      (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
                                          EFLAGS)))]>;
  def CMOV_V8I64 : I<0, Pseudo,
                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
                    "#CMOV_V8I64 PSEUDO!",
                    [(set VR512:$dst,
                      (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
                                          EFLAGS)))]>;
  def CMOV_V8F64 : I<0, Pseudo,
                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
                    "#CMOV_V8F64 PSEUDO!",
                    [(set VR512:$dst,
                      (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
                                          EFLAGS)))]>;
  def CMOV_V16F32 : I<0, Pseudo,
                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
                    "#CMOV_V16F32 PSEUDO!",
                    [(set VR512:$dst,
                      (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond,
                                           EFLAGS)))]>;
}


//===----------------------------------------------------------------------===//
// DAG Pattern Matching Rules
//===----------------------------------------------------------------------===//

// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
// A wrapped 32-bit symbolic address is materialized with MOV32ri.
def : Pat<(i32 (X86Wrapper tconstpool :$dst)), (MOV32ri tconstpool :$dst)>;
def : Pat<(i32 (X86Wrapper tjumptable :$dst)), (MOV32ri tjumptable :$dst)>;
def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;

// Adding a wrapped symbolic address folds it into ADD32ri's immediate.
def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
          (ADD32ri GR32:$src1, tconstpool:$src2)>;
def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
          (ADD32ri GR32:$src1, tjumptable:$src2)>;
def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
          (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
          (ADD32ri GR32:$src1, texternalsym:$src2)>;
def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
          (ADD32ri GR32:$src1, tblockaddress:$src2)>;

// Storing a wrapped symbolic address uses MOV32mi with the symbol as immediate.
def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
          (MOV32mi addr:$dst, tglobaladdr:$src)>;
def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
          (MOV32mi addr:$dst, texternalsym:$src)>;
def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
          (MOV32mi addr:$dst, tblockaddress:$src)>;

// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
// code model mode, should use 'movabs'. FIXME: This is really a hack, the
// 'movabs' predicate should handle this sort of thing.
def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
          (MOV64ri tconstpool :$dst)>, Requires<[FarData]>;
def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
          (MOV64ri tjumptable :$dst)>, Requires<[FarData]>;
def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
          (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
          (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
          (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;

// In kernel code model, we can get the address of a label
// into a register with 'movq'. FIXME: This is a hack, the 'imm' predicate of
// the MOV64ri32 should accept these.
def : Pat<(i64 (X86Wrapper tconstpool :$dst)),
          (MOV64ri32 tconstpool :$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper tjumptable :$dst)),
          (MOV64ri32 tjumptable :$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
          (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
          (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
          (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;

// If we have small model and -static mode, it is safe to store global addresses
// directly as immediates.
// FIXME: This is really a hack, the 'imm' predicate
// for MOV64mi32 should handle this sort of thing.
// All five store patterns require both NearData and IsStatic.
def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
          (MOV64mi32 addr:$dst, tconstpool:$src)>,
          Requires<[NearData, IsStatic]>;
def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
          (MOV64mi32 addr:$dst, tjumptable:$src)>,
          Requires<[NearData, IsStatic]>;
def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
          (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
          Requires<[NearData, IsStatic]>;
def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
          (MOV64mi32 addr:$dst, texternalsym:$src)>,
          Requires<[NearData, IsStatic]>;
def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
          (MOV64mi32 addr:$dst, tblockaddress:$src)>,
          Requires<[NearData, IsStatic]>;

// X86RecoverFrameAlloc of an external symbol is a plain move of that symbol.
def : Pat<(i32 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
def : Pat<(i64 (X86RecoverFrameAlloc texternalsym:$dst)), (MOV64ri texternalsym:$dst)>;

// Calls

// tls has some funny stuff here...
// This corresponds to movabs $foo@tpoff, %rax
// NOTE(review): the pattern selects MOV64ri32 (a 32-bit immediate form), while
// the comment above says movabs -- confirm which is intended.
def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
          (MOV64ri32 tglobaltlsaddr :$dst)>;
// This corresponds to add $foo@tpoff, %rax
def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
          (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;


// Direct PC relative function call for small code model. 32-bit displacement
// sign extended to 64-bit.
def : Pat<(X86call (i64 tglobaladdr:$dst)),
          (CALL64pcrel32 tglobaladdr:$dst)>;
def : Pat<(X86call (i64 texternalsym:$dst)),
          (CALL64pcrel32 texternalsym:$dst)>;

// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
// can never use callee-saved registers. That is the purpose of the GR64_TC
// register classes.
//
// The only volatile register that is never used by the calling convention is
// %r11. This happens when calling a vararg function with 6 arguments.
//
// Match an X86tcret that uses less than 7 volatile registers.
// PatFrag wrapping X86tcret whose predicate accepts the node only when at most
// six of its trailing operands (index 3 onwards) are register operands.
def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
                             (X86tcret node:$ptr, node:$off), [{
  // X86tcret args: (*chain, ptr, imm, regs..., glue)
  unsigned NumRegs = 0;
  // Operands 0-2 are chain, ptr and imm, so counting starts at 3. The glue
  // operand is not a register node and is therefore never counted.
  for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
    if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
      return false;
  return true;
}]>;

// 32-bit tail calls: through a register, ...
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
          (TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
          Requires<[Not64BitMode]>;

// ... through memory, ...
// FIXME: This is disabled for 32-bit PIC mode because the global base
// register which is part of the address mode may be assigned a
// callee-saved register.
def : Pat<(X86tcret (load addr:$dst), imm:$off),
          (TCRETURNmi addr:$dst, imm:$off)>,
          Requires<[Not64BitMode, IsNotPIC]>;

// ... or direct to a global / external symbol. The result operand is spelled
// with the same leaf kind as the matched operand, as in the 64-bit patterns
// below.
def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
          (TCRETURNdi tglobaladdr:$dst, imm:$off)>,
          Requires<[Not64BitMode]>;

def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
          (TCRETURNdi texternalsym:$dst, imm:$off)>,
          Requires<[Not64BitMode]>;

// 64-bit tail calls.
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
          (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
          Requires<[In64BitMode]>;

// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
          (TCRETURNmi64 addr:$dst, imm:$off)>,
          Requires<[In64BitMode]>;

def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
          (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
          Requires<[In64BitMode]>;

def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
          (TCRETURNdi64 texternalsym:$dst, imm:$off)>,
          Requires<[In64BitMode]>;

// Normal calls, with various flavors of addresses.
def : Pat<(X86call (i32 tglobaladdr:$dst)),
          (CALLpcrel32 tglobaladdr:$dst)>;
def : Pat<(X86call (i32 texternalsym:$dst)),
          (CALLpcrel32 texternalsym:$dst)>;
// Calls to an immediate address are only selected under CallImmAddr.
def : Pat<(X86call (i32 imm:$dst)),
          (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;

// Comparisons.
// TEST R,R is smaller than CMP R,0
def : Pat<(X86cmp GR8:$src1, 0),
          (TEST8rr GR8:$src1, GR8:$src1)>;
def : Pat<(X86cmp GR16:$src1, 0),
          (TEST16rr GR16:$src1, GR16:$src1)>;
def : Pat<(X86cmp GR32:$src1, 0),
          (TEST32rr GR32:$src1, GR32:$src1)>;
def : Pat<(X86cmp GR64:$src1, 0),
          (TEST64rr GR64:$src1, GR64:$src1)>;

// Conditional moves with folded loads with operands swapped and conditions
// inverted.
// The load is the first X86cmov operand in the matched dag but the second
// (memory) operand of the selected instruction.
// NOTE(review): the multiclass parameter list (the body uses InvertedCond,
// Inst16, Inst32, Inst64) and the argument lists of all sixteen `defm`
// instantiations below appear to have been stripped -- restore them from the
// pristine file.
multiclass CMOVmr {
  let Predicates = [HasCMov] in {
    def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
              (Inst16 GR16:$src2, addr:$src1)>;
    def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
              (Inst32 GR32:$src2, addr:$src1)>;
    def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
              (Inst64 GR64:$src2, addr:$src1)>;
  }
}

defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;
defm : CMOVmr;

// zextload bool -> zextload byte
def : Pat<(zextloadi8i1 addr:$src), (MOV8rm addr:$src)>;
def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
// The 64-bit case reuses the 32-bit zero-extending load and wraps it in
// SUBREG_TO_REG.
def : Pat<(zextloadi64i1 addr:$src),
          (SUBREG_TO_REG (i64 0),
           (MOVZX32rm8 addr:$src), sub_32bit)>;

// extload bool -> extload byte
// When extloading from 16-bit and smaller memory locations into 64-bit
// registers, use zero-extending loads so that the entire 64-bit register is
// defined, avoiding partial-register updates.
// Any-extending loads into 8/16/32-bit results are selected as plain or
// zero-extending loads.
def : Pat<(extloadi8i1 addr:$src), (MOV8rm addr:$src)>;
def : Pat<(extloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
def : Pat<(extloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
def : Pat<(extloadi32i8 addr:$src), (MOVZX32rm8 addr:$src)>;
def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;

// For other extloads, use subregs, since the high contents of the register are
// defined after an extload.
def : Pat<(extloadi64i1 addr:$src),
          (SUBREG_TO_REG (i64 0),
           (MOVZX32rm8 addr:$src), sub_32bit)>;
def : Pat<(extloadi64i8 addr:$src),
          (SUBREG_TO_REG (i64 0),
           (MOVZX32rm8 addr:$src), sub_32bit)>;
def : Pat<(extloadi64i16 addr:$src),
          (SUBREG_TO_REG (i64 0),
           (MOVZX32rm16 addr:$src), sub_32bit)>;
def : Pat<(extloadi64i32 addr:$src),
          (SUBREG_TO_REG (i64 0),
           (MOV32rm addr:$src), sub_32bit)>;

// anyext. Define these to do an explicit zero-extend to
// avoid partial-register updates.
def : Pat<(i16 (anyext GR8 :$src)), (EXTRACT_SUBREG
                                     (MOVZX32rr8 GR8 :$src), sub_16bit)>;
def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8 GR8 :$src)>;

// Except for i16 -> i32 since isel expect i16 ops to be promoted to i32.
// Here the upper half is left undefined (INSERT_SUBREG into IMPLICIT_DEF).
def : Pat<(i32 (anyext GR16:$src)),
          (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;

def : Pat<(i64 (anyext GR8 :$src)),
          (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8 :$src), sub_32bit)>;
def : Pat<(i64 (anyext GR16:$src)),
          (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
def : Pat<(i64 (anyext GR32:$src)),
          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;


// Any instruction that defines a 32-bit result leaves the high half of the
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
// be copying from a truncate. And x86's cmov doesn't do anything if the
// condition is false. But any other 32-bit operation will zero-extend
// up to 64 bits.
// Matches a GR32 value unless its defining node is one of the opcodes listed
// below; those are excluded from the implicit-zero-extension shortcut.
def def32 : PatLeaf<(i32 GR32:$src), [{
  return N->getOpcode() != ISD::TRUNCATE &&
         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
         N->getOpcode() != ISD::CopyFromReg &&
         N->getOpcode() != ISD::AssertSext &&
         N->getOpcode() != X86ISD::CMOV;
}]>;

// In the case of a 32-bit def that is known to implicitly zero-extend,
// we can use a SUBREG_TO_REG.
def : Pat<(i64 (zext def32:$src)),
          (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;

//===----------------------------------------------------------------------===//
// Pattern match OR as ADD
//===----------------------------------------------------------------------===//

// If safe, we prefer to pattern match OR as ADD at isel time. ADD can be
// 3-addressified into an LEA instruction to avoid copies. However, we also
// want to finally emit these instructions as an or at the end of the code
// generator to make the generated code easier to read. To do this, we select
// into "disjoint bits" pseudo ops.

// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero.
def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
  // Constant right-hand side: every bit set in the constant must be known
  // zero in the left-hand operand.
  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());

  // General case: each bit position must be known zero in at least one of the
  // two operands, i.e. no bit may be possibly-set in both.
  APInt KnownZero0, KnownOne0;
  CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
  APInt KnownZero1, KnownOne1;
  CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
  return (~KnownZero0 & ~KnownZero1) == 0;
}]>;


// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
// Try this before the selecting to OR.
// "Disjoint bits" add pseudos: opcode 0, Pseudo format, empty asm string,
// selected for or_is_add. AddedComplexity = 5 is applied to all of them.
let AddedComplexity = 5, SchedRW = [WriteALU] in {

// Two-address ($src1 tied to $dst), convertible to three-address form, and
// defining EFLAGS.
let isConvertibleToThreeAddress = 1,
    Constraints = "$src1 = $dst", Defs = [EFLAGS] in {
let isCommutable = 1 in {
def ADD16rr_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
                    "", // orw/addw REG, REG
                    [(set GR16:$dst, (or_is_add GR16:$src1, GR16:$src2))]>;
def ADD32rr_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
                    "", // orl/addl REG, REG
                    [(set GR32:$dst, (or_is_add GR32:$src1, GR32:$src2))]>;
def ADD64rr_DB : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                    "", // orq/addq REG, REG
                    [(set GR64:$dst, (or_is_add GR64:$src1, GR64:$src2))]>;
} // isCommutable

// NOTE: These are order specific, we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.

def ADD16ri8_DB : I<0, Pseudo,
                    (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
                    "", // orw/addw REG, imm8
                    [(set GR16:$dst,(or_is_add GR16:$src1,i16immSExt8:$src2))]>;
def ADD16ri_DB : I<0, Pseudo, (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
                    "", // orw/addw REG, imm
                    [(set GR16:$dst, (or_is_add GR16:$src1, imm:$src2))]>;

def ADD32ri8_DB : I<0, Pseudo,
                    (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
                    "", // orl/addl REG, imm8
                    [(set GR32:$dst,(or_is_add GR32:$src1,i32immSExt8:$src2))]>;
def ADD32ri_DB : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
                    "", // orl/addl REG, imm
                    [(set GR32:$dst, (or_is_add GR32:$src1, imm:$src2))]>;


def ADD64ri8_DB : I<0, Pseudo,
                    (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
                    "", // orq/addq REG, imm8
                    [(set GR64:$dst, (or_is_add GR64:$src1,
                                                i64immSExt8:$src2))]>;
def ADD64ri32_DB : I<0, Pseudo,
                     (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
                     "", // orq/addq REG, imm
                     [(set GR64:$dst, (or_is_add GR64:$src1,
                                                 i64immSExt32:$src2))]>;
}
} // AddedComplexity, SchedRW


//===----------------------------------------------------------------------===//
// Some peepholes
//===----------------------------------------------------------------------===//

// Odd encoding trick: -128 fits into an 8-bit immediate field while
// +128 doesn't, so in this special case use a sub instead of an add.
// Each register pattern is paired with the matching load-add-store pattern.
def : Pat<(add GR16:$src1, 128),
          (SUB16ri8 GR16:$src1, -128)>;
def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
          (SUB16mi8 addr:$dst, -128)>;

def : Pat<(add GR32:$src1, 128),
          (SUB32ri8 GR32:$src1, -128)>;
def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
          (SUB32mi8 addr:$dst, -128)>;

def : Pat<(add GR64:$src1, 128),
          (SUB64ri8 GR64:$src1, -128)>;
def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
          (SUB64mi8 addr:$dst, -128)>;

// The same trick applies for 32-bit immediate fields in 64-bit
// instructions.
// Both patterns must match exactly +2^31 (0x80000000, written here with 16 hex
// digits), because the replacement subtracts the sign-extended value -2^31.
// One digit more would make the matched constant 2^35 and the rewrite wrong.
def : Pat<(add GR64:$src1, 0x0000000080000000),
          (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
          (SUB64mi32 addr:$dst, 0xffffffff80000000)>;

// To avoid needing to materialize an immediate in a register, use a 32-bit and
// with implicit zero-extension instead of a 64-bit and if the immediate has at
// least 32 bits of leading zeros. If in addition the last 32 bits can be
// represented with a sign extension of a 8 bit constant, use that.
// The imm8 form is listed before the general imm32 form.
def : Pat<(and GR64:$src, i64immZExt32SExt8:$imm),
          (SUBREG_TO_REG
            (i64 0),
            (AND32ri8
              (EXTRACT_SUBREG GR64:$src, sub_32bit),
              (i32 (GetLo8XForm imm:$imm))),
            sub_32bit)>;

def : Pat<(and GR64:$src, i64immZExt32:$imm),
          (SUBREG_TO_REG
            (i64 0),
            (AND32ri
              (EXTRACT_SUBREG GR64:$src, sub_32bit),
              (i32 (GetLo32XForm imm:$imm))),
            sub_32bit)>;


// r & (2^16-1) ==> movz
def : Pat<(and GR32:$src1, 0xffff),
          (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
// r & (2^8-1) ==> movz
// Outside 64-bit mode the source is first copied into the *_ABCD register
// class before its 8-bit sub-register is extracted.
def : Pat<(and GR32:$src1, 0xff),
          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
                                                             GR32_ABCD)),
                                      sub_8bit))>,
      Requires<[Not64BitMode]>;
// r & (2^8-1) ==> movz
def : Pat<(and GR16:$src1, 0xff),
           (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
            (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
             sub_16bit)>,
      Requires<[Not64BitMode]>;

// r & (2^32-1) ==> movz
def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
          (SUBREG_TO_REG (i64 0),
                         (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
                         sub_32bit)>;
// r & (2^16-1) ==> movz
def : Pat<(and GR64:$src, 0xffff),
          (SUBREG_TO_REG (i64 0),
                      (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
                      sub_32bit)>;
// r & (2^8-1) ==> movz
def : Pat<(and GR64:$src, 0xff),
          (SUBREG_TO_REG (i64 0),
                         (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
                         sub_32bit)>;
// r & (2^8-1) ==> movz
// In 64-bit mode the 8-bit sub-register is extracted directly, with no
// register-class copy.
def : Pat<(and GR32:$src1, 0xff),
           (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
      Requires<[In64BitMode]>;
// r & (2^8-1) ==> movz
def : Pat<(and GR16:$src1, 0xff),
           (EXTRACT_SUBREG (MOVZX32rr8 (i8
            (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
      Requires<[In64BitMode]>;


// sext_inreg patterns
def : Pat<(sext_inreg GR32:$src, i16),
          (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
def : Pat<(sext_inreg GR32:$src, i8),
          (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
                                                             GR32_ABCD)),
                                      sub_8bit))>,
      Requires<[Not64BitMode]>;

// NOTE(review): the COPY_TO_REGCLASS below is annotated i32 although its source
// is GR16 and its class is GR16_ABCD; the sibling GR16 patterns in this section
// annotate it i16 -- confirm whether this is intentional.
def : Pat<(sext_inreg GR16:$src, i8),
           (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
            (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
             sub_16bit)>,
      Requires<[Not64BitMode]>;

def : Pat<(sext_inreg GR64:$src, i32),
          (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
def : Pat<(sext_inreg GR64:$src, i16),
          (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
def : Pat<(sext_inreg GR64:$src, i8),
          (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
def : Pat<(sext_inreg GR32:$src, i8),
          (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
      Requires<[In64BitMode]>;
def : Pat<(sext_inreg GR16:$src, i8),
           (EXTRACT_SUBREG (MOVSX32rr8
            (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
      Requires<[In64BitMode]>;

// sext, sext_load, zext, zext_load
// i8 -> i16 extensions are done with the 32-bit extend and then narrowed to
// the 16-bit sub-register.
def: Pat<(i16 (sext GR8:$src)),
          (EXTRACT_SUBREG (MOVSX32rr8 GR8:$src), sub_16bit)>;
def: Pat<(sextloadi16i8 addr:$src),
          (EXTRACT_SUBREG (MOVSX32rm8 addr:$src), sub_16bit)>;
def: Pat<(i16 (zext GR8:$src)),
          (EXTRACT_SUBREG (MOVZX32rr8 GR8:$src), sub_16bit)>;
def: Pat<(zextloadi16i8 addr:$src),
          (EXTRACT_SUBREG (MOVZX32rm8 addr:$src), sub_16bit)>;

// trunc patterns
def : Pat<(i16 (trunc GR32:$src)),
          (EXTRACT_SUBREG GR32:$src, sub_16bit)>;
def : Pat<(i8 (trunc GR32:$src)),
          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
                          sub_8bit)>,
      Requires<[Not64BitMode]>;
def : Pat<(i8 (trunc GR16:$src)),
          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                          sub_8bit)>,
      Requires<[Not64BitMode]>;
def : Pat<(i32 (trunc GR64:$src)),
          (EXTRACT_SUBREG GR64:$src, sub_32bit)>;
def : Pat<(i16 (trunc GR64:$src)),
          (EXTRACT_SUBREG GR64:$src, sub_16bit)>;
def : Pat<(i8 (trunc GR64:$src)),
          (EXTRACT_SUBREG GR64:$src, sub_8bit)>;
def : Pat<(i8 (trunc GR32:$src)),
          (EXTRACT_SUBREG GR32:$src, sub_8bit)>,
      Requires<[In64BitMode]>;
def : Pat<(i8 (trunc GR16:$src)),
          (EXTRACT_SUBREG GR16:$src, sub_8bit)>,
      Requires<[In64BitMode]>;

// h-register tricks
// A right shift by 8 followed by a truncate / extend / mask is selected as an
// extract of the sub_8bit_hi sub-register of an *_ABCD register.
def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                          sub_8bit_hi)>,
      Requires<[Not64BitMode]>;
def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
                          sub_8bit_hi)>,
      Requires<[Not64BitMode]>;
def : Pat<(srl GR16:$src, (i8 8)),
          (EXTRACT_SUBREG
            (MOVZX32rr8
              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                              sub_8bit_hi)),
            sub_16bit)>,
      Requires<[Not64BitMode]>;
def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
                                                             GR16_ABCD)),
                                      sub_8bit_hi))>,
      Requires<[Not64BitMode]>;
def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
                                                             GR16_ABCD)),
                                      sub_8bit_hi))>,
      Requires<[Not64BitMode]>;
def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
                                                             GR32_ABCD)),
                                      sub_8bit_hi))>,
      Requires<[Not64BitMode]>;
def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
                                                             GR32_ABCD)),
                                      sub_8bit_hi))>,
      Requires<[Not64BitMode]>;

// h-register tricks.
// For now, be conservative on x86-64 and use an h-register extract only if the
// value is immediately zero-extended or stored, which are somewhat common
// cases. This uses a bunch of code to prevent a register requiring a REX prefix
// from being allocated in the same instruction as the h register, as there's
// currently no way to describe this requirement to the register allocator.

// h-register extract and zero-extend.
// x86-64 selection patterns for "take bits 15:8 of a register and widen them".
// Every pattern below has the same shape on the output side: the source is
// copied into the matching *_ABCD register class, its sub_8bit_hi subregister
// is extracted, and MOVZX32_NOREXrr8 zero-extends that byte to 32 bits.
// NOTE(review): the *_ABCD constraint and the _NOREX opcode presumably exist
// because the high-byte registers cannot be encoded together with a REX
// prefix -- confirm against the register class definitions.

// (x >> 8) & 255 on a 64-bit value. The 32-bit MOVZX result is wrapped with
// SUBREG_TO_REG ... sub_32bit to form the 64-bit result. This pattern carries
// no Requires<> predicate.
def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
          (SUBREG_TO_REG
            (i64 0),
            (MOVZX32_NOREXrr8
              (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
                              sub_8bit_hi)),
            sub_32bit)>;

// (x >> 8) & 255 on a 32-bit value, 64-bit mode only.
def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
          (MOVZX32_NOREXrr8
            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
                            sub_8bit_hi))>,
      Requires<[In64BitMode]>;

// Same value written as (x & 0xff00) >> 8, 64-bit mode only.
def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
          (MOVZX32_NOREXrr8
            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
                            sub_8bit_hi))>,
      Requires<[In64BitMode]>;

// 16-bit x >> 8: zero-extend the high byte to 32 bits, then keep only the
// low 16 bits (sub_16bit) of that result.
def : Pat<(srl GR16:$src, (i8 8)),
          (EXTRACT_SUBREG
            (MOVZX32_NOREXrr8
              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                              sub_8bit_hi)),
            sub_16bit)>,
      Requires<[In64BitMode]>;

// zext / anyext of (16-bit x >> 8) to i32. Both extensions select the same
// zero-extending instruction.
def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
          (MOVZX32_NOREXrr8
            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                            sub_8bit_hi))>,
      Requires<[In64BitMode]>;
def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
          (MOVZX32_NOREXrr8
            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                            sub_8bit_hi))>,
      Requires<[In64BitMode]>;

// zext / anyext of (16-bit x >> 8) to i64. Same 32-bit MOVZX, wrapped with
// SUBREG_TO_REG ... sub_32bit; no Requires<> predicate on these two.
def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
          (SUBREG_TO_REG
            (i64 0),
            (MOVZX32_NOREXrr8
              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                              sub_8bit_hi)),
            sub_32bit)>;
def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
          (SUBREG_TO_REG
            (i64 0),
            (MOVZX32_NOREXrr8
              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                              sub_8bit_hi)),
            sub_32bit)>;

// h-register extract and store.
// Store of bits 15:8 of a 64/32/16-bit register: the source is constrained to
// the matching *_ABCD register class and its sub_8bit_hi subregister is
// stored directly with MOV8mr_NOREX. The GR64 form has no Requires<>
// predicate; the GR32 and GR16 forms are limited to In64BitMode.
def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
          (MOV8mr_NOREX
            addr:$dst,
            (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
                            sub_8bit_hi))>;
def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
          (MOV8mr_NOREX
            addr:$dst,
            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
                            sub_8bit_hi))>,
      Requires<[In64BitMode]>;
def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
          (MOV8mr_NOREX
            addr:$dst,
            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
                            sub_8bit_hi))>,
      Requires<[In64BitMode]>;

// (shl x, 1) ==> (add x, x)
// Note that if x is undef (immediate or otherwise), we could theoretically
// end up with the two uses of x getting different values, producing a result
// where the least significant bit is not 0. However, the probability of this
// happening is considered low enough that this is officially not a
// "real problem".
// One pattern per register width; the same operand is passed twice to ADD.
def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr GR8 :$src1, GR8 :$src1)>;
def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;

// Helper imms that check if a mask doesn't change significant shift bits.
// NOTE(review): the text between "ImmLeaf" and "= 5" / "= 6" appears to have
// been lost in this copy (template arguments and the start of the predicate
// body). Only the thresholds 5 and 6 are visible; restore the full
// definitions from the original file before building.
def immShift32 : ImmLeaf= 5; }]>;
def immShift64 : ImmLeaf= 6; }]>;

// Shift amount is implicitly masked.
// MaskedShiftAmountPats - for one shift/rotate fragment `frag`, select the
// plain "<name><width>rCL" / "<name><width>mCL" instruction even when the
// count in CL has been and-ed with a mask immediate, i.e. the explicit mask
// is dropped. 8/16/32-bit forms accept any immShift32 mask, 64-bit forms any
// immShift64 mask (both leaves are defined elsewhere in this file).
// NOTE(review): the multiclass parameter list, the !cast target type and the
// arguments of the five defm lines look truncated in this copy (the body
// uses `frag` and `name`); restore them from the original file.
multiclass MaskedShiftAmountPats {
  // (shift x (and y, 31)) ==> (shift x, y)
  // Register destination.
  def : Pat<(frag GR8:$src1, (and CL, immShift32)),
            (!cast(name # "8rCL") GR8:$src1)>;
  def : Pat<(frag GR16:$src1, (and CL, immShift32)),
            (!cast(name # "16rCL") GR16:$src1)>;
  def : Pat<(frag GR32:$src1, (and CL, immShift32)),
            (!cast(name # "32rCL") GR32:$src1)>;
  // Read-modify-write on memory: load, shift, store back to the same address.
  def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift32)), addr:$dst),
            (!cast(name # "8mCL") addr:$dst)>;
  def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift32)), addr:$dst),
            (!cast(name # "16mCL") addr:$dst)>;
  def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
            (!cast(name # "32mCL") addr:$dst)>;

  // (shift x (and y, 63)) ==> (shift x, y)
  def : Pat<(frag GR64:$src1, (and CL, immShift64)),
            (!cast(name # "64rCL") GR64:$src1)>;
  // The memory form uses immShift64 like the register form above, so every
  // mask accepted for 64rCL is also folded for 64mCL instead of only the
  // literal 63.
  def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
            (!cast(name # "64mCL") addr:$dst)>;
}

defm : MaskedShiftAmountPats;
defm : MaskedShiftAmountPats;
defm : MaskedShiftAmountPats;
defm : MaskedShiftAmountPats;
defm : MaskedShiftAmountPats;

// (anyext (setcc_carry)) -> (setcc_carry)
def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
          (SETB_C16r)>;
def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
          (SETB_C32r)>;
def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
          (SETB_C32r)>;

//===----------------------------------------------------------------------===//
// EFLAGS-defining Patterns
//===----------------------------------------------------------------------===//

// add reg, reg
def : Pat<(add GR8 :$src1, GR8 :$src2), (ADD8rr GR8 :$src1, GR8 :$src2)>;
def : Pat<(add GR16:$src1, GR16:$src2), (ADD16rr GR16:$src1, GR16:$src2)>;
def : Pat<(add GR32:$src1, GR32:$src2), (ADD32rr GR32:$src1, GR32:$src2)>;

// add reg, mem
def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
          (ADD8rm GR8:$src1, addr:$src2)>;
def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
          (ADD16rm GR16:$src1, addr:$src2)>;
def : Pat<(add GR32:$src1, (loadi32
addr:$src2)), (ADD32rm GR32:$src1, addr:$src2)>; // add reg, imm def : Pat<(add GR8 :$src1, imm:$src2), (ADD8ri GR8:$src1 , imm:$src2)>; def : Pat<(add GR16:$src1, imm:$src2), (ADD16ri GR16:$src1, imm:$src2)>; def : Pat<(add GR32:$src1, imm:$src2), (ADD32ri GR32:$src1, imm:$src2)>; def : Pat<(add GR16:$src1, i16immSExt8:$src2), (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>; def : Pat<(add GR32:$src1, i32immSExt8:$src2), (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>; // sub reg, reg def : Pat<(sub GR8 :$src1, GR8 :$src2), (SUB8rr GR8 :$src1, GR8 :$src2)>; def : Pat<(sub GR16:$src1, GR16:$src2), (SUB16rr GR16:$src1, GR16:$src2)>; def : Pat<(sub GR32:$src1, GR32:$src2), (SUB32rr GR32:$src1, GR32:$src2)>; // sub reg, mem def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)), (SUB8rm GR8:$src1, addr:$src2)>; def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)), (SUB16rm GR16:$src1, addr:$src2)>; def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)), (SUB32rm GR32:$src1, addr:$src2)>; // sub reg, imm def : Pat<(sub GR8:$src1, imm:$src2), (SUB8ri GR8:$src1, imm:$src2)>; def : Pat<(sub GR16:$src1, imm:$src2), (SUB16ri GR16:$src1, imm:$src2)>; def : Pat<(sub GR32:$src1, imm:$src2), (SUB32ri GR32:$src1, imm:$src2)>; def : Pat<(sub GR16:$src1, i16immSExt8:$src2), (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>; def : Pat<(sub GR32:$src1, i32immSExt8:$src2), (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>; // sub 0, reg def : Pat<(X86sub_flag 0, GR8 :$src), (NEG8r GR8 :$src)>; def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>; def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>; def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>; // mul reg, reg def : Pat<(mul GR16:$src1, GR16:$src2), (IMUL16rr GR16:$src1, GR16:$src2)>; def : Pat<(mul GR32:$src1, GR32:$src2), (IMUL32rr GR32:$src1, GR32:$src2)>; // mul reg, mem def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)), (IMUL16rm GR16:$src1, addr:$src2)>; def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)), (IMUL32rm GR32:$src1, addr:$src2)>; // mul reg, 
imm def : Pat<(mul GR16:$src1, imm:$src2), (IMUL16rri GR16:$src1, imm:$src2)>; def : Pat<(mul GR32:$src1, imm:$src2), (IMUL32rri GR32:$src1, imm:$src2)>; def : Pat<(mul GR16:$src1, i16immSExt8:$src2), (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>; def : Pat<(mul GR32:$src1, i32immSExt8:$src2), (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>; // reg = mul mem, imm def : Pat<(mul (loadi16 addr:$src1), imm:$src2), (IMUL16rmi addr:$src1, imm:$src2)>; def : Pat<(mul (loadi32 addr:$src1), imm:$src2), (IMUL32rmi addr:$src1, imm:$src2)>; def : Pat<(mul (loadi16 addr:$src1), i16immSExt8:$src2), (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>; def : Pat<(mul (loadi32 addr:$src1), i32immSExt8:$src2), (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>; // Patterns for nodes that do not produce flags, for instructions that do. // addition def : Pat<(add GR64:$src1, GR64:$src2), (ADD64rr GR64:$src1, GR64:$src2)>; def : Pat<(add GR64:$src1, i64immSExt8:$src2), (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>; def : Pat<(add GR64:$src1, i64immSExt32:$src2), (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>; def : Pat<(add GR64:$src1, (loadi64 addr:$src2)), (ADD64rm GR64:$src1, addr:$src2)>; // subtraction def : Pat<(sub GR64:$src1, GR64:$src2), (SUB64rr GR64:$src1, GR64:$src2)>; def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)), (SUB64rm GR64:$src1, addr:$src2)>; def : Pat<(sub GR64:$src1, i64immSExt8:$src2), (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>; def : Pat<(sub GR64:$src1, i64immSExt32:$src2), (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>; // Multiply def : Pat<(mul GR64:$src1, GR64:$src2), (IMUL64rr GR64:$src1, GR64:$src2)>; def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)), (IMUL64rm GR64:$src1, addr:$src2)>; def : Pat<(mul GR64:$src1, i64immSExt8:$src2), (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>; def : Pat<(mul GR64:$src1, i64immSExt32:$src2), (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>; def : Pat<(mul (loadi64 addr:$src1), i64immSExt8:$src2), (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>; def : Pat<(mul 
(loadi64 addr:$src1), i64immSExt32:$src2), (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>; // Increment/Decrement reg. // Do not make INC/DEC if it is slow let Predicates = [NotSlowIncDec] in { def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>; def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>; def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>; def : Pat<(add GR64:$src, 1), (INC64r GR64:$src)>; def : Pat<(add GR8:$src, -1), (DEC8r GR8:$src)>; def : Pat<(add GR16:$src, -1), (DEC16r GR16:$src)>; def : Pat<(add GR32:$src, -1), (DEC32r GR32:$src)>; def : Pat<(add GR64:$src, -1), (DEC64r GR64:$src)>; } // or reg/reg. def : Pat<(or GR8 :$src1, GR8 :$src2), (OR8rr GR8 :$src1, GR8 :$src2)>; def : Pat<(or GR16:$src1, GR16:$src2), (OR16rr GR16:$src1, GR16:$src2)>; def : Pat<(or GR32:$src1, GR32:$src2), (OR32rr GR32:$src1, GR32:$src2)>; def : Pat<(or GR64:$src1, GR64:$src2), (OR64rr GR64:$src1, GR64:$src2)>; // or reg/mem def : Pat<(or GR8:$src1, (loadi8 addr:$src2)), (OR8rm GR8:$src1, addr:$src2)>; def : Pat<(or GR16:$src1, (loadi16 addr:$src2)), (OR16rm GR16:$src1, addr:$src2)>; def : Pat<(or GR32:$src1, (loadi32 addr:$src2)), (OR32rm GR32:$src1, addr:$src2)>; def : Pat<(or GR64:$src1, (loadi64 addr:$src2)), (OR64rm GR64:$src1, addr:$src2)>; // or reg/imm def : Pat<(or GR8:$src1 , imm:$src2), (OR8ri GR8 :$src1, imm:$src2)>; def : Pat<(or GR16:$src1, imm:$src2), (OR16ri GR16:$src1, imm:$src2)>; def : Pat<(or GR32:$src1, imm:$src2), (OR32ri GR32:$src1, imm:$src2)>; def : Pat<(or GR16:$src1, i16immSExt8:$src2), (OR16ri8 GR16:$src1, i16immSExt8:$src2)>; def : Pat<(or GR32:$src1, i32immSExt8:$src2), (OR32ri8 GR32:$src1, i32immSExt8:$src2)>; def : Pat<(or GR64:$src1, i64immSExt8:$src2), (OR64ri8 GR64:$src1, i64immSExt8:$src2)>; def : Pat<(or GR64:$src1, i64immSExt32:$src2), (OR64ri32 GR64:$src1, i64immSExt32:$src2)>; // xor reg/reg def : Pat<(xor GR8 :$src1, GR8 :$src2), (XOR8rr GR8 :$src1, GR8 :$src2)>; def : Pat<(xor GR16:$src1, GR16:$src2), (XOR16rr GR16:$src1, GR16:$src2)>; def : 
Pat<(xor GR32:$src1, GR32:$src2), (XOR32rr GR32:$src1, GR32:$src2)>; def : Pat<(xor GR64:$src1, GR64:$src2), (XOR64rr GR64:$src1, GR64:$src2)>; // xor reg/mem def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)), (XOR8rm GR8:$src1, addr:$src2)>; def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)), (XOR16rm GR16:$src1, addr:$src2)>; def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)), (XOR32rm GR32:$src1, addr:$src2)>; def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)), (XOR64rm GR64:$src1, addr:$src2)>; // xor reg/imm def : Pat<(xor GR8:$src1, imm:$src2), (XOR8ri GR8:$src1, imm:$src2)>; def : Pat<(xor GR16:$src1, imm:$src2), (XOR16ri GR16:$src1, imm:$src2)>; def : Pat<(xor GR32:$src1, imm:$src2), (XOR32ri GR32:$src1, imm:$src2)>; def : Pat<(xor GR16:$src1, i16immSExt8:$src2), (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>; def : Pat<(xor GR32:$src1, i32immSExt8:$src2), (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>; def : Pat<(xor GR64:$src1, i64immSExt8:$src2), (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>; def : Pat<(xor GR64:$src1, i64immSExt32:$src2), (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>; // and reg/reg def : Pat<(and GR8 :$src1, GR8 :$src2), (AND8rr GR8 :$src1, GR8 :$src2)>; def : Pat<(and GR16:$src1, GR16:$src2), (AND16rr GR16:$src1, GR16:$src2)>; def : Pat<(and GR32:$src1, GR32:$src2), (AND32rr GR32:$src1, GR32:$src2)>; def : Pat<(and GR64:$src1, GR64:$src2), (AND64rr GR64:$src1, GR64:$src2)>; // and reg/mem def : Pat<(and GR8:$src1, (loadi8 addr:$src2)), (AND8rm GR8:$src1, addr:$src2)>; def : Pat<(and GR16:$src1, (loadi16 addr:$src2)), (AND16rm GR16:$src1, addr:$src2)>; def : Pat<(and GR32:$src1, (loadi32 addr:$src2)), (AND32rm GR32:$src1, addr:$src2)>; def : Pat<(and GR64:$src1, (loadi64 addr:$src2)), (AND64rm GR64:$src1, addr:$src2)>; // and reg/imm def : Pat<(and GR8:$src1, imm:$src2), (AND8ri GR8:$src1, imm:$src2)>; def : Pat<(and GR16:$src1, imm:$src2), (AND16ri GR16:$src1, imm:$src2)>; def : Pat<(and GR32:$src1, imm:$src2), (AND32ri GR32:$src1, imm:$src2)>; def : Pat<(and 
GR16:$src1, i16immSExt8:$src2), (AND16ri8 GR16:$src1, i16immSExt8:$src2)>; def : Pat<(and GR32:$src1, i32immSExt8:$src2), (AND32ri8 GR32:$src1, i32immSExt8:$src2)>; def : Pat<(and GR64:$src1, i64immSExt8:$src2), (AND64ri8 GR64:$src1, i64immSExt8:$src2)>; def : Pat<(and GR64:$src1, i64immSExt32:$src2), (AND64ri32 GR64:$src1, i64immSExt32:$src2)>; // Bit scan instruction patterns to match explicit zero-undef behavior. def : Pat<(cttz_zero_undef GR16:$src), (BSF16rr GR16:$src)>; def : Pat<(cttz_zero_undef GR32:$src), (BSF32rr GR32:$src)>; def : Pat<(cttz_zero_undef GR64:$src), (BSF64rr GR64:$src)>; def : Pat<(cttz_zero_undef (loadi16 addr:$src)), (BSF16rm addr:$src)>; def : Pat<(cttz_zero_undef (loadi32 addr:$src)), (BSF32rm addr:$src)>; def : Pat<(cttz_zero_undef (loadi64 addr:$src)), (BSF64rm addr:$src)>; // When HasMOVBE is enabled it is possible to get a non-legalized // register-register 16 bit bswap. This maps it to a ROL instruction. let Predicates = [HasMOVBE] in { def : Pat<(bswap GR16:$src), (ROL16ri GR16:$src, (i8 8))>; } diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp index 461569345a11..6b6b8aedc9c6 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1,5922 +1,5974 @@ //===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains the X86 implementation of the TargetInstrInfo class. 
// //===----------------------------------------------------------------------===// #include "X86InstrInfo.h" #include "X86.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/StackMaps.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" #include using namespace llvm; #define DEBUG_TYPE "x86-instr-info" #define GET_INSTRINFO_CTOR_DTOR #include "X86GenInstrInfo.inc" static cl::opt NoFusing("disable-spill-fusing", cl::desc("Disable fusing of spill code into instructions")); static cl::opt PrintFailedFusing("print-failed-fuse-candidates", cl::desc("Print instructions that the allocator wants to" " fuse, but the X86 backend currently can't"), cl::Hidden); static cl::opt ReMatPICStubLoad("remat-pic-stub-load", cl::desc("Re-materialize load from stub in PIC mode"), cl::init(false), cl::Hidden); enum { // Select which memory operand is being unfolded. // (stored in bits 0 - 3) TB_INDEX_0 = 0, TB_INDEX_1 = 1, TB_INDEX_2 = 2, TB_INDEX_3 = 3, TB_INDEX_4 = 4, TB_INDEX_MASK = 0xf, // Do not insert the reverse map (MemOp -> RegOp) into the table. // This may be needed because there is a many -> one mapping. TB_NO_REVERSE = 1 << 4, // Do not insert the forward map (RegOp -> MemOp) into the table. 
// This is needed for Native Client, which prohibits branch // instructions from using a memory operand. TB_NO_FORWARD = 1 << 5, TB_FOLDED_LOAD = 1 << 6, TB_FOLDED_STORE = 1 << 7, // Minimum alignment required for load/store. // Used for RegOp->MemOp conversion. // (stored in bits 8 - 15) TB_ALIGN_SHIFT = 8, TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT, TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT, TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT, TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT, TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT }; struct X86OpTblEntry { uint16_t RegOp; uint16_t MemOp; uint16_t Flags; }; // Pin the vtable to this file. void X86InstrInfo::anchor() {} X86InstrInfo::X86InstrInfo(X86Subtarget &STI) : X86GenInstrInfo( (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64 : X86::ADJCALLSTACKDOWN32), (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64 : X86::ADJCALLSTACKUP32)), Subtarget(STI), RI(STI) { static const X86OpTblEntry OpTbl2Addr[] = { { X86::ADC32ri, X86::ADC32mi, 0 }, { X86::ADC32ri8, X86::ADC32mi8, 0 }, { X86::ADC32rr, X86::ADC32mr, 0 }, { X86::ADC64ri32, X86::ADC64mi32, 0 }, { X86::ADC64ri8, X86::ADC64mi8, 0 }, { X86::ADC64rr, X86::ADC64mr, 0 }, { X86::ADD16ri, X86::ADD16mi, 0 }, { X86::ADD16ri8, X86::ADD16mi8, 0 }, { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, { X86::ADD16rr, X86::ADD16mr, 0 }, { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, { X86::ADD32ri, X86::ADD32mi, 0 }, { X86::ADD32ri8, X86::ADD32mi8, 0 }, { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, { X86::ADD32rr, X86::ADD32mr, 0 }, { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, { X86::ADD64ri32, X86::ADD64mi32, 0 }, { X86::ADD64ri8, X86::ADD64mi8, 0 }, { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, { X86::ADD64rr, X86::ADD64mr, 0 }, { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, { X86::ADD8ri, X86::ADD8mi, 0 }, { X86::ADD8rr, X86::ADD8mr, 0 }, { 
X86::AND16ri, X86::AND16mi, 0 }, { X86::AND16ri8, X86::AND16mi8, 0 }, { X86::AND16rr, X86::AND16mr, 0 }, { X86::AND32ri, X86::AND32mi, 0 }, { X86::AND32ri8, X86::AND32mi8, 0 }, { X86::AND32rr, X86::AND32mr, 0 }, { X86::AND64ri32, X86::AND64mi32, 0 }, { X86::AND64ri8, X86::AND64mi8, 0 }, { X86::AND64rr, X86::AND64mr, 0 }, { X86::AND8ri, X86::AND8mi, 0 }, { X86::AND8rr, X86::AND8mr, 0 }, { X86::DEC16r, X86::DEC16m, 0 }, { X86::DEC32r, X86::DEC32m, 0 }, { X86::DEC64r, X86::DEC64m, 0 }, { X86::DEC8r, X86::DEC8m, 0 }, { X86::INC16r, X86::INC16m, 0 }, { X86::INC32r, X86::INC32m, 0 }, { X86::INC64r, X86::INC64m, 0 }, { X86::INC8r, X86::INC8m, 0 }, { X86::NEG16r, X86::NEG16m, 0 }, { X86::NEG32r, X86::NEG32m, 0 }, { X86::NEG64r, X86::NEG64m, 0 }, { X86::NEG8r, X86::NEG8m, 0 }, { X86::NOT16r, X86::NOT16m, 0 }, { X86::NOT32r, X86::NOT32m, 0 }, { X86::NOT64r, X86::NOT64m, 0 }, { X86::NOT8r, X86::NOT8m, 0 }, { X86::OR16ri, X86::OR16mi, 0 }, { X86::OR16ri8, X86::OR16mi8, 0 }, { X86::OR16rr, X86::OR16mr, 0 }, { X86::OR32ri, X86::OR32mi, 0 }, { X86::OR32ri8, X86::OR32mi8, 0 }, { X86::OR32rr, X86::OR32mr, 0 }, { X86::OR64ri32, X86::OR64mi32, 0 }, { X86::OR64ri8, X86::OR64mi8, 0 }, { X86::OR64rr, X86::OR64mr, 0 }, { X86::OR8ri, X86::OR8mi, 0 }, { X86::OR8rr, X86::OR8mr, 0 }, { X86::ROL16r1, X86::ROL16m1, 0 }, { X86::ROL16rCL, X86::ROL16mCL, 0 }, { X86::ROL16ri, X86::ROL16mi, 0 }, { X86::ROL32r1, X86::ROL32m1, 0 }, { X86::ROL32rCL, X86::ROL32mCL, 0 }, { X86::ROL32ri, X86::ROL32mi, 0 }, { X86::ROL64r1, X86::ROL64m1, 0 }, { X86::ROL64rCL, X86::ROL64mCL, 0 }, { X86::ROL64ri, X86::ROL64mi, 0 }, { X86::ROL8r1, X86::ROL8m1, 0 }, { X86::ROL8rCL, X86::ROL8mCL, 0 }, { X86::ROL8ri, X86::ROL8mi, 0 }, { X86::ROR16r1, X86::ROR16m1, 0 }, { X86::ROR16rCL, X86::ROR16mCL, 0 }, { X86::ROR16ri, X86::ROR16mi, 0 }, { X86::ROR32r1, X86::ROR32m1, 0 }, { X86::ROR32rCL, X86::ROR32mCL, 0 }, { X86::ROR32ri, X86::ROR32mi, 0 }, { X86::ROR64r1, X86::ROR64m1, 0 }, { X86::ROR64rCL, X86::ROR64mCL, 0 }, { 
X86::ROR64ri, X86::ROR64mi, 0 }, { X86::ROR8r1, X86::ROR8m1, 0 }, { X86::ROR8rCL, X86::ROR8mCL, 0 }, { X86::ROR8ri, X86::ROR8mi, 0 }, { X86::SAR16r1, X86::SAR16m1, 0 }, { X86::SAR16rCL, X86::SAR16mCL, 0 }, { X86::SAR16ri, X86::SAR16mi, 0 }, { X86::SAR32r1, X86::SAR32m1, 0 }, { X86::SAR32rCL, X86::SAR32mCL, 0 }, { X86::SAR32ri, X86::SAR32mi, 0 }, { X86::SAR64r1, X86::SAR64m1, 0 }, { X86::SAR64rCL, X86::SAR64mCL, 0 }, { X86::SAR64ri, X86::SAR64mi, 0 }, { X86::SAR8r1, X86::SAR8m1, 0 }, { X86::SAR8rCL, X86::SAR8mCL, 0 }, { X86::SAR8ri, X86::SAR8mi, 0 }, { X86::SBB32ri, X86::SBB32mi, 0 }, { X86::SBB32ri8, X86::SBB32mi8, 0 }, { X86::SBB32rr, X86::SBB32mr, 0 }, { X86::SBB64ri32, X86::SBB64mi32, 0 }, { X86::SBB64ri8, X86::SBB64mi8, 0 }, { X86::SBB64rr, X86::SBB64mr, 0 }, { X86::SHL16rCL, X86::SHL16mCL, 0 }, { X86::SHL16ri, X86::SHL16mi, 0 }, { X86::SHL32rCL, X86::SHL32mCL, 0 }, { X86::SHL32ri, X86::SHL32mi, 0 }, { X86::SHL64rCL, X86::SHL64mCL, 0 }, { X86::SHL64ri, X86::SHL64mi, 0 }, { X86::SHL8rCL, X86::SHL8mCL, 0 }, { X86::SHL8ri, X86::SHL8mi, 0 }, { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 }, { X86::SHLD16rri8, X86::SHLD16mri8, 0 }, { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 }, { X86::SHLD32rri8, X86::SHLD32mri8, 0 }, { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 }, { X86::SHLD64rri8, X86::SHLD64mri8, 0 }, { X86::SHR16r1, X86::SHR16m1, 0 }, { X86::SHR16rCL, X86::SHR16mCL, 0 }, { X86::SHR16ri, X86::SHR16mi, 0 }, { X86::SHR32r1, X86::SHR32m1, 0 }, { X86::SHR32rCL, X86::SHR32mCL, 0 }, { X86::SHR32ri, X86::SHR32mi, 0 }, { X86::SHR64r1, X86::SHR64m1, 0 }, { X86::SHR64rCL, X86::SHR64mCL, 0 }, { X86::SHR64ri, X86::SHR64mi, 0 }, { X86::SHR8r1, X86::SHR8m1, 0 }, { X86::SHR8rCL, X86::SHR8mCL, 0 }, { X86::SHR8ri, X86::SHR8mi, 0 }, { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 }, { X86::SHRD16rri8, X86::SHRD16mri8, 0 }, { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 }, { X86::SHRD32rri8, X86::SHRD32mri8, 0 }, { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 }, { X86::SHRD64rri8, X86::SHRD64mri8, 0 }, { X86::SUB16ri, 
X86::SUB16mi, 0 }, { X86::SUB16ri8, X86::SUB16mi8, 0 }, { X86::SUB16rr, X86::SUB16mr, 0 }, { X86::SUB32ri, X86::SUB32mi, 0 }, { X86::SUB32ri8, X86::SUB32mi8, 0 }, { X86::SUB32rr, X86::SUB32mr, 0 }, { X86::SUB64ri32, X86::SUB64mi32, 0 }, { X86::SUB64ri8, X86::SUB64mi8, 0 }, { X86::SUB64rr, X86::SUB64mr, 0 }, { X86::SUB8ri, X86::SUB8mi, 0 }, { X86::SUB8rr, X86::SUB8mr, 0 }, { X86::XOR16ri, X86::XOR16mi, 0 }, { X86::XOR16ri8, X86::XOR16mi8, 0 }, { X86::XOR16rr, X86::XOR16mr, 0 }, { X86::XOR32ri, X86::XOR32mi, 0 }, { X86::XOR32ri8, X86::XOR32mi8, 0 }, { X86::XOR32rr, X86::XOR32mr, 0 }, { X86::XOR64ri32, X86::XOR64mi32, 0 }, { X86::XOR64ri8, X86::XOR64mi8, 0 }, { X86::XOR64rr, X86::XOR64mr, 0 }, { X86::XOR8ri, X86::XOR8mi, 0 }, { X86::XOR8rr, X86::XOR8mr, 0 } }; for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) { unsigned RegOp = OpTbl2Addr[i].RegOp; unsigned MemOp = OpTbl2Addr[i].MemOp; unsigned Flags = OpTbl2Addr[i].Flags; AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, RegOp, MemOp, // Index 0, folded load and store, no alignment requirement. 
Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); } static const X86OpTblEntry OpTbl0[] = { { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD }, { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD }, { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD }, { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD }, { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD }, { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD }, { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD }, { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD }, { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD }, { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD }, { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD }, { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD }, { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD }, { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD }, { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD }, { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD }, { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD }, { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE }, { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD }, { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD }, { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD }, { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD }, { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD }, { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD }, { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD }, { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD }, { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD }, { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE }, { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE }, { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE }, { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE }, { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE }, { X86::MOV8rr_NOREX, 
X86::MOV8mr_NOREX, TB_FOLDED_STORE }, { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE }, { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE }, { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE }, { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE }, { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE }, { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD }, { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD }, { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD }, { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, { X86::SETBr, X86::SETBm, TB_FOLDED_STORE }, { X86::SETEr, X86::SETEm, TB_FOLDED_STORE }, { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE }, { X86::SETGr, X86::SETGm, TB_FOLDED_STORE }, { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE }, { X86::SETLr, X86::SETLm, TB_FOLDED_STORE }, { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE }, { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE }, { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE }, { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE }, { X86::SETOr, X86::SETOm, TB_FOLDED_STORE }, { X86::SETPr, X86::SETPm, TB_FOLDED_STORE }, { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, // AVX 128-bit versions of foldable instructions { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPDrr, 
X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE }, { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE }, { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE }, { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE }, { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE }, // AVX 256-bit foldable instructions { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, // AVX-512 foldable instructions { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE }, { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE }, { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, // AVX-512 foldable instructions (256-bit versions) { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVDQA64Z256rr, 
X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE }, { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE }, { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE }, { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE }, // AVX-512 foldable instructions (128-bit versions) { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE }, { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE }, { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE }, { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE }, { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE }, { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE } }; for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) { unsigned RegOp = OpTbl0[i].RegOp; unsigned MemOp = OpTbl0[i].MemOp; unsigned Flags = OpTbl0[i].Flags; AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, RegOp, MemOp, TB_INDEX_0 | Flags); } static const X86OpTblEntry OpTbl1[] = { { X86::CMP16rr, X86::CMP16rm, 0 }, { X86::CMP32rr, X86::CMP32rm, 0 }, { X86::CMP64rr, X86::CMP64rm, 0 }, { X86::CMP8rr, X86::CMP8rm, 0 }, { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 }, { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 }, { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 }, { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 }, { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 }, { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 }, { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 }, { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 }, { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 }, 
{ X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 }, { X86::IMUL16rri, X86::IMUL16rmi, 0 }, { X86::IMUL16rri8, X86::IMUL16rmi8, 0 }, { X86::IMUL32rri, X86::IMUL32rmi, 0 }, { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, { X86::Int_COMISDrr, X86::Int_COMISDrm, 0 }, { X86::Int_COMISSrr, X86::Int_COMISSrm, 0 }, { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 }, { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 }, { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 }, { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 }, { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 }, { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 }, { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 }, { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 }, { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, 0 }, { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, 0 }, { X86::MOV16rr, X86::MOV16rm, 0 }, { X86::MOV32rr, X86::MOV32rm, 0 }, { X86::MOV64rr, X86::MOV64rm, 0 }, { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 }, { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 }, { X86::MOV8rr, X86::MOV8rm, 0 }, { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, { X86::MOVDDUPrr, X86::MOVDDUPrm, 0 }, { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, { X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 }, { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 }, { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 }, { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 }, { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 }, { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 }, { X86::MOVSX64rr8, 
X86::MOVSX64rm8, 0 }, { X86::MOVUPDrr, X86::MOVUPDrm, TB_ALIGN_16 }, { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, { X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 }, { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, { X86::PABSBrr128, X86::PABSBrm128, TB_ALIGN_16 }, { X86::PABSDrr128, X86::PABSDrm128, TB_ALIGN_16 }, { X86::PABSWrr128, X86::PABSWrm128, TB_ALIGN_16 }, { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, { X86::RCPPSr_Int, X86::RCPPSm_Int, TB_ALIGN_16 }, { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, { X86::RSQRTPSr_Int, X86::RSQRTPSm_Int, TB_ALIGN_16 }, { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 }, { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 }, { X86::SQRTSDr, X86::SQRTSDm, 0 }, { X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 }, { X86::SQRTSSr, X86::SQRTSSm, 0 }, { X86::SQRTSSr_Int, X86::SQRTSSm_Int, 0 }, { X86::TEST16rr, X86::TEST16rm, 0 }, { X86::TEST32rr, X86::TEST32rm, 0 }, { X86::TEST64rr, X86::TEST64rm, 0 }, { X86::TEST8rr, X86::TEST8rm, 0 }, // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 { X86::UCOMISDrr, X86::UCOMISDrm, 0 }, { X86::UCOMISSrr, X86::UCOMISSrm, 0 }, // AVX 128-bit versions of foldable instructions { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, 0 }, { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, 0 }, { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, 0 }, { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, 0 }, { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,0 }, { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, 0 }, { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, { 
X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,0 }, { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, 0 }, { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 }, { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 }, { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 }, { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 }, { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, { X86::VCVTPD2DQrr, X86::VCVTPD2DQXrm, 0 }, { X86::VCVTPD2PSrr, X86::VCVTPD2PSXrm, 0 }, { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 }, { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 }, { X86::VMOVDDUPrr, X86::VMOVDDUPrm, 0 }, { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, TB_ALIGN_16 }, { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 }, { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, { X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 }, { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 }, { X86::VPABSBrr128, X86::VPABSBrm128, 0 }, { X86::VPABSDrr128, X86::VPABSDrm128, 0 }, { X86::VPABSWrr128, X86::VPABSWrm128, 0 }, { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, { X86::VRCPPSr, X86::VRCPPSm, 0 }, { X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 }, { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 }, { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, { X86::VSQRTPSr, X86::VSQRTPSm, 0 }, { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, // AVX 256-bit foldable 
instructions { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 }, { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, // AVX2 foldable instructions { X86::VPABSBrr256, X86::VPABSBrm256, 0 }, { X86::VPABSDrr256, X86::VPABSDrm256, 0 }, { X86::VPABSWrr256, X86::VPABSWrm256, 0 }, { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 }, { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 }, { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 }, { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 }, { X86::BLCI32rr, X86::BLCI32rm, 0 }, { X86::BLCI64rr, X86::BLCI64rm, 0 }, { X86::BLCIC32rr, X86::BLCIC32rm, 0 }, { X86::BLCIC64rr, X86::BLCIC64rm, 0 }, { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 }, { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 }, { X86::BLCS32rr, X86::BLCS32rm, 0 }, { X86::BLCS64rr, X86::BLCS64rm, 0 }, { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 }, { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 }, { X86::BLSI32rr, X86::BLSI32rm, 0 }, { X86::BLSI64rr, X86::BLSI64rm, 0 }, { 
X86::BLSIC32rr, X86::BLSIC32rm, 0 }, { X86::BLSIC64rr, X86::BLSIC64rm, 0 }, { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 }, { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 }, { X86::BLSR32rr, X86::BLSR32rm, 0 }, { X86::BLSR64rr, X86::BLSR64rm, 0 }, { X86::BZHI32rr, X86::BZHI32rm, 0 }, { X86::BZHI64rr, X86::BZHI64rm, 0 }, { X86::LZCNT16rr, X86::LZCNT16rm, 0 }, { X86::LZCNT32rr, X86::LZCNT32rm, 0 }, { X86::LZCNT64rr, X86::LZCNT64rm, 0 }, { X86::POPCNT16rr, X86::POPCNT16rm, 0 }, { X86::POPCNT32rr, X86::POPCNT32rm, 0 }, { X86::POPCNT64rr, X86::POPCNT64rm, 0 }, { X86::RORX32ri, X86::RORX32mi, 0 }, { X86::RORX64ri, X86::RORX64mi, 0 }, { X86::SARX32rr, X86::SARX32rm, 0 }, { X86::SARX64rr, X86::SARX64rm, 0 }, { X86::SHRX32rr, X86::SHRX32rm, 0 }, { X86::SHRX64rr, X86::SHRX64rm, 0 }, { X86::SHLX32rr, X86::SHLX32rm, 0 }, { X86::SHLX64rr, X86::SHLX64rm, 0 }, { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 }, { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 }, { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, { X86::TZMSK32rr, X86::TZMSK32rm, 0 }, { X86::TZMSK64rr, X86::TZMSK64rm, 0 }, // AVX-512 foldable instructions { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 }, { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 }, { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 }, { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 }, { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 }, { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 }, { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, // AVX-512 foldable instructions (256-bit versions) { X86::VMOVAPDZ256rr, 
X86::VMOVAPDZ256rm, TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 }, { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 }, { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 }, { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 }, { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, // AVX-512 foldable instructions (128-bit versions) { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 }, { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 }, { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 }, { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 }, { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, // AES foldable instructions { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, { X86::VAESIMCrr, X86::VAESIMCrm, TB_ALIGN_16 }, { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 } }; for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { unsigned RegOp = OpTbl1[i].RegOp; unsigned MemOp = OpTbl1[i].MemOp; unsigned Flags = OpTbl1[i].Flags; AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, RegOp, MemOp, // Index 1, folded load Flags | TB_INDEX_1 | TB_FOLDED_LOAD); } static const X86OpTblEntry OpTbl2[] = { { X86::ADC32rr, X86::ADC32rm, 0 }, { X86::ADC64rr, X86::ADC64rm, 0 }, { X86::ADD16rr,
X86::ADD16rm, 0 }, { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, { X86::ADD32rr, X86::ADD32rm, 0 }, { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, { X86::ADD64rr, X86::ADD64rm, 0 }, { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, { X86::ADD8rr, X86::ADD8rm, 0 }, { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, { X86::ADDSDrr, X86::ADDSDrm, 0 }, { X86::ADDSSrr, X86::ADDSSrm, 0 }, { X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, { X86::AND16rr, X86::AND16rm, 0 }, { X86::AND32rr, X86::AND32rm, 0 }, { X86::AND64rr, X86::AND64rm, 0 }, { X86::AND8rr, X86::AND8rm, 0 }, { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 }, { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 }, { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 }, { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 }, { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 }, { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 }, { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 }, { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 }, { X86::CMOVA16rr, X86::CMOVA16rm, 0 }, { X86::CMOVA32rr, X86::CMOVA32rm, 0 }, { X86::CMOVA64rr, X86::CMOVA64rm, 0 }, { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 }, { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 }, { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 }, { X86::CMOVB16rr, X86::CMOVB16rm, 0 }, { X86::CMOVB32rr, X86::CMOVB32rm, 0 }, { X86::CMOVB64rr, X86::CMOVB64rm, 0 }, { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 }, { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 }, { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 }, { X86::CMOVE16rr, X86::CMOVE16rm, 0 }, { X86::CMOVE32rr, X86::CMOVE32rm, 0 }, { X86::CMOVE64rr, X86::CMOVE64rm, 0 }, { X86::CMOVG16rr, X86::CMOVG16rm, 0 }, { X86::CMOVG32rr, X86::CMOVG32rm, 0 }, { X86::CMOVG64rr, X86::CMOVG64rm, 0 }, { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 }, { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 }, { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 }, { X86::CMOVL16rr, X86::CMOVL16rm, 0 }, { X86::CMOVL32rr, X86::CMOVL32rm, 0 }, { X86::CMOVL64rr, 
X86::CMOVL64rm, 0 }, { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 }, { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 }, { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 }, { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 }, { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 }, { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 }, { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 }, { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 }, { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 }, { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 }, { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 }, { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 }, { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 }, { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 }, { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 }, { X86::CMOVO16rr, X86::CMOVO16rm, 0 }, { X86::CMOVO32rr, X86::CMOVO32rm, 0 }, { X86::CMOVO64rr, X86::CMOVO64rm, 0 }, { X86::CMOVP16rr, X86::CMOVP16rm, 0 }, { X86::CMOVP32rr, X86::CMOVP32rm, 0 }, { X86::CMOVP64rr, X86::CMOVP64rm, 0 }, { X86::CMOVS16rr, X86::CMOVS16rm, 0 }, { X86::CMOVS32rr, X86::CMOVS32rm, 0 }, { X86::CMOVS64rr, X86::CMOVS64rm, 0 }, { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 }, { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, { X86::CMPSDrr, X86::CMPSDrm, 0 }, { X86::CMPSSrr, X86::CMPSSrm, 0 }, { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, { X86::DIVSDrr, X86::DIVSDrm, 0 }, { X86::DIVSSrr, X86::DIVSSrm, 0 }, { X86::FsANDNPDrr, X86::FsANDNPDrm, TB_ALIGN_16 }, { X86::FsANDNPSrr, X86::FsANDNPSrm, TB_ALIGN_16 }, { X86::FsANDPDrr, X86::FsANDPDrm, TB_ALIGN_16 }, { X86::FsANDPSrr, X86::FsANDPSrm, TB_ALIGN_16 }, { X86::FsORPDrr, X86::FsORPDrm, TB_ALIGN_16 }, { X86::FsORPSrr, X86::FsORPSrm, TB_ALIGN_16 }, { X86::FsXORPDrr, X86::FsXORPDrm, TB_ALIGN_16 }, { X86::FsXORPSrr, X86::FsXORPSrm, TB_ALIGN_16 }, { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 }, { X86::IMUL16rr, X86::IMUL16rm, 0 }, { X86::IMUL32rr, X86::IMUL32rm, 0 }, { X86::IMUL64rr, X86::IMUL64rm, 0 
}, { X86::Int_CMPSDrr, X86::Int_CMPSDrm, 0 }, { X86::Int_CMPSSrr, X86::Int_CMPSSrm, 0 }, { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, 0 }, { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 }, { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 }, { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, { X86::MAXSDrr, X86::MAXSDrm, 0 }, { X86::MAXSSrr, X86::MAXSSrm, 0 }, { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, { X86::MINSDrr, X86::MINSDrm, 0 }, { X86::MINSSrr, X86::MINSSrm, 0 }, { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, { X86::MULSDrr, X86::MULSDrm, 0 }, { X86::MULSSrr, X86::MULSSrm, 0 }, { X86::OR16rr, X86::OR16rm, 0 }, { X86::OR32rr, X86::OR32rm, 0 }, { X86::OR64rr, X86::OR64rm, 0 }, { X86::OR8rr, X86::OR8rm, 0 }, { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 }, { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 }, { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 }, { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 }, { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 }, { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 }, { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 }, { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 }, { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 }, { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 }, { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 }, { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 }, { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 }, { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 }, { X86::PALIGNR128rr, X86::PALIGNR128rm, TB_ALIGN_16 }, { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 }, { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 }, { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 }, { 
X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 }, { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 }, { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 }, { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 }, { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 }, { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 }, { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 }, { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 }, { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 }, { X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 }, { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 }, { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 }, { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 }, { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 }, { X86::PINSRWrri, X86::PINSRWrmi, TB_ALIGN_16 }, { X86::PMADDUBSWrr128, X86::PMADDUBSWrm128, TB_ALIGN_16 }, { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 }, { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 }, { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 }, { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 }, { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 }, { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 }, { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 }, { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, { X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 }, { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 }, { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 }, { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 }, { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 }, { X86::PORrr, X86::PORrm, TB_ALIGN_16 }, { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 }, { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 }, { X86::PSIGNBrr, X86::PSIGNBrm, TB_ALIGN_16 }, { X86::PSIGNWrr, X86::PSIGNWrm, TB_ALIGN_16 }, { X86::PSIGNDrr, 
X86::PSIGNDrm, TB_ALIGN_16 }, { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 }, { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 }, { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 }, { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 }, { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 }, { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 }, { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 }, { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 }, { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 }, { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 }, { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 }, { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 }, { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 }, { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 }, { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 }, { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 }, { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 }, { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 }, { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 }, { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, { X86::SBB32rr, X86::SBB32rm, 0 }, { X86::SBB64rr, X86::SBB64rm, 0 }, { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 }, { X86::SUB16rr, X86::SUB16rm, 0 }, { X86::SUB32rr, X86::SUB32rm, 0 }, { X86::SUB64rr, X86::SUB64rm, 0 }, { X86::SUB8rr, X86::SUB8rm, 0 }, { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, { X86::SUBSDrr, X86::SUBSDrm, 0 }, { X86::SUBSSrr, X86::SUBSSrm, 0 }, // FIXME: TEST*rr -> swapped operand of TEST*mr. 
{ X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 }, { X86::XOR16rr, X86::XOR16rm, 0 }, { X86::XOR32rr, X86::XOR32rm, 0 }, { X86::XOR64rr, X86::XOR64rm, 0 }, { X86::XOR8rr, X86::XOR8rm, 0 }, { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 }, { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 }, // AVX 128-bit versions of foldable instructions { X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 }, { X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, 0 }, { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 }, { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 }, { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 }, { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, { X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 }, { X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 }, { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, { X86::VADDPDrr, X86::VADDPDrm, 0 }, { X86::VADDPSrr, X86::VADDPSrm, 0 }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, { X86::VADDSSrr, X86::VADDSSrm, 0 }, { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, { X86::VANDNPSrr, X86::VANDNPSrm, 0 }, { X86::VANDPDrr, X86::VANDPDrm, 0 }, { X86::VANDPSrr, X86::VANDPSrm, 0 }, { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 }, { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 }, { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 }, { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 }, { X86::VCMPPDrri, X86::VCMPPDrmi, 0 }, { X86::VCMPPSrri, X86::VCMPPSrmi, 0 }, { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, { 
X86::VDIVSSrr, X86::VDIVSSrm, 0 }, { X86::VFsANDNPDrr, X86::VFsANDNPDrm, TB_ALIGN_16 }, { X86::VFsANDNPSrr, X86::VFsANDNPSrm, TB_ALIGN_16 }, { X86::VFsANDPDrr, X86::VFsANDPDrm, TB_ALIGN_16 }, { X86::VFsANDPSrr, X86::VFsANDPSrm, TB_ALIGN_16 }, { X86::VFsORPDrr, X86::VFsORPDrm, TB_ALIGN_16 }, { X86::VFsORPSrr, X86::VFsORPSrm, TB_ALIGN_16 }, { X86::VFsXORPDrr, X86::VFsXORPDrm, TB_ALIGN_16 }, { X86::VFsXORPSrr, X86::VFsXORPSrm, TB_ALIGN_16 }, { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 }, { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 }, { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 }, { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, { X86::VMAXPSrr, X86::VMAXPSrm, 0 }, { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, { X86::VMINPDrr, X86::VMINPDrm, 0 }, { X86::VMINPSrr, X86::VMINPSrm, 0 }, { X86::VMINSDrr, X86::VMINSDrm, 0 }, { X86::VMINSSrr, X86::VMINSSrm, 0 }, { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, { X86::VMULPDrr, X86::VMULPDrm, 0 }, { X86::VMULPSrr, X86::VMULPSrm, 0 }, { X86::VMULSDrr, X86::VMULSDrm, 0 }, { X86::VMULSSrr, X86::VMULSSrm, 0 }, { X86::VORPDrr, X86::VORPDrm, 0 }, { X86::VORPSrr, X86::VORPSrm, 0 }, { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 }, { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 }, { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 }, { X86::VPADDBrr, X86::VPADDBrm, 0 }, { X86::VPADDDrr, X86::VPADDDrm, 0 }, { X86::VPADDQrr, X86::VPADDQrm, 0 }, { X86::VPADDSBrr, X86::VPADDSBrm, 0 }, { X86::VPADDSWrr, X86::VPADDSWrm, 0 }, { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 }, { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 }, { X86::VPADDWrr, X86::VPADDWrm, 0 }, { X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 }, { X86::VPANDNrr, X86::VPANDNrm, 0 }, { X86::VPANDrr, X86::VPANDrm, 0 }, { X86::VPAVGBrr, X86::VPAVGBrm, 0 }, { X86::VPAVGWrr, X86::VPAVGWrm, 0 }, { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 }, { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 
}, { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 }, { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 }, { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 }, { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 }, { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 }, { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 }, { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 }, { X86::VPHADDDrr, X86::VPHADDDrm, 0 }, { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 }, { X86::VPHADDWrr, X86::VPHADDWrm, 0 }, { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 }, { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 }, { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 }, { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 }, { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 }, { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 }, { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 }, { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 }, { X86::VPMINSWrr, X86::VPMINSWrm, 0 }, { X86::VPMINUBrr, X86::VPMINUBrm, 0 }, { X86::VPMINSBrr, X86::VPMINSBrm, 0 }, { X86::VPMINSDrr, X86::VPMINSDrm, 0 }, { X86::VPMINUDrr, X86::VPMINUDrm, 0 }, { X86::VPMINUWrr, X86::VPMINUWrm, 0 }, { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 }, { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 }, { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, { X86::VPMULDQrr, X86::VPMULDQrm, 0 }, { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 }, { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 }, { X86::VPMULHWrr, X86::VPMULHWrm, 0 }, { X86::VPMULLDrr, X86::VPMULLDrm, 0 }, { X86::VPMULLWrr, X86::VPMULLWrm, 0 }, { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 }, { X86::VPORrr, X86::VPORrm, 0 }, { X86::VPSADBWrr, X86::VPSADBWrm, 0 }, { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 }, { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 }, { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 }, { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 }, { X86::VPSLLDrr, X86::VPSLLDrm, 0 }, { X86::VPSLLQrr, X86::VPSLLQrm, 0 }, { X86::VPSLLWrr, X86::VPSLLWrm, 0 }, { X86::VPSRADrr, X86::VPSRADrm, 0 }, { X86::VPSRAWrr, X86::VPSRAWrm, 0 }, { X86::VPSRLDrr, X86::VPSRLDrm, 0 }, { X86::VPSRLQrr, X86::VPSRLQrm, 0 
}, { X86::VPSRLWrr, X86::VPSRLWrm, 0 }, { X86::VPSUBBrr, X86::VPSUBBrm, 0 }, { X86::VPSUBDrr, X86::VPSUBDrm, 0 }, { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 }, { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 }, { X86::VPSUBWrr, X86::VPSUBWrm, 0 }, { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 }, { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 }, { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 }, { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 }, { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 }, { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 }, { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, { X86::VPXORrr, X86::VPXORrm, 0 }, { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 }, { X86::VXORPDrr, X86::VXORPDrm, 0 }, { X86::VXORPSrr, X86::VXORPSrm, 0 }, // AVX 256-bit foldable instructions { X86::VADDPDYrr, X86::VADDPDYrm, 0 }, { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 }, { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 }, { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 }, { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 }, { X86::VANDPDYrr, X86::VANDPDYrm, 0 }, { X86::VANDPSYrr, X86::VANDPSYrm, 0 }, { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 }, { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 }, { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 }, { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 }, { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 }, { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 }, { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 }, { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 }, { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 }, { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 }, { X86::VINSERTF128rr, 
X86::VINSERTF128rm, 0 }, { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 }, { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, { X86::VMULPDYrr, X86::VMULPDYrm, 0 }, { X86::VMULPSYrr, X86::VMULPSYrm, 0 }, { X86::VORPDYrr, X86::VORPDYrm, 0 }, { X86::VORPSYrr, X86::VORPSYrm, 0 }, { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 }, { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 }, { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 }, { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 }, { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 }, { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 }, { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 }, { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 }, { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 }, { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 }, { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 }, { X86::VXORPDYrr, X86::VXORPDYrm, 0 }, { X86::VXORPSYrr, X86::VXORPSYrm, 0 }, // AVX2 foldable instructions { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 }, { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 }, { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 }, { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 }, { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 }, { X86::VPADDBYrr, X86::VPADDBYrm, 0 }, { X86::VPADDDYrr, X86::VPADDDYrm, 0 }, { X86::VPADDQYrr, X86::VPADDQYrm, 0 }, { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 }, { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 }, { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 }, { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 }, { X86::VPADDWYrr, X86::VPADDWYrm, 0 }, { X86::VPALIGNR256rr, X86::VPALIGNR256rm, 0 }, { X86::VPANDNYrr, X86::VPANDNYrm, 0 }, { X86::VPANDYrr, X86::VPANDYrm, 0 }, { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 }, { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 }, { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 }, { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 }, { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 }, { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 }, { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 }, { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 }, { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 }, { X86::VPCMPGTBYrr, 
X86::VPCMPGTBYrm, 0 }, { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 }, { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 }, { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 }, { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 }, { X86::VPERMDYrr, X86::VPERMDYrm, 0 }, { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 }, { X86::VPERMQYri, X86::VPERMQYmi, 0 }, { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 }, { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 }, { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 }, { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 }, { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 }, { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 }, { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 }, { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 }, { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 }, { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 }, { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 }, { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 }, { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 }, { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 }, { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 }, { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 }, { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 }, { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 }, { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 }, { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 }, { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 }, { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 }, { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 }, { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 }, { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 }, { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 }, { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 }, { X86::VPORYrr, X86::VPORYrm, 0 }, { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 }, { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 }, { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 }, { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 }, { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 }, { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 }, { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 }, { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 }, { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 }, { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 }, { X86::VPSLLVQrr, 
X86::VPSLLVQrm, 0 }, { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 }, { X86::VPSRADYrr, X86::VPSRADYrm, 0 }, { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 }, { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 }, { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 }, { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 }, { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 }, { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 }, { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 }, { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 }, { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 }, { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 }, { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 }, { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 }, { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 }, { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 }, { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 }, { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 }, { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 }, { X86::VPXORYrr, X86::VPXORYrm, 0 }, // FIXME: add AVX 256-bit foldable instructions // FMA4 foldable patterns { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 }, { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 }, { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_16 }, { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_16 }, { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_32 }, { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_32 }, { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 }, { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 }, { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_16 }, { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_16 }, { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_32 }, { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_32 }, { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 }, { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 }, { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_16 }, { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_16 }, { 
X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_32 }, { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_32 }, { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 }, { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 }, { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_16 }, { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_16 }, { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_32 }, { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_32 }, { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_16 }, { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_16 }, { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_32 }, { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_32 }, { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_16 }, { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_16 }, { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_32 }, { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 }, // BMI/BMI2 foldable instructions { X86::ANDN32rr, X86::ANDN32rm, 0 }, { X86::ANDN64rr, X86::ANDN64rm, 0 }, { X86::MULX32rr, X86::MULX32rm, 0 }, { X86::MULX64rr, X86::MULX64rm, 0 }, { X86::PDEP32rr, X86::PDEP32rm, 0 }, { X86::PDEP64rr, X86::PDEP64rm, 0 }, { X86::PEXT32rr, X86::PEXT32rm, 0 }, { X86::PEXT64rr, X86::PEXT64rm, 0 }, // AVX-512 foldable instructions { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 }, { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 }, { X86::VPMAXUDZrr, 
X86::VPMAXUDZrm, 0 }, { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 }, { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 }, { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 }, { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 }, { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 }, { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 }, { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 }, { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 }, { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 }, { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 }, { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 }, { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 }, { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 }, { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, { X86::VALIGNQrri, X86::VALIGNQrmi, 0 }, { X86::VALIGNDrri, X86::VALIGNDrmi, 0 }, { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, // AVX-512{F,VL} foldable instructions { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, // AVX-512{F,VL} foldable instructions { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 }, { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, // AES foldable instructions { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 }, { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 }, { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 }, { X86::VAESDECLASTrr, X86::VAESDECLASTrm, TB_ALIGN_16 }, { X86::VAESDECrr, X86::VAESDECrm, TB_ALIGN_16 }, { X86::VAESENCLASTrr, X86::VAESENCLASTrm, TB_ALIGN_16 }, { X86::VAESENCrr, X86::VAESENCrm, TB_ALIGN_16 }, // SHA foldable instructions { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 }, { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 }, { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 }, { 
X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 }, { X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 }, { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 }, { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { unsigned RegOp = OpTbl2[i].RegOp; unsigned MemOp = OpTbl2[i].MemOp; unsigned Flags = OpTbl2[i].Flags; AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable, RegOp, MemOp, // Index 2, folded load Flags | TB_INDEX_2 | TB_FOLDED_LOAD); } static const X86OpTblEntry OpTbl3[] = { // FMA foldable instructions { X86::VFMADDSSr231r, X86::VFMADDSSr231m, TB_ALIGN_NONE }, { X86::VFMADDSDr231r, X86::VFMADDSDr231m, TB_ALIGN_NONE }, { X86::VFMADDSSr132r, X86::VFMADDSSr132m, TB_ALIGN_NONE }, { X86::VFMADDSDr132r, X86::VFMADDSDr132m, TB_ALIGN_NONE }, { X86::VFMADDSSr213r, X86::VFMADDSSr213m, TB_ALIGN_NONE }, { X86::VFMADDSDr213r, X86::VFMADDSDr213m, TB_ALIGN_NONE }, { X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_NONE }, { X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_NONE }, { X86::VFMADDPSr132r, X86::VFMADDPSr132m, TB_ALIGN_NONE }, { X86::VFMADDPDr132r, X86::VFMADDPDr132m, TB_ALIGN_NONE }, { X86::VFMADDPSr213r, X86::VFMADDPSr213m, TB_ALIGN_NONE }, { X86::VFMADDPDr213r, X86::VFMADDPDr213m, TB_ALIGN_NONE }, { X86::VFMADDPSr231rY, X86::VFMADDPSr231mY, TB_ALIGN_NONE }, { X86::VFMADDPDr231rY, X86::VFMADDPDr231mY, TB_ALIGN_NONE }, { X86::VFMADDPSr132rY, X86::VFMADDPSr132mY, TB_ALIGN_NONE }, { X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_NONE }, { X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_NONE }, { X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, TB_ALIGN_NONE }, { X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, TB_ALIGN_NONE }, { X86::VFNMADDSSr132r, X86::VFNMADDSSr132m, TB_ALIGN_NONE }, { X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, TB_ALIGN_NONE }, { X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, TB_ALIGN_NONE }, { X86::VFNMADDSDr213r, 
X86::VFNMADDSDr213m, TB_ALIGN_NONE }, { X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_NONE }, { X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_NONE }, { X86::VFNMADDPSr132r, X86::VFNMADDPSr132m, TB_ALIGN_NONE }, { X86::VFNMADDPDr132r, X86::VFNMADDPDr132m, TB_ALIGN_NONE }, { X86::VFNMADDPSr213r, X86::VFNMADDPSr213m, TB_ALIGN_NONE }, { X86::VFNMADDPDr213r, X86::VFNMADDPDr213m, TB_ALIGN_NONE }, { X86::VFNMADDPSr231rY, X86::VFNMADDPSr231mY, TB_ALIGN_NONE }, { X86::VFNMADDPDr231rY, X86::VFNMADDPDr231mY, TB_ALIGN_NONE }, { X86::VFNMADDPSr132rY, X86::VFNMADDPSr132mY, TB_ALIGN_NONE }, { X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_NONE }, { X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_NONE }, { X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_NONE }, { X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, TB_ALIGN_NONE }, { X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, TB_ALIGN_NONE }, { X86::VFMSUBSSr132r, X86::VFMSUBSSr132m, TB_ALIGN_NONE }, { X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, TB_ALIGN_NONE }, { X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, TB_ALIGN_NONE }, { X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, TB_ALIGN_NONE }, { X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_NONE }, { X86::VFMSUBPSr132r, X86::VFMSUBPSr132m, TB_ALIGN_NONE }, { X86::VFMSUBPDr132r, X86::VFMSUBPDr132m, TB_ALIGN_NONE }, { X86::VFMSUBPSr213r, X86::VFMSUBPSr213m, TB_ALIGN_NONE }, { X86::VFMSUBPDr213r, X86::VFMSUBPDr213m, TB_ALIGN_NONE }, { X86::VFMSUBPSr231rY, X86::VFMSUBPSr231mY, TB_ALIGN_NONE }, { X86::VFMSUBPDr231rY, X86::VFMSUBPDr231mY, TB_ALIGN_NONE }, { X86::VFMSUBPSr132rY, X86::VFMSUBPSr132mY, TB_ALIGN_NONE }, { X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_NONE }, { X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_NONE }, { X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_NONE }, { X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, TB_ALIGN_NONE }, { X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, TB_ALIGN_NONE }, { 
X86::VFNMSUBSSr132r, X86::VFNMSUBSSr132m, TB_ALIGN_NONE }, { X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, TB_ALIGN_NONE }, { X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, TB_ALIGN_NONE }, { X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, TB_ALIGN_NONE }, { X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_NONE }, { X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_NONE }, { X86::VFNMSUBPSr132r, X86::VFNMSUBPSr132m, TB_ALIGN_NONE }, { X86::VFNMSUBPDr132r, X86::VFNMSUBPDr132m, TB_ALIGN_NONE }, { X86::VFNMSUBPSr213r, X86::VFNMSUBPSr213m, TB_ALIGN_NONE }, { X86::VFNMSUBPDr213r, X86::VFNMSUBPDr213m, TB_ALIGN_NONE }, { X86::VFNMSUBPSr231rY, X86::VFNMSUBPSr231mY, TB_ALIGN_NONE }, { X86::VFNMSUBPDr231rY, X86::VFNMSUBPDr231mY, TB_ALIGN_NONE }, { X86::VFNMSUBPSr132rY, X86::VFNMSUBPSr132mY, TB_ALIGN_NONE }, { X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_NONE }, { X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_NONE }, { X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_NONE }, { X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_NONE }, { X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_NONE }, { X86::VFMADDSUBPSr132r, X86::VFMADDSUBPSr132m, TB_ALIGN_NONE }, { X86::VFMADDSUBPDr132r, X86::VFMADDSUBPDr132m, TB_ALIGN_NONE }, { X86::VFMADDSUBPSr213r, X86::VFMADDSUBPSr213m, TB_ALIGN_NONE }, { X86::VFMADDSUBPDr213r, X86::VFMADDSUBPDr213m, TB_ALIGN_NONE }, { X86::VFMADDSUBPSr231rY, X86::VFMADDSUBPSr231mY, TB_ALIGN_NONE }, { X86::VFMADDSUBPDr231rY, X86::VFMADDSUBPDr231mY, TB_ALIGN_NONE }, { X86::VFMADDSUBPSr132rY, X86::VFMADDSUBPSr132mY, TB_ALIGN_NONE }, { X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_NONE }, { X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_NONE }, { X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_NONE }, { X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_NONE }, { X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_NONE }, { X86::VFMSUBADDPSr132r, X86::VFMSUBADDPSr132m, TB_ALIGN_NONE }, { X86::VFMSUBADDPDr132r, 
X86::VFMSUBADDPDr132m, TB_ALIGN_NONE }, { X86::VFMSUBADDPSr213r, X86::VFMSUBADDPSr213m, TB_ALIGN_NONE }, { X86::VFMSUBADDPDr213r, X86::VFMSUBADDPDr213m, TB_ALIGN_NONE }, { X86::VFMSUBADDPSr231rY, X86::VFMSUBADDPSr231mY, TB_ALIGN_NONE }, { X86::VFMSUBADDPDr231rY, X86::VFMSUBADDPDr231mY, TB_ALIGN_NONE }, { X86::VFMSUBADDPSr132rY, X86::VFMSUBADDPSr132mY, TB_ALIGN_NONE }, { X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_NONE }, { X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_NONE }, { X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_NONE }, // FMA4 foldable patterns { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 }, { X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 }, { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 }, { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 }, { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 }, { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 }, { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 }, { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 }, { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 }, { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 }, { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 }, { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 }, { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 }, { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 }, { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 }, { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 }, { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 }, { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 }, { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 }, { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 }, { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 }, { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 }, { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 }, { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 }, { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 }, { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 }, { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, 
TB_ALIGN_32 }, { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 }, { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 }, { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 }, { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 }, { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 }, // AVX-512 VPERMI instructions with 3 source operands. { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, { X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 }, { X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 }, { X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 }, { X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 }, { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, // AVX-512 arithmetic instructions { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, { X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 }, { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, // AVX-512{F,VL} arithmetic instructions 256-bit { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, { 
X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, // AVX-512{F,VL} arithmetic instructions 128-bit { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 } }; for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) { unsigned RegOp = OpTbl3[i].RegOp; unsigned MemOp = OpTbl3[i].MemOp; unsigned Flags = OpTbl3[i].Flags; AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, RegOp, MemOp, // Index 3, folded load Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } static const X86OpTblEntry OpTbl4[] = { // AVX-512 foldable instructions { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, // AVX-512{F,VL} foldable instructions 256-bit { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 }, { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, { X86::VMULPSZ256rrk, 
X86::VMULPSZ256rmk, 0 },
    { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
    { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
    { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
    { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
    { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
    { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
    { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
    // AVX-512{F,VL} foldable instructions 128-bit
    { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
    { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
    { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
    { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
    { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
    { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
    { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
    { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
    { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
    { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
    { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
    { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }
  };

  // Register every OpTbl4 pair, tagging each entry with TB_INDEX_4 and
  // TB_FOLDED_LOAD in addition to its own flags.
  for (unsigned i = 0, e = array_lengthof(OpTbl4); i != e; ++i) {
    unsigned RegOp = OpTbl4[i].RegOp;
    unsigned MemOp = OpTbl4[i].MemOp;
    unsigned Flags = OpTbl4[i].Flags;
    AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
                  RegOp, MemOp,
                  // Index 4, folded load
                  Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
  }
}

/// AddTableEntry - Record one register-form / memory-form opcode pair.
///
/// Unless \p Flags contains TB_NO_FORWARD, stores (MemOp, Flags) in
/// \p R2MTable under key \p RegOp. Unless \p Flags contains TB_NO_REVERSE,
/// stores (RegOp, Flags) in \p M2RTable under key \p MemOp. In both cases an
/// assertion checks that the key is not already present.
void X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
                                 MemOp2RegOpTableType &M2RTable,
                                 unsigned RegOp, unsigned MemOp,
                                 unsigned Flags) {
  if ((Flags & TB_NO_FORWARD) == 0) {
    assert(!R2MTable.count(RegOp) && "Duplicate entry!");
    R2MTable[RegOp] = std::make_pair(MemOp, Flags);
  }
  if ((Flags & TB_NO_REVERSE) == 0) {
    assert(!M2RTable.count(MemOp) &&
           "Duplicated entries in unfolding maps?");
    M2RTable[MemOp] = std::make_pair(RegOp, Flags);
  }
}

/// isCoalescableExtInstr - For the MOVSX/MOVZX register-to-register opcodes
/// handled in the switch, report the source register (operand 1) in
/// \p SrcReg, the destination register (operand 0) in \p DstReg and the
/// sub-register index matching the source width in \p SubIdx, and return
/// true. Returns false for any other opcode, when either operand carries a
/// sub-register index, or for 8-bit sources when the subtarget is not 64-bit.
bool
X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                    unsigned &SrcReg, unsigned &DstReg,
                                    unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default: break;
  // Extensions from an 8-bit source.
  case X86::MOVSX16rr8:
  case X86::MOVZX16rr8:
  case X86::MOVSX32rr8:
  case X86::MOVZX32rr8:
  case X86::MOVSX64rr8:
    if
(!Subtarget.is64Bit()) // It's not always legal to reference the low 8-bit of the larger // register in 32-bit mode. return false; case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: case X86::MOVSX64rr32: { if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg()) // Be conservative. return false; SrcReg = MI.getOperand(1).getReg(); DstReg = MI.getOperand(0).getReg(); switch (MI.getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::MOVSX16rr8: case X86::MOVZX16rr8: case X86::MOVSX32rr8: case X86::MOVZX32rr8: case X86::MOVSX64rr8: SubIdx = X86::sub_8bit; break; case X86::MOVSX32rr16: case X86::MOVZX32rr16: case X86::MOVSX64rr16: SubIdx = X86::sub_16bit; break; case X86::MOVSX64rr32: SubIdx = X86::sub_32bit; break; } return true; } } return false; } +int X86InstrInfo::getSPAdjust(const MachineInstr *MI) const { + const MachineFunction *MF = MI->getParent()->getParent(); + const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); + + if (MI->getOpcode() == getCallFrameSetupOpcode() || + MI->getOpcode() == getCallFrameDestroyOpcode()) { + unsigned StackAlign = TFI->getStackAlignment(); + int SPAdj = (MI->getOperand(0).getImm() + StackAlign - 1) / StackAlign * + StackAlign; + + SPAdj -= MI->getOperand(1).getImm(); + + if (MI->getOpcode() == getCallFrameSetupOpcode()) + return SPAdj; + else + return -SPAdj; + } + + // To know whether a call adjusts the stack, we need information + // that is bound to the following ADJCALLSTACKUP pseudo. + // Look for the next ADJCALLSTACKUP that follows the call. + if (MI->isCall()) { + const MachineBasicBlock* MBB = MI->getParent(); + auto I = ++MachineBasicBlock::const_iterator(MI); + for (auto E = MBB->end(); I != E; ++I) { + if (I->getOpcode() == getCallFrameDestroyOpcode() || + I->isCall()) + break; + } + + // If we could not find a frame destroy opcode, then it has already + // been simplified, so we don't care. 
+ if (I->getOpcode() != getCallFrameDestroyOpcode()) + return 0; + + return -(I->getOperand(1).getImm()); + } + + // Currently handle only PUSHes we can reasonably expect to see + // in call sequences + switch (MI->getOpcode()) { + default: + return 0; + case X86::PUSH32i8: + case X86::PUSH32r: + case X86::PUSH32rmm: + case X86::PUSH32rmr: + case X86::PUSHi32: + return 4; + } +} + /// isFrameOperand - Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op, int &FrameIndex) const { if (MI->getOperand(Op+X86::AddrBaseReg).isFI() && MI->getOperand(Op+X86::AddrScaleAmt).isImm() && MI->getOperand(Op+X86::AddrIndexReg).isReg() && MI->getOperand(Op+X86::AddrDisp).isImm() && MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 && MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 && MI->getOperand(Op+X86::AddrDisp).getImm() == 0) { FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex(); return true; } return false; } static bool isFrameLoadOpcode(int Opcode) { switch (Opcode) { default: return false; case X86::MOV8rm: case X86::MOV16rm: case X86::MOV32rm: case X86::MOV64rm: case X86::LD_Fp64m: case X86::MOVSSrm: case X86::MOVSDrm: case X86::MOVAPSrm: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::VMOVSSrm: case X86::VMOVSDrm: case X86::VMOVAPSrm: case X86::VMOVAPDrm: case X86::VMOVDQArm: case X86::VMOVUPSYrm: case X86::VMOVAPSYrm: case X86::VMOVUPDYrm: case X86::VMOVAPDYrm: case X86::VMOVDQUYrm: case X86::VMOVDQAYrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::VMOVAPSZrm: case X86::VMOVUPSZrm: return true; } } static bool isFrameStoreOpcode(int Opcode) { switch (Opcode) { default: break; case X86::MOV8mr: case X86::MOV16mr: case X86::MOV32mr: case X86::MOV64mr: case X86::ST_FpP64m: case X86::MOVSSmr: case X86::MOVSDmr: case X86::MOVAPSmr: case X86::MOVAPDmr: case X86::MOVDQAmr: case X86::VMOVSSmr: case X86::VMOVSDmr: 
case X86::VMOVAPSmr:
  case X86::VMOVAPDmr:
  case X86::VMOVDQAmr:
  case X86::VMOVUPSYmr:
  case X86::VMOVAPSYmr:
  case X86::VMOVUPDYmr:
  case X86::VMOVAPDYmr:
  case X86::VMOVDQUYmr:
  case X86::VMOVDQAYmr:
  case X86::VMOVUPSZmr:
  case X86::VMOVAPSZmr:
  case X86::MMX_MOVD64mr:
  case X86::MMX_MOVQ64mr:
  case X86::MMX_MOVNTQmr:
    return true;
  }
  return false;
}

/// isLoadFromStackSlot - If \p MI has an opcode accepted by
/// isFrameLoadOpcode, operand 0 carries no sub-register index, and the
/// operands starting at index 1 satisfy isFrameOperand, set \p FrameIndex and
/// return the register of operand 0. Otherwise return 0.
unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
                                           int &FrameIndex) const {
  if (isFrameLoadOpcode(MI->getOpcode()))
    if (MI->getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
      return MI->getOperand(0).getReg();
  return 0;
}

/// isLoadFromStackSlotPostFE - Like isLoadFromStackSlot, but when that check
/// yields 0 for a frame-load opcode, fall back to hasLoadFromStackSlot and
/// return its result.
/// NOTE(review): hasLoadFromStackSlot is declared elsewhere; its result is
/// converted to unsigned here, so it is presumably a success flag rather
/// than a register number -- confirm against its declaration.
unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI,
                                                 int &FrameIndex) const {
  if (isFrameLoadOpcode(MI->getOpcode())) {
    unsigned Reg;
    if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
      return Reg;
    // Check for post-frame index elimination operations
    const MachineMemOperand *Dummy;
    return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
  }
  return 0;
}

/// isStoreToStackSlot - If \p MI has an opcode accepted by
/// isFrameStoreOpcode, the operand at index X86::AddrNumOperands carries no
/// sub-register index, and the operands starting at index 0 satisfy
/// isFrameOperand, set \p FrameIndex and return the register of the operand
/// at index X86::AddrNumOperands. Otherwise return 0.
unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
                                          int &FrameIndex) const {
  if (isFrameStoreOpcode(MI->getOpcode()))
    if (MI->getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
        isFrameOperand(MI, 0, FrameIndex))
      return MI->getOperand(X86::AddrNumOperands).getReg();
  return 0;
}

/// isStoreToStackSlotPostFE - Like isStoreToStackSlot, but when that check
/// yields 0 for a frame-store opcode, fall back to hasStoreToStackSlot and
/// return its result.
/// NOTE(review): hasStoreToStackSlot is declared elsewhere; its result is
/// converted to unsigned here -- confirm what value callers should expect.
unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
                                                int &FrameIndex) const {
  if (isFrameStoreOpcode(MI->getOpcode())) {
    unsigned Reg;
    if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
      return Reg;
    // Check for post-frame index elimination operations
    const MachineMemOperand *Dummy;
    return hasStoreToStackSlot(MI, Dummy, FrameIndex);
  }
  return 0;
}

/// regIsPICBase - Return true if register is PIC base (i.e. defined by
/// X86::MOVPC32r).
///
/// Only virtual registers are examined. The result is true when the register
/// has at least one defining instruction and every defining instruction is
/// X86::MOVPC32r; an assertion fires if a second such definition is seen.
static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
  // Don't waste compile time scanning use-def chains of physregs.
  if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
    return false;
  bool isPICBase = false;
  for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
         E = MRI.def_instr_end(); I != E; ++I) {
    MachineInstr *DefMI = &*I;
    if (DefMI->getOpcode() != X86::MOVPC32r)
      return false;
    assert(!isPICBase && "More than one PIC base?");
    isPICBase = true;
  }
  return isPICBase;
}

/// isReallyTriviallyReMaterializable - Decide whether \p MI can be
/// rematerialized trivially. The load opcodes and the LEA opcodes handled in
/// the switch are checked against their address operands; every other opcode
/// returns true.
bool
X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                                AliasAnalysis *AA) const {
  switch (MI->getOpcode()) {
  default: break;
  case X86::MOV8rm:
  case X86::MOV16rm:
  case X86::MOV32rm:
  case X86::MOV64rm:
  case X86::LD_Fp64m:
  case X86::MOVSSrm:
  case X86::MOVSDrm:
  case X86::MOVAPSrm:
  case X86::MOVUPSrm:
  case X86::MOVAPDrm:
  case X86::MOVDQArm:
  case X86::MOVDQUrm:
  case X86::VMOVSSrm:
  case X86::VMOVSDrm:
  case X86::VMOVAPSrm:
  case X86::VMOVUPSrm:
  case X86::VMOVAPDrm:
  case X86::VMOVDQArm:
  case X86::VMOVDQUrm:
  case X86::VMOVAPSYrm:
  case X86::VMOVUPSYrm:
  case X86::VMOVAPDYrm:
  case X86::VMOVDQAYrm:
  case X86::VMOVDQUYrm:
  case X86::MMX_MOVD64rm:
  case X86::MMX_MOVQ64rm:
  case X86::FsVMOVAPSrm:
  case X86::FsVMOVAPDrm:
  case X86::FsMOVAPSrm:
  case X86::FsMOVAPDrm: {
    // Loads from constant pools are trivially rematerializable.
    // The address must have a register base, an immediate scale and no index
    // register, and the load must be invariant.
    if (MI->getOperand(1+X86::AddrBaseReg).isReg() &&
        MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
        MI->getOperand(1+X86::AddrIndexReg).isReg() &&
        MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
        MI->isInvariantLoad(AA)) {
      unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
      // No base register, or RIP as the base: accept immediately.
      if (BaseReg == 0 || BaseReg == X86::RIP)
        return true;
      // Allow re-materialization of PIC load.
if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal())
        return false;
      // Otherwise the load qualifies only if its base register is the PIC
      // base.
      const MachineFunction &MF = *MI->getParent()->getParent();
      const MachineRegisterInfo &MRI = MF.getRegInfo();
      return regIsPICBase(BaseReg, MRI);
    }
    return false;
  }

  case X86::LEA32r:
  case X86::LEA64r: {
    if (MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
        MI->getOperand(1+X86::AddrIndexReg).isReg() &&
        MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
        !MI->getOperand(1+X86::AddrDisp).isReg()) {
      // lea fi#, lea GV, etc. are all rematerializable.
      if (!MI->getOperand(1+X86::AddrBaseReg).isReg())
        return true;
      unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
      if (BaseReg == 0)
        return true;
      // Allow re-materialization of lea PICBase + x.
      const MachineFunction &MF = *MI->getParent()->getParent();
      const MachineRegisterInfo &MRI = MF.getRegInfo();
      return regIsPICBase(BaseReg, MRI);
    }
    return false;
  }
  }

  // All other instructions marked M_REMATERIALIZABLE are always trivially
  // rematerializable.
  return true;
}

/// isSafeToClobberEFLAGS - Return true if an instruction that clobbers EFLAGS
/// may be inserted before \p I in \p MBB.
///
/// The forward scan starts at \p I and runs for at most four iterations: an
/// EFLAGS use means "unsafe", an EFLAGS def or clobbering register mask means
/// "safe". If the scan reaches the end of the block, the answer is "safe"
/// unless a successor has EFLAGS live-in. Otherwise up to four instructions
/// before \p I are scanned backwards. If nothing conclusive is found the
/// result is false.
bool
X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I) const {
  MachineBasicBlock::iterator E = MBB.end();

  // For compile time consideration, if we are not able to determine the
  // safety after visiting 4 instructions in each direction, we will assume
  // it's not safe.
  MachineBasicBlock::iterator Iter = I;
  for (unsigned i = 0; Iter != E && i < 4; ++i) {
    bool SeenDef = false;
    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
      MachineOperand &MO = Iter->getOperand(j);
      if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
        SeenDef = true;
      if (!MO.isReg())
        continue;
      if (MO.getReg() == X86::EFLAGS) {
        // A use takes priority over any def seen on the same instruction.
        if (MO.isUse())
          return false;
        SeenDef = true;
      }
    }

    if (SeenDef)
      // This instruction defines EFLAGS, no need to look any further.
      return true;
    ++Iter;
    // Skip over DBG_VALUE.
    while (Iter != E && Iter->isDebugValue())
      ++Iter;
  }

  // It is safe to clobber EFLAGS at the end of a block if no successor has it
  // live in.
  if (Iter == E) {
    for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
           SE = MBB.succ_end(); SI != SE; ++SI)
      if ((*SI)->isLiveIn(X86::EFLAGS))
        return false;
    return true;
  }

  // Forward scan was inconclusive; look backwards from I.
  MachineBasicBlock::iterator B = MBB.begin();
  Iter = I;
  for (unsigned i = 0; i < 4; ++i) {
    // If we make it to the beginning of the block, it's safe to clobber
    // EFLAGS iff EFLAGS is not live-in.
    if (Iter == B)
      return !MBB.isLiveIn(X86::EFLAGS);

    --Iter;
    // Skip over DBG_VALUE.
    while (Iter != B && Iter->isDebugValue())
      --Iter;

    bool SawKill = false;
    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
      MachineOperand &MO = Iter->getOperand(j);
      // A register mask may clobber EFLAGS, but we should still look for a
      // live EFLAGS def.
      if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
        SawKill = true;
      if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
        // An earlier def decides the answer: safe only if that def is dead.
        if (MO.isDef()) return MO.isDead();
        if (MO.isKill()) SawKill = true;
      }
    }

    if (SawKill)
      // This instruction kills EFLAGS and doesn't redefine it, so
      // there's no need to look further.
      return true;
  }

  // Conservative answer.
  return false;
}

/// reMaterialize - Insert a rematerialized copy of \p Orig before \p I in
/// \p MBB, then replace the register of Orig's operand 0 with \p DestReg /
/// \p SubIdx in the inserted instruction.
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I,
                                 unsigned DestReg, unsigned SubIdx,
                                 const MachineInstr *Orig,
                                 const TargetRegisterInfo &TRI) const {
  // MOV32r0 is implemented with a xor which clobbers condition code.
  // Re-materialize it as movri instructions to avoid side effects.
  unsigned Opc = Orig->getOpcode();
  if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) {
    DebugLoc DL = Orig->getDebugLoc();
    BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0))
      .addImm(0);
  } else {
    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
    MBB.insert(I, MI);
  }

  // The instruction just before I is the one inserted above.
  MachineInstr *NewMI = std::prev(I);
  NewMI->substituteRegister(Orig->getOperand(0).getReg(), DestReg, SubIdx, TRI);
}

/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
/// is not marked dead.
static bool hasLiveCondCodeDef(MachineInstr *MI) { for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS && !MO.isDead()) { return true; } } return false; } /// getTruncatedShiftCount - check whether the shift count for a machine operand /// is non-zero. inline static unsigned getTruncatedShiftCount(MachineInstr *MI, unsigned ShiftAmtOperandIdx) { // The shift count is six bits with the REX.W prefix and five bits without. unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31; unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm(); return Imm & ShiftCountMask; } /// isTruncatedShiftCountForLEA - check whether the given shift count is appropriate /// can be represented by a LEA instruction. inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) { // Left shift instructions can be transformed into load-effective-address // instructions if we can encode them appropriately. // A LEA instruction utilizes a SIB byte to encode it's scale factor. // The SIB.scale field is two bits wide which means that we can encode any // shift amount less than 4. return ShAmt < 4 && ShAmt > 0; } bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, unsigned Opc, bool AllowSP, unsigned &NewSrc, bool &isKill, bool &isUndef, MachineOperand &ImplicitOp) const { MachineFunction &MF = *MI->getParent()->getParent(); const TargetRegisterClass *RC; if (AllowSP) { RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass; } else { RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; } unsigned SrcReg = Src.getReg(); // For both LEA64 and LEA32 the register already has essentially the right // type (32-bit or 64-bit) we may just need to forbid SP. 
if (Opc != X86::LEA64_32r) { NewSrc = SrcReg; isKill = Src.isKill(); isUndef = Src.isUndef(); if (TargetRegisterInfo::isVirtualRegister(NewSrc) && !MF.getRegInfo().constrainRegClass(NewSrc, RC)) return false; return true; } // This is for an LEA64_32r and incoming registers are 32-bit. One way or // another we need to add 64-bit registers to the final MI. if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) { ImplicitOp = Src; ImplicitOp.setImplicit(); NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64); MachineBasicBlock::LivenessQueryResult LQR = MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI); switch (LQR) { case MachineBasicBlock::LQR_Unknown: // We can't give sane liveness flags to the instruction, abandon LEA // formation. return false; case MachineBasicBlock::LQR_Live: isKill = MI->killsRegister(SrcReg); isUndef = false; break; default: // The physreg itself is dead, so we have to use it as an . isKill = false; isUndef = true; break; } } else { // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. NewSrc = MF.getRegInfo().createVirtualRegister(RC); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(TargetOpcode::COPY)) .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) .addOperand(Src); // Which is obviously going to be dead after we're done with it. isKill = true; isUndef = false; } // We've set all the parameters without issue. return true; } /// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when /// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting /// to a 32-bit superregister and then truncating back down to a 16-bit /// subregister. 
MachineInstr *
X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
                                           MachineFunction::iterator &MFI,
                                           MachineBasicBlock::iterator &MBBI,
                                           LiveVariables *LV) const {
  MachineInstr *MI = MBBI;
  unsigned Dest = MI->getOperand(0).getReg();
  unsigned Src = MI->getOperand(1).getReg();
  bool isDead = MI->getOperand(0).isDead();
  bool isKill = MI->getOperand(1).isKill();

  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
  unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
  unsigned Opc, leaInReg;
  if (Subtarget.is64Bit()) {
    Opc = X86::LEA64_32r;
    leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
  } else {
    Opc = X86::LEA32r;
    leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
  }

  // Build and insert into an implicit UNDEF value. This is OK because
  // we'll be shifting and then extracting the lower 16-bits.
  // This has the potential to cause partial register stall. e.g.
  //   movw    (%rbp,%rcx,2), %dx
  //   leal    -65(%rdx), %esi
  // But testing has shown this *does* help performance in 64-bit mode (at
  // least on modern x86 machines).
  BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
  MachineInstr *InsMI =
    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
    .addReg(leaInReg, RegState::Define, X86::sub_16bit)
    .addReg(Src, getKillRegState(isKill));

  MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
                                    get(Opc), leaOutReg);
  switch (MIOpc) {
  default: llvm_unreachable("Unreachable!");
  case X86::SHL16ri: {
    // Use the same truncated count that convertToThreeAddress validated; the
    // raw immediate may carry high bits, which would yield a bogus scale (and
    // an out-of-range shift) below.
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
    assert(isTruncatedShiftCountForLEA(ShAmt) &&
           "Shift count cannot be encoded as an LEA scale!");
    MIB.addReg(0).addImm(1 << ShAmt)
       .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
    break;
  }
  case X86::INC16r:
    addRegOffset(MIB, leaInReg, true, 1);
    break;
  case X86::DEC16r:
    addRegOffset(MIB, leaInReg, true, -1);
    break;
  case X86::ADD16ri:
  case X86::ADD16ri8:
  case X86::ADD16ri_DB:
  case X86::ADD16ri8_DB:
    addRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
    break;
  case X86::ADD16rr:
  case X86::ADD16rr_DB: {
    unsigned Src2 = MI->getOperand(2).getReg();
    bool isKill2 = MI->getOperand(2).isKill();
    unsigned leaInReg2 = 0;
    MachineInstr *InsMI2 = nullptr;
    if (Src == Src2) {
      // ADD16rr %reg1028, %reg1028
      // just a single insert_subreg.
      addRegReg(MIB, leaInReg, true, leaInReg, false);
    } else {
      if (Subtarget.is64Bit())
        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
      else
        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
      // Build and insert into an implicit UNDEF value. This is OK because
      // we'll be shifting and then extracting the lower 16-bits.
      BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2);
      InsMI2 =
        BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(TargetOpcode::COPY))
        .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
        .addReg(Src2, getKillRegState(isKill2));
      addRegReg(MIB, leaInReg, true, leaInReg2, true);
    }
    if (LV && isKill2 && InsMI2)
      LV->replaceKillInstruction(Src2, MI, InsMI2);
    break;
  }
  }

  // Copy the low 16 bits of the LEA result into the original destination.
  MachineInstr *NewMI = MIB;
  MachineInstr *ExtMI =
    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(TargetOpcode::COPY))
    .addReg(Dest, RegState::Define | getDeadRegState(isDead))
    .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);

  if (LV) {
    // Update live variables
    LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
    LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
    if (isKill)
      LV->replaceKillInstruction(Src, MI, InsMI);
    if (isDead)
      LV->replaceKillInstruction(Dest, MI, ExtMI);
  }

  return ExtMI;
}

/// convertToThreeAddress - This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
/// may be able to convert a two-address instruction into a true
/// three-address instruction on demand.  This allows the X86 target (for
/// example) to convert ADD and SHL instructions into LEA instructions if they
/// would require register copies due to two-addressness.
///
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction.
///
MachineInstr *
X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                                    MachineBasicBlock::iterator &MBBI,
                                    LiveVariables *LV) const {
  MachineInstr *MI = MBBI;

  // The following opcodes also set the condition code register(s). Only
  // convert them to equivalent lea if the condition code register def's
  // are dead!
  if (hasLiveCondCodeDef(MI))
    return nullptr;

  MachineFunction &MF = *MI->getParent()->getParent();
  // All instructions input are two-addr instructions.  Get the known operands.
  const MachineOperand &Dest = MI->getOperand(0);
  const MachineOperand &Src = MI->getOperand(1);

  MachineInstr *NewMI = nullptr;
  // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
  // we have better subtarget support, enable the 16-bit LEA generation here.
  // 16-bit LEA is also slow on Core2.
  bool DisableLEA16 = true;
  bool is64Bit = Subtarget.is64Bit();

  unsigned MIOpc = MI->getOpcode();
  switch (MIOpc) {
  default: return nullptr;
  case X86::SHL64ri: {
    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;

    // LEA can't handle RSP.
    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
        !MF.getRegInfo().constrainRegClass(Src.getReg(),
                                           &X86::GR64_NOSPRegClass))
      return nullptr;

    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
      .addOperand(Dest)
      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
    break;
  }
  case X86::SHL32ri: {
    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;

    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;

    // LEA can't handle ESP.
    bool isKill, isUndef;
    unsigned SrcReg;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
                        SrcReg, isKill, isUndef, ImplicitOp))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
      .addOperand(Dest)
      .addReg(0).addImm(1 << ShAmt)
      .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
      .addImm(0).addReg(0);
    if (ImplicitOp.getReg() != 0)
      MIB.addOperand(ImplicitOp);
    NewMI = MIB;

    break;
  }
  case X86::SHL16ri: {
    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;

    if (DisableLEA16)
      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr;
    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
      .addOperand(Dest)
      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
    break;
  }
  case X86::INC64r:
  case X86::INC32r: {
    assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
    unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
      : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
    bool isKill, isUndef;
    unsigned SrcReg;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
                        SrcReg, isKill, isUndef, ImplicitOp))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
        .addOperand(Dest)
        .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
    if (ImplicitOp.getReg() != 0)
      MIB.addOperand(ImplicitOp);

    NewMI = addOffset(MIB, 1);
    break;
  }
  case X86::INC16r:
    if (DisableLEA16)
      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
                     : nullptr;
    assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
    NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                      .addOperand(Dest).addOperand(Src), 1);
    break;
  case X86::DEC64r:
  case X86::DEC32r: {
    assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
    unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
      : (is64Bit ? X86::LEA64_32r : X86::LEA32r);

    bool isKill, isUndef;
    unsigned SrcReg;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
                        SrcReg, isKill, isUndef, ImplicitOp))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
        .addOperand(Dest)
        .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
    if (ImplicitOp.getReg() != 0)
      MIB.addOperand(ImplicitOp);

    NewMI = addOffset(MIB, -1);

    break;
  }
  case X86::DEC16r:
    if (DisableLEA16)
      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
                     : nullptr;
    assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
    NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                      .addOperand(Dest).addOperand(Src), -1);
    break;
  case X86::ADD64rr:
  case X86::ADD64rr_DB:
  case X86::ADD32rr:
  case X86::ADD32rr_DB: {
    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Opc;
    if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
      Opc = X86::LEA64r;
    else
      Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;

    // The first source is classified with AllowSP=true, the second with
    // AllowSP=false.
    bool isKill, isUndef;
    unsigned SrcReg;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
                        SrcReg, isKill, isUndef, ImplicitOp))
      return nullptr;

    const MachineOperand &Src2 = MI->getOperand(2);
    bool isKill2, isUndef2;
    unsigned SrcReg2;
    MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
                        SrcReg2, isKill2, isUndef2, ImplicitOp2))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
      .addOperand(Dest);
    if (ImplicitOp.getReg() != 0)
      MIB.addOperand(ImplicitOp);
    if (ImplicitOp2.getReg() != 0)
      MIB.addOperand(ImplicitOp2);

    NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);

    // Preserve undefness of the operands.
    NewMI->getOperand(1).setIsUndef(isUndef);
    NewMI->getOperand(3).setIsUndef(isUndef2);

    if (LV && Src2.isKill())
      LV->replaceKillInstruction(SrcReg2, MI, NewMI);
    break;
  }
  case X86::ADD16rr:
  case X86::ADD16rr_DB: {
    if (DisableLEA16)
      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
                     : nullptr;
    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Src2 = MI->getOperand(2).getReg();
    bool isKill2 = MI->getOperand(2).isKill();
    NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                      .addOperand(Dest),
                      Src.getReg(), Src.isKill(), Src2, isKill2);

    // Preserve undefness of the operands.
    bool isUndef = MI->getOperand(1).isUndef();
    bool isUndef2 = MI->getOperand(2).isUndef();
    NewMI->getOperand(1).setIsUndef(isUndef);
    NewMI->getOperand(3).setIsUndef(isUndef2);

    if (LV && isKill2)
      LV->replaceKillInstruction(Src2, MI, NewMI);
    break;
  }
  case X86::ADD64ri32:
  case X86::ADD64ri8:
  case X86::ADD64ri32_DB:
  case X86::ADD64ri8_DB:
    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
    NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
                      .addOperand(Dest).addOperand(Src),
                      MI->getOperand(2).getImm());
    break;
  case X86::ADD32ri:
  case X86::ADD32ri8:
  case X86::ADD32ri_DB:
  case X86::ADD32ri8_DB: {
    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;

    bool isKill, isUndef;
    unsigned SrcReg;
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
                        SrcReg, isKill, isUndef, ImplicitOp))
      return nullptr;

    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
        .addOperand(Dest)
        .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
    if (ImplicitOp.getReg() != 0)
      MIB.addOperand(ImplicitOp);

    NewMI = addOffset(MIB, MI->getOperand(2).getImm());
    break;
  }
  case X86::ADD16ri:
  case X86::ADD16ri8:
  case X86::ADD16ri_DB:
  case X86::ADD16ri8_DB:
    if (DisableLEA16)
      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
                     : nullptr;
    assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
    NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                      .addOperand(Dest).addOperand(Src),
                      MI->getOperand(2).getImm());
    break;
  }

  if (!NewMI) return nullptr;

  if (LV) {  // Update live variables
    if (Src.isKill())
      LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
    if (Dest.isDead())
      LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
  }

  MFI->insert(MBBI, NewMI);          // Insert the new inst
  return NewMI;
}

/// commuteInstruction - We have a few instructions that must be hacked on to
/// commute them.
///
/// Three opcode families are adjusted before delegating to
/// TargetInstrInfo::commuteInstruction:
///  - SHLD/SHRD with an immediate: the opcode is swapped with its
///    counterpart and the immediate I becomes (Size - I).
///  - BLEND/PBLEND with an immediate: the used low bits of the immediate
///    are inverted.
///  - CMOVcc register forms: the opcode is replaced by the CMOV of the
///    opposite condition.
/// When NewMI is true, MI is cloned first and the clone is modified.
MachineInstr *
X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
  switch (MI->getOpcode()) {
  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
    unsigned Opc;
    unsigned Size;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("Unreachable!");
    case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
    case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
    case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
    case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
    case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
    case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
    }
    unsigned Amt = MI->getOperand(3).getImm();
    if (NewMI) {
      // Work on a clone; NewMI is cleared so the generic code below does not
      // clone a second time.
      MachineFunction &MF = *MI->getParent()->getParent();
      MI = MF.CloneMachineInstr(MI);
      NewMI = false;
    }
    MI->setDesc(get(Opc));
    MI->getOperand(3).setImm(Size-Amt);
    return TargetInstrInfo::commuteInstruction(MI, NewMI);
  }
  case X86::BLENDPDrri:
  case X86::BLENDPSrri:
  case X86::PBLENDWrri:
  case X86::VBLENDPDrri:
  case X86::VBLENDPSrri:
  case X86::VBLENDPDYrri:
  case X86::VBLENDPSYrri:
  case X86::VPBLENDDrri:
  case X86::VPBLENDWrri:
  case X86::VPBLENDDYrri:
  case X86::VPBLENDWYrri:{
    // Mask selects the immediate bits that are meaningful for each opcode.
    unsigned Mask;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("Unreachable!");
    case X86::BLENDPDrri:    Mask = 0x03; break;
    case X86::BLENDPSrri:    Mask = 0x0F; break;
    case X86::PBLENDWrri:    Mask = 0xFF; break;
    case X86::VBLENDPDrri:   Mask = 0x03; break;
    case X86::VBLENDPSrri:   Mask = 0x0F; break;
    case X86::VBLENDPDYrri:  Mask = 0x0F; break;
    case X86::VBLENDPSYrri:  Mask = 0xFF; break;
    case X86::VPBLENDDrri:   Mask = 0x0F; break;
    case X86::VPBLENDWrri:   Mask = 0xFF; break;
    case X86::VPBLENDDYrri:  Mask = 0xFF; break;
    case X86::VPBLENDWYrri:  Mask = 0xFF; break;
    }
    // Only the least significant bits of Imm are used.
    unsigned Imm = MI->getOperand(3).getImm() & Mask;
    if (NewMI) {
      MachineFunction &MF = *MI->getParent()->getParent();
      MI = MF.CloneMachineInstr(MI);
      NewMI = false;
    }
    // Flip every used bit of the immediate.
    MI->getOperand(3).setImm(Mask ^ Imm);
    return TargetInstrInfo::commuteInstruction(MI, NewMI);
  }
  case X86::CMOVB16rr:  case X86::CMOVB32rr:  case X86::CMOVB64rr:
  case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
  case X86::CMOVE16rr:  case X86::CMOVE32rr:  case X86::CMOVE64rr:
  case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
  case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
  case X86::CMOVA16rr:  case X86::CMOVA32rr:  case X86::CMOVA64rr:
  case X86::CMOVL16rr:  case X86::CMOVL32rr:  case X86::CMOVL64rr:
  case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
  case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
  case X86::CMOVG16rr:  case X86::CMOVG32rr:  case X86::CMOVG64rr:
  case X86::CMOVS16rr:  case X86::CMOVS32rr:  case X86::CMOVS64rr:
  case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
  case X86::CMOVP16rr:  case X86::CMOVP32rr:  case X86::CMOVP64rr:
  case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
  case X86::CMOVO16rr:  case X86::CMOVO32rr:  case X86::CMOVO64rr:
  case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
    // Map each CMOV to the CMOV of the opposite condition, same width.
    unsigned Opc;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("Unreachable!");
    case X86::CMOVB16rr:  Opc = X86::CMOVAE16rr; break;
    case X86::CMOVB32rr:  Opc = X86::CMOVAE32rr; break;
    case X86::CMOVB64rr:  Opc = X86::CMOVAE64rr; break;
    case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
    case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
    case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
    case X86::CMOVE16rr:  Opc = X86::CMOVNE16rr; break;
    case X86::CMOVE32rr:  Opc = X86::CMOVNE32rr; break;
    case X86::CMOVE64rr:  Opc = X86::CMOVNE64rr; break;
    case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
    case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
    case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
    case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
    case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
    case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
    case X86::CMOVA16rr:  Opc = X86::CMOVBE16rr; break;
    case X86::CMOVA32rr:  Opc = X86::CMOVBE32rr; break;
    case X86::CMOVA64rr:  Opc = X86::CMOVBE64rr; break;
    case X86::CMOVL16rr:  Opc = X86::CMOVGE16rr; break;
    case X86::CMOVL32rr:  Opc = X86::CMOVGE32rr; break;
    case X86::CMOVL64rr:  Opc = X86::CMOVGE64rr; break;
    case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
    case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
    case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
    case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
    case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
    case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
    case X86::CMOVG16rr:  Opc = X86::CMOVLE16rr; break;
    case X86::CMOVG32rr:  Opc = X86::CMOVLE32rr; break;
    case X86::CMOVG64rr:  Opc = X86::CMOVLE64rr; break;
    case X86::CMOVS16rr:  Opc = X86::CMOVNS16rr; break;
    case X86::CMOVS32rr:  Opc = X86::CMOVNS32rr; break;
    case X86::CMOVS64rr:  Opc = X86::CMOVNS64rr; break;
    case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
    case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
    case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
    case X86::CMOVP16rr:  Opc = X86::CMOVNP16rr; break;
    case X86::CMOVP32rr:  Opc = X86::CMOVNP32rr; break;
    case X86::CMOVP64rr:  Opc = X86::CMOVNP64rr; break;
    case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
    case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
    case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
    case X86::CMOVO16rr:  Opc = X86::CMOVNO16rr; break;
    case X86::CMOVO32rr:  Opc = X86::CMOVNO32rr; break;
    case X86::CMOVO64rr:  Opc = X86::CMOVNO64rr; break;
    case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
    case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
    case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
    }
    if (NewMI) {
      MachineFunction &MF = *MI->getParent()->getParent();
      MI = MF.CloneMachineInstr(MI);
      NewMI = false;
    }
    MI->setDesc(get(Opc));
    // Fallthrough intended.
  }
  default:
    return TargetInstrInfo::commuteInstruction(MI, NewMI);
  }
}

/// findCommutedOpIndices - Report which two source operand indices of MI are
/// exchanged when it is commuted. The BLEND opcodes listed below use operands
/// 1 and 2, the listed 231-form FMA opcodes use operands 2 and 3, and all
/// other opcodes defer to TargetInstrInfo::findCommutedOpIndices.
bool X86InstrInfo::findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
                                         unsigned &SrcOpIdx2) const {
  switch (MI->getOpcode()) {
    case X86::BLENDPDrri:
    case X86::BLENDPSrri:
    case X86::PBLENDWrri:
    case X86::VBLENDPDrri:
    case X86::VBLENDPSrri:
    case X86::VBLENDPDYrri:
    case X86::VBLENDPSYrri:
    case X86::VPBLENDDrri:
    case X86::VPBLENDDYrri:
    case X86::VPBLENDWrri:
    case X86::VPBLENDWYrri:
      SrcOpIdx1 = 1;
      SrcOpIdx2 = 2;
      return true;
    case X86::VFMADDPDr231r:
    case X86::VFMADDPSr231r:
    case X86::VFMADDSDr231r:
    case X86::VFMADDSSr231r:
    case X86::VFMSUBPDr231r:
    case X86::VFMSUBPSr231r:
    case X86::VFMSUBSDr231r:
    case X86::VFMSUBSSr231r:
    case X86::VFNMADDPDr231r:
    case X86::VFNMADDPSr231r:
    case X86::VFNMADDSDr231r:
    case X86::VFNMADDSSr231r:
    case X86::VFNMSUBPDr231r:
    case X86::VFNMSUBPSr231r:
    case X86::VFNMSUBSDr231r:
    case X86::VFNMSUBSSr231r:
    case X86::VFMADDPDr231rY:
    case X86::VFMADDPSr231rY:
    case X86::VFMSUBPDr231rY:
    case X86::VFMSUBPSr231rY:
    case X86::VFNMADDPDr231rY:
    case X86::VFNMADDPSr231rY:
    case X86::VFNMSUBPDr231rY:
    case X86::VFNMSUBPSr231rY:
      SrcOpIdx1 = 2;
      SrcOpIdx2 = 3;
      return true;
    default:
      return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
  }
}

/// getCondFromBranchOpc - Return the condition code tested by the conditional
/// branch opcode BrOpc, or COND_INVALID if BrOpc is not one of the branch
/// opcodes listed below.
static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
  switch (BrOpc) {
  default: return X86::COND_INVALID;
  case X86::JE_1:  return X86::COND_E;
  case X86::JNE_1: return X86::COND_NE;
  case X86::JL_1:  return X86::COND_L;
  case X86::JLE_1: return X86::COND_LE;
  case X86::JG_1:  return X86::COND_G;
  case X86::JGE_1: return X86::COND_GE;
  case X86::JB_1:  return X86::COND_B;
  case X86::JBE_1: return X86::COND_BE;
  case X86::JA_1:  return X86::COND_A;
  case X86::JAE_1: return X86::COND_AE;
  case X86::JS_1:  return X86::COND_S;
  case X86::JNS_1: return X86::COND_NS;
  case X86::JP_1:  return X86::COND_P;
  case X86::JNP_1: return X86::COND_NP;
  case X86::JO_1:  return X86::COND_O;
  case X86::JNO_1: return X86::COND_NO;
  }
}

/// getCondFromSETOpc - return condition code of a SET opcode.
/// Both the register (r) and memory (m) forms are recognised; any other
/// opcode yields COND_INVALID.
static X86::CondCode getCondFromSETOpc(unsigned Opc) {
  switch (Opc) {
  default: return X86::COND_INVALID;
  case X86::SETAr:  case X86::SETAm:  return X86::COND_A;
  case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
  case X86::SETBr:  case X86::SETBm:  return X86::COND_B;
  case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
  case X86::SETEr:  case X86::SETEm:  return X86::COND_E;
  case X86::SETGr:  case X86::SETGm:  return X86::COND_G;
  case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
  case X86::SETLr:  case X86::SETLm:  return X86::COND_L;
  case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
  case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
  case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
  case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
  case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
  case X86::SETOr:  case X86::SETOm:  return X86::COND_O;
  case X86::SETPr:  case X86::SETPm:  return X86::COND_P;
  case X86::SETSr:  case X86::SETSm:  return X86::COND_S;
  }
}

/// getCondFromCmovOpc - return condition code of a CMov opcode.
/// Every condition is matched in its 16-, 32- and 64-bit widths and in both
/// the register (rr) and memory (rm) forms; any other opcode yields
/// COND_INVALID.
X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
  switch (Opc) {
  default: return X86::COND_INVALID;
  case X86::CMOVA16rm:  case X86::CMOVA16rr:  case X86::CMOVA32rm:
  case X86::CMOVA32rr:  case X86::CMOVA64rm:  case X86::CMOVA64rr:
    return X86::COND_A;
  case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
  case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
    return X86::COND_AE;
  case X86::CMOVB16rm:  case X86::CMOVB16rr:  case X86::CMOVB32rm:
  case X86::CMOVB32rr:  case X86::CMOVB64rm:  case X86::CMOVB64rr:
    return X86::COND_B;
  case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
  case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
    return X86::COND_BE;
  case X86::CMOVE16rm:  case X86::CMOVE16rr:  case X86::CMOVE32rm:
  case X86::CMOVE32rr:  case X86::CMOVE64rm:  case X86::CMOVE64rr:
    return X86::COND_E;
  case X86::CMOVG16rm:  case X86::CMOVG16rr:  case X86::CMOVG32rm:
  case X86::CMOVG32rr:  case X86::CMOVG64rm:  case X86::CMOVG64rr:
    return X86::COND_G;
  case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
  case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
    return X86::COND_GE;
  case X86::CMOVL16rm:  case X86::CMOVL16rr:  case X86::CMOVL32rm:
  case X86::CMOVL32rr:  case X86::CMOVL64rm:  case X86::CMOVL64rr:
    return X86::COND_L;
  case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
  case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
    return X86::COND_LE;
  case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
  case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
    return X86::COND_NE;
  case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
  case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
    return X86::COND_NO;
  case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
  case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
    return X86::COND_NP;
  case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
  case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
    return X86::COND_NS;
  case X86::CMOVO16rm:  case X86::CMOVO16rr:  case X86::CMOVO32rm:
  case X86::CMOVO32rr:  case X86::CMOVO64rm:  case X86::CMOVO64rr:
    return X86::COND_O;
  case X86::CMOVP16rm:  case X86::CMOVP16rr:  case X86::CMOVP32rm:
  case X86::CMOVP32rr:  case X86::CMOVP64rm:  case X86::CMOVP64rr:
    return X86::COND_P;
  case X86::CMOVS16rm:  case X86::CMOVS16rr:  case X86::CMOVS32rm:
  case X86::CMOVS32rr:  case X86::CMOVS64rm:  case X86::CMOVS64rr:
    return X86::COND_S;
  }
}

/// GetCondBranchFromCond - Return the conditional branch opcode that tests
/// the condition code CC. Only the sixteen codes listed below are accepted;
/// any other value hits llvm_unreachable.
unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Illegal condition code!");
  case X86::COND_E:  return X86::JE_1;
  case X86::COND_NE: return X86::JNE_1;
  case X86::COND_L:  return X86::JL_1;
  case X86::COND_LE: return X86::JLE_1;
  case X86::COND_G:  return X86::JG_1;
  case X86::COND_GE: return X86::JGE_1;
  case X86::COND_B:  return X86::JB_1;
  case X86::COND_BE: return X86::JBE_1;
  case X86::COND_A:  return X86::JA_1;
  case X86::COND_AE: return X86::JAE_1;
  case X86::COND_S:  return X86::JS_1;
  case X86::COND_NS: return X86::JNS_1;
  case X86::COND_P:  return X86::JP_1;
  case X86::COND_NP: return X86::JNP_1;
  case X86::COND_O:  return X86::JO_1;
  case X86::COND_NO: return X86::JNO_1;
  }
}

/// GetOppositeBranchCondition - Return the inverse of the specified condition,
/// e.g. turning COND_E to COND_NE.
/// Only the sixteen condition codes listed below are accepted; any other
/// value hits llvm_unreachable.
X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Illegal condition code!");
  case X86::COND_E:  return X86::COND_NE;
  case X86::COND_NE: return X86::COND_E;
  case X86::COND_L:  return X86::COND_GE;
  case X86::COND_LE: return X86::COND_G;
  case X86::COND_G:  return X86::COND_LE;
  case X86::COND_GE: return X86::COND_L;
  case X86::COND_B:  return X86::COND_AE;
  case X86::COND_BE: return X86::COND_A;
  case X86::COND_A:  return X86::COND_BE;
  case X86::COND_AE: return X86::COND_B;
  case X86::COND_S:  return X86::COND_NS;
  case X86::COND_NS: return X86::COND_S;
  case X86::COND_P:  return X86::COND_NP;
  case X86::COND_NP: return X86::COND_P;
  case X86::COND_O:  return X86::COND_NO;
  case X86::COND_NO: return X86::COND_O;
  }
}

/// getSwappedCondition - assume the flags are set by MI(a,b), return
/// the condition code if we modify the instructions such that flags are
/// set by MI(b,a).
///
/// Only the equality and ordering conditions listed below are handled; every
/// other condition code returns COND_INVALID.
static X86::CondCode getSwappedCondition(X86::CondCode CC) {
  switch (CC) {
  default: return X86::COND_INVALID;
  // Equality tests are unaffected by swapping the operands.
  case X86::COND_E:  return X86::COND_E;
  case X86::COND_NE: return X86::COND_NE;
  case X86::COND_L:  return X86::COND_G;
  case X86::COND_LE: return X86::COND_GE;
  case X86::COND_G:  return X86::COND_L;
  case X86::COND_GE: return X86::COND_LE;
  case X86::COND_B:  return X86::COND_A;
  case X86::COND_BE: return X86::COND_AE;
  case X86::COND_A:  return X86::COND_B;
  case X86::COND_AE: return X86::COND_BE;
  }
}

/// getSETFromCond - Return a set opcode for the given condition and
/// whether it has memory operand.
unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) { static const uint16_t Opc[16][2] = { { X86::SETAr, X86::SETAm }, { X86::SETAEr, X86::SETAEm }, { X86::SETBr, X86::SETBm }, { X86::SETBEr, X86::SETBEm }, { X86::SETEr, X86::SETEm }, { X86::SETGr, X86::SETGm }, { X86::SETGEr, X86::SETGEm }, { X86::SETLr, X86::SETLm }, { X86::SETLEr, X86::SETLEm }, { X86::SETNEr, X86::SETNEm }, { X86::SETNOr, X86::SETNOm }, { X86::SETNPr, X86::SETNPm }, { X86::SETNSr, X86::SETNSm }, { X86::SETOr, X86::SETOm }, { X86::SETPr, X86::SETPm }, { X86::SETSr, X86::SETSm } }; assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes"); return Opc[CC][HasMemoryOperand ? 1 : 0]; } /// getCMovFromCond - Return a cmov opcode for the given condition, /// register size in bytes, and operand type. unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes, bool HasMemoryOperand) { static const uint16_t Opc[32][3] = { { X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr }, { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr }, { X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr }, { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr }, { X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr }, { X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr }, { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr }, { X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr }, { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr }, { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr }, { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr }, { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr }, { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr }, { X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr }, { X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr }, { X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr }, { X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm }, { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm }, { X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm }, { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm }, { 
X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm }, { X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm }, { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm }, { X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm }, { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm }, { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm }, { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm }, { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm }, { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm }, { X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm }, { X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm }, { X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm } }; assert(CC < 16 && "Can only handle standard cond codes"); unsigned Idx = HasMemoryOperand ? 16+CC : CC; switch(RegBytes) { default: llvm_unreachable("Illegal register size!"); case 2: return Opc[Idx][0]; case 4: return Opc[Idx][1]; case 8: return Opc[Idx][2]; } } bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const { if (!MI->isTerminator()) return false; // Conditional branch is a special case. if (MI->isBranch() && !MI->isBarrier()) return true; if (!MI->isPredicable()) return true; return !isPredicated(MI); } bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool AllowModify) const { // Start from the bottom of the block and work up, examining the // terminator instructions. MachineBasicBlock::iterator I = MBB.end(); MachineBasicBlock::iterator UnCondBrIter = MBB.end(); while (I != MBB.begin()) { --I; if (I->isDebugValue()) continue; // Working from the bottom, when we see a non-terminator instruction, we're // done. if (!isUnpredicatedTerminator(I)) break; // A terminator that isn't a branch can't easily be handled by this // analysis. if (!I->isBranch()) return true; // Handle unconditional branches. 
if (I->getOpcode() == X86::JMP_1) { UnCondBrIter = I; if (!AllowModify) { TBB = I->getOperand(0).getMBB(); continue; } // If the block has any instructions after a JMP, delete them. while (std::next(I) != MBB.end()) std::next(I)->eraseFromParent(); Cond.clear(); FBB = nullptr; // Delete the JMP if it's equivalent to a fall-through. if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) { TBB = nullptr; I->eraseFromParent(); I = MBB.end(); UnCondBrIter = MBB.end(); continue; } // TBB is used to indicate the unconditional destination. TBB = I->getOperand(0).getMBB(); continue; } // Handle conditional branches. X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode()); if (BranchCode == X86::COND_INVALID) return true; // Can't handle indirect branch. // Working from the bottom, handle the first conditional branch. if (Cond.empty()) { MachineBasicBlock *TargetBB = I->getOperand(0).getMBB(); if (AllowModify && UnCondBrIter != MBB.end() && MBB.isLayoutSuccessor(TargetBB)) { // If we can modify the code and it ends in something like: // // jCC L1 // jmp L2 // L1: // ... // L2: // // Then we can change this to: // // jnCC L2 // L1: // ... // L2: // // Which is a bit more efficient. // We conditionally jump to the fall-through block. BranchCode = GetOppositeBranchCondition(BranchCode); unsigned JNCC = GetCondBranchFromCond(BranchCode); MachineBasicBlock::iterator OldInst = I; BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC)) .addMBB(UnCondBrIter->getOperand(0).getMBB()); BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1)) .addMBB(TargetBB); OldInst->eraseFromParent(); UnCondBrIter->eraseFromParent(); // Restart the analysis. UnCondBrIter = MBB.end(); I = MBB.end(); continue; } FBB = TBB; TBB = I->getOperand(0).getMBB(); Cond.push_back(MachineOperand::CreateImm(BranchCode)); continue; } // Handle subsequent conditional branches. 
Only handle the case where all // conditional branches branch to the same destination and their condition // opcodes fit one of the special multi-branch idioms. assert(Cond.size() == 1); assert(TBB); // Only handle the case where all conditional branches branch to the same // destination. if (TBB != I->getOperand(0).getMBB()) return true; // If the conditions are the same, we can leave them alone. X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm(); if (OldBranchCode == BranchCode) continue; // If they differ, see if they fit one of the known patterns. Theoretically, // we could handle more patterns here, but we shouldn't expect to see them // if instruction selection has done a reasonable job. if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_E) || (OldBranchCode == X86::COND_E && BranchCode == X86::COND_NP)) BranchCode = X86::COND_NP_OR_E; else if ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) || (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P)) BranchCode = X86::COND_NE_OR_P; else return true; // Update the MachineOperand. Cond[0].setImm(BranchCode); } return false; } unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineBasicBlock::iterator I = MBB.end(); unsigned Count = 0; while (I != MBB.begin()) { --I; if (I->isDebugValue()) continue; if (I->getOpcode() != X86::JMP_1 && getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID) break; // Remove the branch. I->eraseFromParent(); I = MBB.end(); ++Count; } return Count; } unsigned X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl &Cond, DebugLoc DL) const { // Shouldn't be a fall through. assert(TBB && "InsertBranch must not be told to insert a fallthrough"); assert((Cond.size() == 1 || Cond.size() == 0) && "X86 branch conditions have one component!"); if (Cond.empty()) { // Unconditional branch? 
assert(!FBB && "Unconditional branch with multiple successors!"); BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB); return 1; } // Conditional branch. unsigned Count = 0; X86::CondCode CC = (X86::CondCode)Cond[0].getImm(); switch (CC) { case X86::COND_NP_OR_E: // Synthesize NP_OR_E with two branches. BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB); ++Count; BuildMI(&MBB, DL, get(X86::JE_1)).addMBB(TBB); ++Count; break; case X86::COND_NE_OR_P: // Synthesize NE_OR_P with two branches. BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB); ++Count; BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB); ++Count; break; default: { unsigned Opc = GetCondBranchFromCond(CC); BuildMI(&MBB, DL, get(Opc)).addMBB(TBB); ++Count; } } if (FBB) { // Two-way Conditional branch. Insert the second branch. BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB); ++Count; } return Count; } bool X86InstrInfo:: canInsertSelect(const MachineBasicBlock &MBB, const SmallVectorImpl &Cond, unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { // Not all subtargets have cmov instructions. if (!Subtarget.hasCMov()) return false; if (Cond.size() != 1) return false; // We cannot do the composite conditions, at least not in SSA form. if ((X86::CondCode)Cond[0].getImm() > X86::COND_S) return false; // Check register classes. const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); const TargetRegisterClass *RC = RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg)); if (!RC) return false; // We have cmov instructions for 16, 32, and 64 bit general purpose registers. if (X86::GR16RegClass.hasSubClassEq(RC) || X86::GR32RegClass.hasSubClassEq(RC) || X86::GR64RegClass.hasSubClassEq(RC)) { // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy // Bridge. Probably Ivy Bridge as well. CondCycles = 2; TrueCycles = 2; FalseCycles = 2; return true; } // Can't do vectors. 
return false; } void X86InstrInfo::insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DstReg, const SmallVectorImpl &Cond, unsigned TrueReg, unsigned FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); assert(Cond.size() == 1 && "Invalid Cond array"); unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(), MRI.getRegClass(DstReg)->getSize(), false/*HasMemoryOperand*/); BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg); } /// isHReg - Test if the given register is a physical h register. static bool isHReg(unsigned Reg) { return X86::GR8_ABCD_HRegClass.contains(Reg); } // Try and copy between VR128/VR64 and GR64 registers. static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg, const X86Subtarget &Subtarget) { // SrcReg(VR128) -> DestReg(GR64) // SrcReg(VR64) -> DestReg(GR64) // SrcReg(GR64) -> DestReg(VR128) // SrcReg(GR64) -> DestReg(VR64) bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); if (X86::GR64RegClass.contains(DestReg)) { if (X86::VR128XRegClass.contains(SrcReg)) // Copy from a VR128 register to a GR64 register. return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr); if (X86::VR64RegClass.contains(SrcReg)) // Copy from a VR64 register to a GR64 register. return X86::MOVSDto64rr; } else if (X86::GR64RegClass.contains(SrcReg)) { // Copy from a GR64 register to a VR128 register. if (X86::VR128XRegClass.contains(DestReg)) return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr); // Copy from a GR64 register to a VR64 register. if (X86::VR64RegClass.contains(DestReg)) return X86::MOV64toSDrr; } // SrcReg(FR32) -> DestReg(GR32) // SrcReg(GR32) -> DestReg(FR32) if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg)) // Copy from a FR32 register to a GR32 register. return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? 
X86::VMOVSS2DIrr : X86::MOVSS2DIrr); if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg)) // Copy from a GR32 register to a FR32 register. return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr); return 0; } inline static bool MaskRegClassContains(unsigned Reg) { return X86::VK8RegClass.contains(Reg) || X86::VK16RegClass.contains(Reg) || X86::VK32RegClass.contains(Reg) || X86::VK64RegClass.contains(Reg) || X86::VK1RegClass.contains(Reg); } static unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) { if (X86::VR128XRegClass.contains(DestReg, SrcReg) || X86::VR256XRegClass.contains(DestReg, SrcReg) || X86::VR512RegClass.contains(DestReg, SrcReg)) { DestReg = get512BitSuperRegister(DestReg); SrcReg = get512BitSuperRegister(SrcReg); return X86::VMOVAPSZrr; } if (MaskRegClassContains(DestReg) && MaskRegClassContains(SrcReg)) return X86::KMOVWkk; if (MaskRegClassContains(DestReg) && (X86::GR32RegClass.contains(SrcReg) || X86::GR16RegClass.contains(SrcReg) || X86::GR8RegClass.contains(SrcReg))) { SrcReg = getX86SubSuperRegister(SrcReg, MVT::i32); return X86::KMOVWkr; } if ((X86::GR32RegClass.contains(DestReg) || X86::GR16RegClass.contains(DestReg) || X86::GR8RegClass.contains(DestReg)) && MaskRegClassContains(SrcReg)) { DestReg = getX86SubSuperRegister(DestReg, MVT::i32); return X86::KMOVWrk; } return 0; } void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { // First deal with the normal symmetric copies. 
bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); unsigned Opc = 0; if (X86::GR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV64rr; else if (X86::GR32RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV32rr; else if (X86::GR16RegClass.contains(DestReg, SrcReg)) Opc = X86::MOV16rr; else if (X86::GR8RegClass.contains(DestReg, SrcReg)) { // Copying to or from a physical H register on x86-64 requires a NOREX // move. Otherwise use a normal move. if ((isHReg(DestReg) || isHReg(SrcReg)) && Subtarget.is64Bit()) { Opc = X86::MOV8rr_NOREX; // Both operands must be encodable without an REX prefix. assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) && "8-bit H register can not be copied outside GR8_NOREX"); } else Opc = X86::MOV8rr; } else if (X86::VR64RegClass.contains(DestReg, SrcReg)) Opc = X86::MMX_MOVQ64rr; else if (HasAVX512) Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg); else if (X86::VR128RegClass.contains(DestReg, SrcReg)) Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr; else if (X86::VR256RegClass.contains(DestReg, SrcReg)) Opc = X86::VMOVAPSYrr; if (!Opc) Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget); if (Opc) { BuildMI(MBB, MI, DL, get(Opc), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } // Moving EFLAGS to / from another register requires a push and a pop. // Notice that we have to adjust the stack if we don't want to clobber the // first frame index. See X86FrameLowering.cpp - clobbersTheStack. 
if (SrcReg == X86::EFLAGS) { if (X86::GR64RegClass.contains(DestReg)) { BuildMI(MBB, MI, DL, get(X86::PUSHF64)); BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg); return; } if (X86::GR32RegClass.contains(DestReg)) { BuildMI(MBB, MI, DL, get(X86::PUSHF32)); BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg); return; } } if (DestReg == X86::EFLAGS) { if (X86::GR64RegClass.contains(SrcReg)) { BuildMI(MBB, MI, DL, get(X86::PUSH64r)) .addReg(SrcReg, getKillRegState(KillSrc)); BuildMI(MBB, MI, DL, get(X86::POPF64)); return; } if (X86::GR32RegClass.contains(SrcReg)) { BuildMI(MBB, MI, DL, get(X86::PUSH32r)) .addReg(SrcReg, getKillRegState(KillSrc)); BuildMI(MBB, MI, DL, get(X86::POPF32)); return; } } DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to " << RI.getName(DestReg) << '\n'); llvm_unreachable("Cannot emit physreg copy instruction"); } static unsigned getLoadStoreRegOpcode(unsigned Reg, const TargetRegisterClass *RC, bool isStackAligned, const X86Subtarget &STI, bool load) { if (STI.hasAVX512()) { if (X86::VK8RegClass.hasSubClassEq(RC) || X86::VK16RegClass.hasSubClassEq(RC)) return load ? X86::KMOVWkm : X86::KMOVWmk; if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC)) return load ? X86::VMOVSSZrm : X86::VMOVSSZmr; if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC)) return load ? X86::VMOVSDZrm : X86::VMOVSDZmr; if (X86::VR512RegClass.hasSubClassEq(RC)) return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; } bool HasAVX = STI.hasAVX(); switch (RC->getSize()) { default: llvm_unreachable("Unknown spill size"); case 1: assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass"); if (STI.is64Bit()) // Copying to or from a physical H register on x86-64 requires a NOREX // move. Otherwise use a normal move. if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)) return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX; return load ? 
X86::MOV8rm : X86::MOV8mr; case 2: assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass"); return load ? X86::MOV16rm : X86::MOV16mr; case 4: if (X86::GR32RegClass.hasSubClassEq(RC)) return load ? X86::MOV32rm : X86::MOV32mr; if (X86::FR32RegClass.hasSubClassEq(RC)) return load ? (HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) : (HasAVX ? X86::VMOVSSmr : X86::MOVSSmr); if (X86::RFP32RegClass.hasSubClassEq(RC)) return load ? X86::LD_Fp32m : X86::ST_Fp32m; llvm_unreachable("Unknown 4-byte regclass"); case 8: if (X86::GR64RegClass.hasSubClassEq(RC)) return load ? X86::MOV64rm : X86::MOV64mr; if (X86::FR64RegClass.hasSubClassEq(RC)) return load ? (HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) : (HasAVX ? X86::VMOVSDmr : X86::MOVSDmr); if (X86::VR64RegClass.hasSubClassEq(RC)) return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr; if (X86::RFP64RegClass.hasSubClassEq(RC)) return load ? X86::LD_Fp64m : X86::ST_Fp64m; llvm_unreachable("Unknown 8-byte regclass"); case 10: assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass"); return load ? X86::LD_Fp80m : X86::ST_FpP80m; case 16: { assert((X86::VR128RegClass.hasSubClassEq(RC) || X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass"); // If stack is realigned we can use aligned stores. if (isStackAligned) return load ? (HasAVX ? X86::VMOVAPSrm : X86::MOVAPSrm) : (HasAVX ? X86::VMOVAPSmr : X86::MOVAPSmr); else return load ? (HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm) : (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); } case 32: assert((X86::VR256RegClass.hasSubClassEq(RC) || X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass"); // If stack is realigned we can use aligned stores. if (isStackAligned) return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr; else return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr; case 64: assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass"); if (isStackAligned) return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; else return load ? 
X86::VMOVUPSZrm : X86::VMOVUPSZmr; } } static unsigned getStoreRegOpcode(unsigned SrcReg, const TargetRegisterClass *RC, bool isStackAligned, const X86Subtarget &STI) { return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false); } static unsigned getLoadRegOpcode(unsigned DestReg, const TargetRegisterClass *RC, bool isStackAligned, const X86Subtarget &STI) { return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true); } void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() && "Stack slot too small for store"); unsigned Alignment = std::max(RC->getSize(), 16); bool isAligned = (MF.getTarget() .getSubtargetImpl() ->getFrameLowering() ->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx) .addReg(SrcReg, getKillRegState(isKill)); } void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, SmallVectorImpl &Addr, const TargetRegisterClass *RC, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl &NewMIs) const { unsigned Alignment = std::max(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc)); for (unsigned i = 0, e = Addr.size(); i != e; ++i) MIB.addOperand(Addr[i]); MIB.addReg(SrcReg, getKillRegState(isKill)); (*MIB).setMemRefs(MMOBegin, MMOEnd); NewMIs.push_back(MIB); } void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned 
DestReg, int FrameIdx, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max(RC->getSize(), 16); bool isAligned = (MF.getTarget() .getSubtargetImpl() ->getFrameLowering() ->getStackAlignment() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL = MBB.findDebugLoc(MI); addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx); } void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg, SmallVectorImpl &Addr, const TargetRegisterClass *RC, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl &NewMIs) const { unsigned Alignment = std::max(RC->getSize(), 16); bool isAligned = MMOBegin != MMOEnd && (*MMOBegin)->getAlignment() >= Alignment; unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); DebugLoc DL; MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg); for (unsigned i = 0, e = Addr.size(); i != e; ++i) MIB.addOperand(Addr[i]); (*MIB).setMemRefs(MMOBegin, MMOEnd); NewMIs.push_back(MIB); } bool X86InstrInfo:: analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const { switch (MI->getOpcode()) { default: break; case X86::CMP64ri32: case X86::CMP64ri8: case X86::CMP32ri: case X86::CMP32ri8: case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP8ri: SrcReg = MI->getOperand(0).getReg(); SrcReg2 = 0; CmpMask = ~0; CmpValue = MI->getOperand(1).getImm(); return true; // A SUB can be used to perform comparison. 
case X86::SUB64rm: case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: SrcReg = MI->getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; CmpValue = 0; return true; case X86::SUB64rr: case X86::SUB32rr: case X86::SUB16rr: case X86::SUB8rr: SrcReg = MI->getOperand(1).getReg(); SrcReg2 = MI->getOperand(2).getReg(); CmpMask = ~0; CmpValue = 0; return true; case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: SrcReg = MI->getOperand(1).getReg(); SrcReg2 = 0; CmpMask = ~0; CmpValue = MI->getOperand(2).getImm(); return true; case X86::CMP64rr: case X86::CMP32rr: case X86::CMP16rr: case X86::CMP8rr: SrcReg = MI->getOperand(0).getReg(); SrcReg2 = MI->getOperand(1).getReg(); CmpMask = ~0; CmpValue = 0; return true; case X86::TEST8rr: case X86::TEST16rr: case X86::TEST32rr: case X86::TEST64rr: SrcReg = MI->getOperand(0).getReg(); if (MI->getOperand(1).getReg() != SrcReg) return false; // Compare against zero. SrcReg2 = 0; CmpMask = ~0; CmpValue = 0; return true; } return false; } /// isRedundantFlagInstr - check whether the first instruction, whose only /// purpose is to update flags, can be made redundant. /// CMPrr can be made redundant by SUBrr if the operands are the same. /// This function can be extended later on. /// SrcReg, SrcRegs: register operands for FlagI. /// ImmValue: immediate for FlagI if it takes an immediate. 
inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg, unsigned SrcReg2, int ImmValue, MachineInstr *OI) { if (((FlagI->getOpcode() == X86::CMP64rr && OI->getOpcode() == X86::SUB64rr) || (FlagI->getOpcode() == X86::CMP32rr && OI->getOpcode() == X86::SUB32rr)|| (FlagI->getOpcode() == X86::CMP16rr && OI->getOpcode() == X86::SUB16rr)|| (FlagI->getOpcode() == X86::CMP8rr && OI->getOpcode() == X86::SUB8rr)) && ((OI->getOperand(1).getReg() == SrcReg && OI->getOperand(2).getReg() == SrcReg2) || (OI->getOperand(1).getReg() == SrcReg2 && OI->getOperand(2).getReg() == SrcReg))) return true; if (((FlagI->getOpcode() == X86::CMP64ri32 && OI->getOpcode() == X86::SUB64ri32) || (FlagI->getOpcode() == X86::CMP64ri8 && OI->getOpcode() == X86::SUB64ri8) || (FlagI->getOpcode() == X86::CMP32ri && OI->getOpcode() == X86::SUB32ri) || (FlagI->getOpcode() == X86::CMP32ri8 && OI->getOpcode() == X86::SUB32ri8) || (FlagI->getOpcode() == X86::CMP16ri && OI->getOpcode() == X86::SUB16ri) || (FlagI->getOpcode() == X86::CMP16ri8 && OI->getOpcode() == X86::SUB16ri8) || (FlagI->getOpcode() == X86::CMP8ri && OI->getOpcode() == X86::SUB8ri)) && OI->getOperand(1).getReg() == SrcReg && OI->getOperand(2).getImm() == ImmValue) return true; return false; } /// isDefConvertible - check whether the definition can be converted /// to remove a comparison against zero. inline static bool isDefConvertible(MachineInstr *MI) { switch (MI->getOpcode()) { default: return false; // The shift instructions only modify ZF if their shift count is non-zero. // N.B.: The processor truncates the shift count depending on the encoding. case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri: case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri: return getTruncatedShiftCount(MI, 2) != 0; // Some left shift instructions can be turned into LEA instructions but only // if their flags aren't used. Avoid transforming such instructions. 
case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{ unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (isTruncatedShiftCountForLEA(ShAmt)) return false; return ShAmt != 0; } case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8: case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8: return getTruncatedShiftCount(MI, 3) != 0; case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr: case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm: case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r: case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr: case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm: case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm: case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r: case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri: case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8: case X86::AND8ri: case X86::AND64rr: case X86::AND32rr: case X86::AND16rr: case X86::AND8rr: case X86::AND64rm: case X86::AND32rm: case X86::AND16rm: case X86::AND8rm: case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri: case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8: case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr: case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm: case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm: case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri: case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8: case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case 
X86::NEG64r: case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1: case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1: case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1: case X86::ADC32ri: case X86::ADC32ri8: case X86::ADC32rr: case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC64rr: case X86::SBB32ri: case X86::SBB32ri8: case X86::SBB32rr: case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB64rr: case X86::ANDN32rr: case X86::ANDN32rm: case X86::ANDN64rr: case X86::ANDN64rm: case X86::BEXTR32rr: case X86::BEXTR64rr: case X86::BEXTR32rm: case X86::BEXTR64rm: case X86::BLSI32rr: case X86::BLSI32rm: case X86::BLSI64rr: case X86::BLSI64rm: case X86::BLSMSK32rr:case X86::BLSMSK32rm: case X86::BLSMSK64rr:case X86::BLSMSK64rm: case X86::BLSR32rr: case X86::BLSR32rm: case X86::BLSR64rr: case X86::BLSR64rm: case X86::BZHI32rr: case X86::BZHI32rm: case X86::BZHI64rr: case X86::BZHI64rm: case X86::LZCNT16rr: case X86::LZCNT16rm: case X86::LZCNT32rr: case X86::LZCNT32rm: case X86::LZCNT64rr: case X86::LZCNT64rm: case X86::POPCNT16rr:case X86::POPCNT16rm: case X86::POPCNT32rr:case X86::POPCNT32rm: case X86::POPCNT64rr:case X86::POPCNT64rm: case X86::TZCNT16rr: case X86::TZCNT16rm: case X86::TZCNT32rr: case X86::TZCNT32rm: case X86::TZCNT64rr: case X86::TZCNT64rm: return true; } } /// isUseDefConvertible - check whether the use can be converted /// to remove a comparison against zero. 
// ---------------------------------------------------------------------------
// Definitions in this stretch of the file:
//
//   isUseDefConvertible(MI)
//     Returns the condition code associated with MI's opcode for the purpose
//     of dropping a compare against zero: COND_B for every LZCNT and TZCNT
//     form listed, COND_E for every POPCNT form listed, and COND_INVALID for
//     any other opcode.
//
//   X86InstrInfo::optimizeCompareInstr(CmpInstr, SrcReg, SrcReg2, CmpMask,
//                                      CmpValue, MRI)
//     Tries to delete CmpInstr by reusing the EFLAGS produced by an earlier
//     instruction. Returns true only after CmpInstr has been erased; every
//     bail-out returns false. CmpMask is not read anywhere in the body.
//     A SUB whose result register has no non-debug uses is rewritten to the
//     matching CMP before the remaining checks run, so that rewrite is kept
//     even when the function goes on to return false.
//
//   X86InstrInfo::optimizeLoadInstr(MI, MRI, FoldAsLoadDefReg, DefMI)
//     Tries to fold the load that defines FoldAsLoadDefReg into MI. On
//     success returns the folded instruction and resets FoldAsLoadDefReg to
//     0; otherwise returns nullptr. FoldAsLoadDefReg is also reset to 0 when
//     MI itself may load. DefMI is overwritten with the defining instruction
//     as soon as the early checks pass, even if the fold is later rejected.
//
//   Expand2AddrUndef(MIB, Desc)
//     Switches the instruction held by MIB to Desc (asserted to have three
//     operands) and appends two undef reads of its operand-0 register.
//     Always returns true.
// ---------------------------------------------------------------------------
static X86::CondCode isUseDefConvertible(MachineInstr *MI) { switch (MI->getOpcode()) { default: return X86::COND_INVALID; case X86::LZCNT16rr: case X86::LZCNT16rm: case X86::LZCNT32rr: case X86::LZCNT32rm: case X86::LZCNT64rr: case X86::LZCNT64rm: return X86::COND_B; case X86::POPCNT16rr:case X86::POPCNT16rm: case X86::POPCNT32rr:case X86::POPCNT32rm: case X86::POPCNT64rr:case X86::POPCNT64rm: return X86::COND_E; case X86::TZCNT16rr: case X86::TZCNT16rm: case X86::TZCNT32rr: case X86::TZCNT32rm: case X86::TZCNT64rr: case X86::TZCNT64rm: return X86::COND_B; } } /// optimizeCompareInstr - Check if there exists an earlier instruction that /// operates on the same source operands and sets flags in the same way as /// Compare; remove Compare if possible. bool X86InstrInfo:: optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const { // Check whether we can replace SUB with CMP. unsigned NewOpcode = 0; switch (CmpInstr->getOpcode()) { default: break; case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB8ri: case X86::SUB64rm: case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: case X86::SUB64rr: case X86::SUB32rr: case X86::SUB16rr: case X86::SUB8rr: { if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg())) return false; // There is no use of the destination register, we can replace SUB with CMP. 
// Inner switch of optimizeCompareInstr: picks the CMP opcode with the same
// width and operand form as the SUB being replaced, then drops operand 0 (the
// destination). Memory-form CMPs (CMP*rm) return false right after the
// rewrite; only the rr/ri forms continue to the redundancy analysis.
switch (CmpInstr->getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::SUB64rm: NewOpcode = X86::CMP64rm; break; case X86::SUB32rm: NewOpcode = X86::CMP32rm; break; case X86::SUB16rm: NewOpcode = X86::CMP16rm; break; case X86::SUB8rm: NewOpcode = X86::CMP8rm; break; case X86::SUB64rr: NewOpcode = X86::CMP64rr; break; case X86::SUB32rr: NewOpcode = X86::CMP32rr; break; case X86::SUB16rr: NewOpcode = X86::CMP16rr; break; case X86::SUB8rr: NewOpcode = X86::CMP8rr; break; case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break; case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break; case X86::SUB32ri: NewOpcode = X86::CMP32ri; break; case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break; case X86::SUB16ri: NewOpcode = X86::CMP16ri; break; case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break; case X86::SUB8ri: NewOpcode = X86::CMP8ri; break; } CmpInstr->setDesc(get(NewOpcode)); CmpInstr->RemoveOperand(0); // Fall through to optimize Cmp if Cmp is CMPrr or CMPri. if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm || NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm) return false; } } // Get the unique definition of SrcReg. MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg); if (!MI) return false; // CmpInstr is the first instruction of the BB. MachineBasicBlock::iterator I = CmpInstr, Def = MI; // If we are comparing against zero, check whether we can use MI to update // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize. bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0); if (IsCmpZero && MI->getParent() != CmpInstr->getParent()) return false; // If we have a use of the source register between the def and our compare // instruction we can eliminate the compare iff the use sets EFLAGS in the // right way. bool ShouldUpdateCC = false; X86::CondCode NewCC = X86::COND_INVALID; if (IsCmpZero && !isDefConvertible(MI)) { // Scan forward from the use until we hit the use we're looking for or the // compare instruction. 
// Forward scan from the definition of SrcReg up to CmpInstr, looking for an
// instruction accepted by isUseDefConvertible whose operand 1 is SrcReg. When
// one is found it replaces MI/Def as the EFLAGS producer and ShouldUpdateCC
// is set so that later EFLAGS users get their condition rewritten; reaching
// CmpInstr without a match abandons the optimization.
for (MachineBasicBlock::iterator J = MI;; ++J) { // Do we have a convertible instruction? NewCC = isUseDefConvertible(J); if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() && J->getOperand(1).getReg() == SrcReg) { assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!"); ShouldUpdateCC = true; // Update CC later on. // This is not a def of SrcReg, but still a def of EFLAGS. Keep going // with the new def. MI = Def = J; break; } if (J == I) return false; } } // We are searching for an earlier instruction that can make CmpInstr // redundant and that instruction will be saved in Sub. MachineInstr *Sub = nullptr; const TargetRegisterInfo *TRI = &getRegisterInfo(); // We iterate backward, starting from the instruction before CmpInstr and // stop when reaching the definition of a source register or done with the BB. // RI points to the instruction before CmpInstr. // If the definition is in this basic block, RE points to the definition; // otherwise, RE is the rend of the basic block. MachineBasicBlock::reverse_iterator RI = MachineBasicBlock::reverse_iterator(I), RE = CmpInstr->getParent() == MI->getParent() ? MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ : CmpInstr->getParent()->rend(); MachineInstr *Movr0Inst = nullptr; for (; RI != RE; ++RI) { MachineInstr *Instr = &*RI; // Check whether CmpInstr can be made redundant by the current instruction. if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) { Sub = Instr; break; } if (Instr->modifiesRegister(X86::EFLAGS, TRI) || Instr->readsRegister(X86::EFLAGS, TRI)) { // This instruction modifies or uses EFLAGS. // MOV32r0 etc. are implemented with xor which clobbers condition code. // They are safe to move up, if the definition to EFLAGS is dead and // earlier instructions do not read or write EFLAGS. 
// At most one MOV32r0 with a dead EFLAGS def is tolerated (the first one met
// by this backward scan); it is remembered in Movr0Inst so it can be moved
// later. Any other instruction that reads or writes EFLAGS aborts.
if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 && Instr->registerDefIsDead(X86::EFLAGS, TRI)) { Movr0Inst = Instr; continue; } // We can't remove CmpInstr. return false; } } // Return false if no candidates exist. if (!IsCmpZero && !Sub) return false; bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 && Sub->getOperand(2).getReg() == SrcReg); // Scan forward from the instruction after CmpInstr for uses of EFLAGS. // It is safe to remove CmpInstr if EFLAGS is redefined or killed. // If we are done with the basic block, we need to check whether EFLAGS is // live-out. bool IsSafe = false; SmallVector, 4> OpsToUpdate; MachineBasicBlock::iterator E = CmpInstr->getParent()->end(); for (++I; I != E; ++I) { const MachineInstr &Instr = *I; bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI); bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI); // We should check the usage if this instruction uses and updates EFLAGS. if (!UseEFLAGS && ModifyEFLAGS) { // It is safe to remove CmpInstr if EFLAGS is updated again. IsSafe = true; break; } if (!UseEFLAGS && !ModifyEFLAGS) continue; // EFLAGS is used by this instruction. X86::CondCode OldCC = X86::COND_INVALID; bool OpcIsSET = false; if (IsCmpZero || IsSwapped) { // We decode the condition code from opcode. if (Instr.isBranch()) OldCC = getCondFromBranchOpc(Instr.getOpcode()); else { OldCC = getCondFromSETOpc(Instr.getOpcode()); if (OldCC != X86::COND_INVALID) OpcIsSET = true; else OldCC = X86::getCondFromCMovOpc(Instr.getOpcode()); } if (OldCC == X86::COND_INVALID) return false; } if (IsCmpZero) { switch (OldCC) { default: break; case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: case X86::COND_G: case X86::COND_GE: case X86::COND_L: case X86::COND_LE: case X86::COND_O: case X86::COND_NO: // CF and OF are used, we can't perform this optimization. return false; } // If we're updating the condition code check if we have to reverse the // condition. 
// With a use-def conversion pending, only COND_E and COND_NE users can be
// rewritten: COND_E keeps NewCC as is, COND_NE takes its opposite, and any
// other condition aborts.
if (ShouldUpdateCC) switch (OldCC) { default: return false; case X86::COND_E: break; case X86::COND_NE: NewCC = GetOppositeBranchCondition(NewCC); break; } } else if (IsSwapped) { // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. // We swap the condition code and synthesize the new opcode. NewCC = getSwappedCondition(OldCC); if (NewCC == X86::COND_INVALID) return false; } if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) { // Synthesize the new opcode. bool HasMemoryOperand = Instr.hasOneMemOperand(); unsigned NewOpc; if (Instr.isBranch()) NewOpc = GetCondBranchFromCond(NewCC); else if(OpcIsSET) NewOpc = getSETFromCond(NewCC, HasMemoryOperand); else { unsigned DstReg = Instr.getOperand(0).getReg(); NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(), HasMemoryOperand); } // Push the MachineInstr to OpsToUpdate. // If it is safe to remove CmpInstr, the condition code of these // instructions will be modified. OpsToUpdate.push_back(std::make_pair(&*I, NewOpc)); } if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) { // It is safe to remove CmpInstr if EFLAGS is updated again or killed. IsSafe = true; break; } } // If EFLAGS is not killed nor re-defined, we should check whether it is // live-out. If it is live-out, do not optimize. if ((IsCmpZero || IsSwapped) && !IsSafe) { MachineBasicBlock *MBB = CmpInstr->getParent(); for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) if ((*SI)->isLiveIn(X86::EFLAGS)) return false; } // The instruction to be updated is either Sub or MI. Sub = IsCmpZero ? MI : Sub; // Move Movr0Inst to the appropriate place before Sub. if (Movr0Inst) { // Look backwards until we find a def that doesn't use the current EFLAGS. 
// The backward walk starts at Sub itself and looks for an instruction that
// writes EFLAGS without reading it; Movr0Inst is re-inserted immediately
// before that instruction. Reaching the start of the block without finding
// one makes the function return false.
Def = Sub; MachineBasicBlock::reverse_iterator InsertI = MachineBasicBlock::reverse_iterator(++Def), InsertE = Sub->getParent()->rend(); for (; InsertI != InsertE; ++InsertI) { MachineInstr *Instr = &*InsertI; if (!Instr->readsRegister(X86::EFLAGS, TRI) && Instr->modifiesRegister(X86::EFLAGS, TRI)) { Sub->getParent()->remove(Movr0Inst); Instr->getParent()->insert(MachineBasicBlock::iterator(Instr), Movr0Inst); break; } } if (InsertI == InsertE) return false; } // Make sure Sub instruction defines EFLAGS and mark the def live. unsigned i = 0, e = Sub->getNumOperands(); for (; i != e; ++i) { MachineOperand &MO = Sub->getOperand(i); if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) { MO.setIsDead(false); break; } } assert(i != e && "Unable to locate a def EFLAGS operand"); CmpInstr->eraseFromParent(); // Modify the condition code of instructions in OpsToUpdate. for (unsigned i = 0, e = OpsToUpdate.size(); i < e; i++) OpsToUpdate[i].first->setDesc(get(OpsToUpdate[i].second)); return true; } /// optimizeLoadInstr - Try to remove the load by folding it to a register /// operand at the use. We fold the load instructions if load defines a virtual /// register, the virtual register is used once in the same BB, and the /// instructions in-between do not load or store, and have no side effects. MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const { if (FoldAsLoadDefReg == 0) return nullptr; // To be conservative, if there exists another load, clear the load candidate. if (MI->mayLoad()) { FoldAsLoadDefReg = 0; return nullptr; } // Check whether we can move DefMI here. DefMI = MRI->getVRegDef(FoldAsLoadDefReg); assert(DefMI); bool SawStore = false; if (!DefMI->isSafeToMove(this, nullptr, SawStore)) return nullptr; // Collect information about virtual register operands of MI. 
// Scan the operands counted by MI's instruction descriptor for the single
// one that reads FoldAsLoadDefReg; a sub-register use, a def, or a second
// occurrence of the register disqualifies the fold.
unsigned SrcOperandId = 0; bool FoundSrcOperand = false; for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (Reg != FoldAsLoadDefReg) continue; // Do not fold if we have a subreg use or a def or multiple uses. if (MO.getSubReg() || MO.isDef() || FoundSrcOperand) return nullptr; SrcOperandId = i; FoundSrcOperand = true; } if (!FoundSrcOperand) return nullptr; // Check whether we can fold the def into SrcOperandId. SmallVector Ops; Ops.push_back(SrcOperandId); MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI); if (FoldMI) { FoldAsLoadDefReg = 0; return FoldMI; } return nullptr; } /// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr /// instruction with two undef reads of the register being defined. This is /// used for mapping: /// %xmm4 = V_SET0 /// to: /// %xmm4 = PXORrr %xmm4, %xmm4 /// static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction."); unsigned Reg = MIB->getOperand(0).getReg(); MIB->setDesc(Desc); // MachineInstr::addOperand() will insert explicit operands before any // implicit operands. MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); // But we don't trust that. assert(MIB->getOperand(1).getReg() == Reg && MIB->getOperand(2).getReg() == Reg && "Misplaced operand"); return true; } // LoadStackGuard has so far only been implemented for 64-bit MachO. Different // code sequence is needed for other targets. 
// ---------------------------------------------------------------------------
// Definitions in this stretch of the file:
//
//   expandLoadStackGuard(MIB, TII)
//     Inserts, before the pseudo held by MIB, a MOV64rm into the pseudo's
//     operand-0 register whose address operands are RIP, 1, no register, the
//     GlobalValue taken from the pseudo's first memory operand (flagged
//     X86II::MO_GOTPCREL) and no register. The pseudo itself is then changed
//     to a MOV64rm and given the operands Reg (killed), 1, no register, 0,
//     no register.
//
//   X86InstrInfo::expandPostRAPseudo(MI)
//     Rewrites the pseudo opcodes listed in its switch in place and returns
//     true for them (Expand2AddrUndef always returns true); returns false
//     for any opcode the switch does not list.
//
//   FuseTwoAddrInst(MF, Opcode, MOs, MI, TII)
//     Creates an Opcode instruction whose first operands are MOs (followed by
//     addOffset(MIB, 0) when MOs has fewer than four entries) and whose
//     remaining operands are copies of MI's operands from index 2 onwards.
//
//   FuseInst(MF, Opcode, OpNo, MOs, MI, TII)
//     Creates an Opcode instruction from copies of MI's operands, with
//     operand OpNo replaced by MOs (same addOffset padding). Operand OpNo is
//     asserted to be a register.
//
//   MakeM0Inst(TII, Opcode, MOs, MI)
//     Creates an Opcode instruction consisting of MOs (same padding) followed
//     by an immediate 0.
//
//   X86InstrInfo::foldMemoryOperandImpl(MF, MI, i, MOs, Size, Align,
//                                       AllowCommute)
//     Builds the memory form of MI with operand i replaced by the address
//     operands MOs, or returns nullptr. The replacement opcode comes from
//     the RegOp2MemOpTable* selected by i (or the 2Addr table when operands
//     0 and 1 are the same tied register and i < 2). The fold is rejected
//     when Align is below the table entry's minimum alignment, or when a
//     non-zero Size is smaller than the operand's register class; the one
//     exception is an 8-byte MOV64rm from a 4-byte object, which is narrowed
//     to MOV32rm. With AllowCommute set and operand i commutable, MI is
//     commuted in place, the fold is retried once with AllowCommute=false,
//     and MI is commuted back if that retry fails.
//
//   hasPartialRegUpdate(Opcode)
//     Returns true for the opcodes listed in its switch, false otherwise.
// ---------------------------------------------------------------------------
static void expandLoadStackGuard(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) { MachineBasicBlock &MBB = *MIB->getParent(); DebugLoc DL = MIB->getDebugLoc(); unsigned Reg = MIB->getOperand(0).getReg(); const GlobalValue *GV = cast((*MIB->memoperands_begin())->getValue()); unsigned Flag = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant; MachineMemOperand *MMO = MBB.getParent()-> getMachineMemOperand(MachinePointerInfo::getGOT(), Flag, 8, 8); MachineBasicBlock::iterator I = MIB.getInstr(); BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1) .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0) .addMemOperand(MMO); MIB->setDebugLoc(DL); MIB->setDesc(TII.get(X86::MOV64rm)); MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); } bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { bool HasAVX = Subtarget.hasAVX(); MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI); switch (MI->getOpcode()) { case X86::MOV32r0: return Expand2AddrUndef(MIB, get(X86::XOR32rr)); case X86::SETB_C8r: return Expand2AddrUndef(MIB, get(X86::SBB8rr)); case X86::SETB_C16r: return Expand2AddrUndef(MIB, get(X86::SBB16rr)); case X86::SETB_C32r: return Expand2AddrUndef(MIB, get(X86::SBB32rr)); case X86::SETB_C64r: return Expand2AddrUndef(MIB, get(X86::SBB64rr)); case X86::V_SET0: case X86::FsFLD0SS: case X86::FsFLD0SD: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); case X86::AVX_SET0: assert(HasAVX && "AVX not supported"); return Expand2AddrUndef(MIB, get(X86::VXORPSYrr)); case X86::AVX512_512_SET0: return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? 
X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr)); case X86::TEST8ri_NOREX: MI->setDesc(get(X86::TEST8ri)); return true; case X86::KSET0B: case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr)); case X86::KSET1B: case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr)); case TargetOpcode::LOAD_STACK_GUARD: expandLoadStackGuard(MIB, *this); return true; } return false; } static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode, const SmallVectorImpl &MOs, MachineInstr *MI, const TargetInstrInfo &TII) { // Create the base instruction with the memory operand as the first part. // Omit the implicit operands, something BuildMI can't do. MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI->getDebugLoc(), true); MachineInstrBuilder MIB(MF, NewMI); unsigned NumAddrOps = MOs.size(); for (unsigned i = 0; i != NumAddrOps; ++i) MIB.addOperand(MOs[i]); if (NumAddrOps < 4) // FrameIndex only addOffset(MIB, 0); // Loop over the rest of the ri operands, converting them over. unsigned NumOps = MI->getDesc().getNumOperands()-2; for (unsigned i = 0; i != NumOps; ++i) { MachineOperand &MO = MI->getOperand(i+2); MIB.addOperand(MO); } for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); MIB.addOperand(MO); } return MIB; } static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode, unsigned OpNo, const SmallVectorImpl &MOs, MachineInstr *MI, const TargetInstrInfo &TII) { // Omit the implicit operands, something BuildMI can't do. 
MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode), MI->getDebugLoc(), true); MachineInstrBuilder MIB(MF, NewMI); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (i == OpNo) { assert(MO.isReg() && "Expected to fold into reg operand!"); unsigned NumAddrOps = MOs.size(); for (unsigned i = 0; i != NumAddrOps; ++i) MIB.addOperand(MOs[i]); if (NumAddrOps < 4) // FrameIndex only addOffset(MIB, 0); } else { MIB.addOperand(MO); } } return MIB; } static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, const SmallVectorImpl &MOs, MachineInstr *MI) { MachineFunction &MF = *MI->getParent()->getParent(); MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode)); unsigned NumAddrOps = MOs.size(); for (unsigned i = 0; i != NumAddrOps; ++i) MIB.addOperand(MOs[i]); if (NumAddrOps < 4) // FrameIndex only addOffset(MIB, 0); return MIB.addImm(0); } MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, unsigned i, const SmallVectorImpl &MOs, unsigned Size, unsigned Align, bool AllowCommute) const { const DenseMap > *OpcodeTablePtr = nullptr; bool isCallRegIndirect = Subtarget.callRegIndirect(); bool isTwoAddrFold = false; // Atom favors register form of call. So, we do not fold loads into calls // when X86Subtarget is Atom. if (isCallRegIndirect && (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) { return nullptr; } unsigned NumOps = MI->getDesc().getNumOperands(); bool isTwoAddr = NumOps > 1 && MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; // FIXME: AsmPrinter doesn't know how to handle // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. if (MI->getOpcode() == X86::ADD32ri && MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) return nullptr; MachineInstr *NewMI = nullptr; // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. 
It requires // replacing the *two* registers with the memory location. if (isTwoAddr && NumOps >= 2 && i < 2 && MI->getOperand(0).isReg() && MI->getOperand(1).isReg() && MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; isTwoAddrFold = true; } else if (i == 0) { // If operand 0 if (MI->getOpcode() == X86::MOV32r0) { NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI); if (NewMI) return NewMI; } OpcodeTablePtr = &RegOp2MemOpTable0; } else if (i == 1) { OpcodeTablePtr = &RegOp2MemOpTable1; } else if (i == 2) { OpcodeTablePtr = &RegOp2MemOpTable2; } else if (i == 3) { OpcodeTablePtr = &RegOp2MemOpTable3; } else if (i == 4) { OpcodeTablePtr = &RegOp2MemOpTable4; } // If table selected... if (OpcodeTablePtr) { // Find the Opcode to fuse DenseMap >::const_iterator I = OpcodeTablePtr->find(MI->getOpcode()); if (I != OpcodeTablePtr->end()) { unsigned Opcode = I->second.first; unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; if (Align < MinAlign) return nullptr; bool NarrowToMOV32rm = false; if (Size) { unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize(); if (Size < RCSize) { // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4) return nullptr; // If this is a 64-bit load, but the spill slot is 32, then we can do // a 32-bit load which is implicitly zero-extended. This likely is // due to live interval analysis remat'ing a load from stack slot. if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg()) return nullptr; Opcode = X86::MOV32rm; NarrowToMOV32rm = true; } } if (isTwoAddrFold) NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this); else NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this); if (NarrowToMOV32rm) { // If this is the special case where we use a MOV32rm to load a 32-bit // value and zero-extend the top bits. 
Change the destination register // to a 32-bit one. unsigned DstReg = NewMI->getOperand(0).getReg(); if (TargetRegisterInfo::isPhysicalRegister(DstReg)) NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit)); else NewMI->getOperand(0).setSubReg(X86::sub_32bit); } return NewMI; } } // If the instruction and target operand are commutable, commute the // instruction and try again. if (AllowCommute) { unsigned OriginalOpIdx = i, CommuteOpIdx1, CommuteOpIdx2; if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) { bool HasDef = MI->getDesc().getNumDefs(); unsigned Reg0 = HasDef ? MI->getOperand(0).getReg() : 0; unsigned Reg1 = MI->getOperand(CommuteOpIdx1).getReg(); unsigned Reg2 = MI->getOperand(CommuteOpIdx2).getReg(); bool Tied0 = 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO); bool Tied1 = 0 == MI->getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO); // If either of the commutable operands are tied to the destination // then we can not commute + fold. if ((HasDef && Reg0 == Reg1 && Tied0) || (HasDef && Reg0 == Reg2 && Tied1)) return nullptr; if ((CommuteOpIdx1 == OriginalOpIdx) || (CommuteOpIdx2 == OriginalOpIdx)) { MachineInstr *CommutedMI = commuteInstruction(MI, false); if (!CommutedMI) { // Unable to commute. return nullptr; } if (CommutedMI != MI) { // New instruction. We can't fold from this. CommutedMI->eraseFromParent(); return nullptr; } // Attempt to fold with the commuted version of the instruction. unsigned CommuteOp = (CommuteOpIdx1 == OriginalOpIdx ? CommuteOpIdx2 : CommuteOpIdx1); NewMI = foldMemoryOperandImpl(MF, MI, CommuteOp, MOs, Size, Align, /*AllowCommute=*/false); if (NewMI) return NewMI; // Folding failed again - undo the commute before returning. MachineInstr *UncommutedMI = commuteInstruction(MI, false); if (!UncommutedMI) { // Unable to commute. return nullptr; } if (UncommutedMI != MI) { // New instruction. It doesn't need to be kept. 
UncommutedMI->eraseFromParent(); return nullptr; } // Return here to prevent duplicate fuse failure report. return nullptr; } } } // No fusion if (PrintFailedFusing && !MI->isCopy()) dbgs() << "We failed to fuse operand " << i << " in " << *MI; return nullptr; } /// hasPartialRegUpdate - Return true for all instructions that only update /// the first 32 or 64-bits of the destination register and leave the rest /// unmodified. This can be used to avoid folding loads if the instructions /// only update part of the destination register, and the non-updated part is /// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these /// instructions breaks the partial register dependency and it can improve /// performance. e.g.: /// /// movss (%rdi), %xmm0 /// cvtss2sd %xmm0, %xmm0 /// /// Instead of /// cvtss2sd (%rdi), %xmm0 /// /// FIXME: This should be turned into a TSFlags. /// static bool hasPartialRegUpdate(unsigned Opcode) { switch (Opcode) { case X86::CVTSI2SSrr: case X86::CVTSI2SSrm: case X86::CVTSI2SS64rr: case X86::CVTSI2SS64rm: case X86::CVTSI2SDrr: case X86::CVTSI2SDrm: case X86::CVTSI2SD64rr: case X86::CVTSI2SD64rm: case X86::CVTSD2SSrr: case X86::CVTSD2SSrm: case X86::Int_CVTSD2SSrr: case X86::Int_CVTSD2SSrm: case X86::CVTSS2SDrr: case X86::CVTSS2SDrm: case X86::Int_CVTSS2SDrr: case X86::Int_CVTSS2SDrm: case X86::RCPSSr: case X86::RCPSSm: case X86::RCPSSr_Int: case X86::RCPSSm_Int: case X86::ROUNDSDr: case X86::ROUNDSDm: case X86::ROUNDSDr_Int: case X86::ROUNDSSr: case X86::ROUNDSSm: case X86::ROUNDSSr_Int: case X86::RSQRTSSr: case X86::RSQRTSSm: case X86::RSQRTSSr_Int: case X86::RSQRTSSm_Int: case X86::SQRTSSr: case X86::SQRTSSm: case X86::SQRTSSr_Int: case X86::SQRTSSm_Int: case X86::SQRTSDr: case X86::SQRTSDm: case X86::SQRTSDr_Int: case X86::SQRTSDm_Int: return true; } return false; } /// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle /// instructions we would like before a partial register update. 
// ---------------------------------------------------------------------------
// Definitions in this stretch of the file:
//
//   X86InstrInfo::getPartialRegUpdateClearance(MI, OpNum, TRI)
//     Returns 16 when OpNum is 0, MI's opcode satisfies hasPartialRegUpdate
//     and MI does not read the register in operand 0; returns 0 otherwise.
//
//   hasUndefRegUpdate(Opcode)
//     Returns true for the opcodes listed in its switch, false otherwise.
//
//   X86InstrInfo::getUndefRegClearance(MI, OpNum, TRI)
//     For opcodes accepted by hasUndefRegUpdate, sets OpNum to 1 and returns
//     16 if that operand is an undef physical register, else 0. For other
//     opcodes returns 0 and leaves OpNum untouched. TRI is not used.
//
//   X86InstrInfo::breakPartialRegDependency(MI, OpNum, TRI)
//     Does nothing if MI already kills the register in operand OpNum.
//     Otherwise, for a VR128 register it inserts before MI an XORPSrr
//     (VXORPSrr with AVX) of the register with itself; for a VR256 register
//     it inserts a VXORPSrr of the sub_xmm sub-register with an implicit def
//     of the full register. In both cases the register is then marked as
//     killed by MI. Registers in neither class are left alone.
//
//   X86InstrInfo::foldMemoryOperandImpl(MF, MI, Ops, FrameIndex)
//     Folds a reference to stack object FrameIndex into MI by forwarding to
//     the operand-index overload with the object's size and alignment.
//     Returns nullptr when NoFusing is set, when MI has a partial register
//     update and the function is not optimizing for size, or when Ops is
//     neither a single index nor the {0, 1} pair of a TEST*rr. When the
//     stack is not realigned the alignment is capped at the target's stack
//     alignment.
//
//   isPartialRegisterLoad(LoadMI, MF)
//     Returns true when LoadMI is (V)MOVSSrm into a register class wider
//     than 4 bytes or (V)MOVSDrm into one wider than 8 bytes.
//
//   X86InstrInfo::foldMemoryOperandImpl(MF, MI, Ops, LoadMI)
//     Folds the load LoadMI into MI. Loads from a stack slot are forwarded
//     to the FrameIndex overload; the zero / all-ones pseudos are turned
//     into a constant-pool load; any other load contributes its last
//     X86::AddrNumOperands operands as the address. The final fold is done
//     by the operand-index overload with Size 0.
// ---------------------------------------------------------------------------
unsigned X86InstrInfo:: getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode())) return 0; // If MI is marked as reading Reg, the partial register update is wanted. const MachineOperand &MO = MI->getOperand(0); unsigned Reg = MO.getReg(); if (TargetRegisterInfo::isVirtualRegister(Reg)) { if (MO.readsReg() || MI->readsVirtualRegister(Reg)) return 0; } else { if (MI->readsRegister(Reg, TRI)) return 0; } // If any of the preceding 16 instructions are reading Reg, insert a // dependency breaking instruction. The magic number is based on a few // Nehalem experiments. return 16; } // Return true for any instruction that copies the high bits of the first source // operand into the unused high bits of the destination operand. static bool hasUndefRegUpdate(unsigned Opcode) { switch (Opcode) { case X86::VCVTSI2SSrr: case X86::VCVTSI2SSrm: case X86::Int_VCVTSI2SSrr: case X86::Int_VCVTSI2SSrm: case X86::VCVTSI2SS64rr: case X86::VCVTSI2SS64rm: case X86::Int_VCVTSI2SS64rr: case X86::Int_VCVTSI2SS64rm: case X86::VCVTSI2SDrr: case X86::VCVTSI2SDrm: case X86::Int_VCVTSI2SDrr: case X86::Int_VCVTSI2SDrm: case X86::VCVTSI2SD64rr: case X86::VCVTSI2SD64rm: case X86::Int_VCVTSI2SD64rr: case X86::Int_VCVTSI2SD64rm: case X86::VCVTSD2SSrr: case X86::VCVTSD2SSrm: case X86::Int_VCVTSD2SSrr: case X86::Int_VCVTSD2SSrm: case X86::VCVTSS2SDrr: case X86::VCVTSS2SDrm: case X86::Int_VCVTSS2SDrr: case X86::Int_VCVTSS2SDrm: case X86::VRCPSSr: case X86::VRCPSSm: case X86::VRCPSSm_Int: case X86::VROUNDSDr: case X86::VROUNDSDm: case X86::VROUNDSDr_Int: case X86::VROUNDSSr: case X86::VROUNDSSm: case X86::VROUNDSSr_Int: case X86::VRSQRTSSr: case X86::VRSQRTSSm: case X86::VRSQRTSSm_Int: case X86::VSQRTSSr: case X86::VSQRTSSm: case X86::VSQRTSSm_Int: case X86::VSQRTSDr: case X86::VSQRTSDm: case X86::VSQRTSDm_Int: // AVX-512 case X86::VCVTSD2SSZrr: case X86::VCVTSD2SSZrm: case X86::VCVTSS2SDZrr: case 
X86::VCVTSS2SDZrm: return true; } return false; } /// Inform the ExeDepsFix pass how many idle instructions we would like before /// certain undef register reads. /// /// This catches the VCVTSI2SD family of instructions: /// /// vcvtsi2sdq %rax, %xmm0, %xmm14 /// /// We should be careful *not* to catch VXOR idioms which are presumably /// handled specially in the pipeline: /// /// vxorps %xmm1, %xmm1, %xmm1 /// /// Like getPartialRegUpdateClearance, this makes a strong assumption that the /// high bits that are passed-through are not live. unsigned X86InstrInfo:: getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum, const TargetRegisterInfo *TRI) const { if (!hasUndefRegUpdate(MI->getOpcode())) return 0; // Set the OpNum parameter to the first source operand. OpNum = 1; const MachineOperand &MO = MI->getOperand(OpNum); if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { // Use the same magic number as getPartialRegUpdateClearance. return 16; } return 0; } void X86InstrInfo:: breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { unsigned Reg = MI->getOperand(OpNum).getReg(); // If MI kills this register, the false dependence is already broken. if (MI->killsRegister(Reg, TRI)) return; if (X86::VR128RegClass.contains(Reg)) { // These instructions are all floating point domain, so xorps is the best // choice. bool HasAVX = Subtarget.hasAVX(); unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr; BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg) .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef); } else if (X86::VR256RegClass.contains(Reg)) { // Use vxorps to clear the full ymm register. // It wants to read and write the xmm sub-register. 
unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg) .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef) .addReg(Reg, RegState::ImplicitDefine); } else return; MI->addRegisterKilled(Reg, TRI, true); } MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl &Ops, int FrameIndex) const { // Check switch flag if (NoFusing) return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls if (!MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) return nullptr; const MachineFrameInfo *MFI = MF.getFrameInfo(); unsigned Size = MFI->getObjectSize(FrameIndex); unsigned Alignment = MFI->getObjectAlignment(FrameIndex); // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) Alignment = std::min(Alignment, MF.getTarget() .getSubtargetImpl() ->getFrameLowering() ->getStackAlignment()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; switch (MI->getOpcode()) { default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break; case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break; case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break; } // Check if it's safe to fold the load. If the size of the object is // narrower than the load width, then it's not. if (Size < RCSize) return nullptr; // Change to CMPXXri r, 0 first. 
// NOTE(review): MI is rewritten to "CMPxx r, 0" here, before the fold below
// is attempted; if that fold returns nullptr, MI keeps the new opcode and
// immediate. Presumably callers tolerate this - confirm.
MI->setDesc(get(NewOpc)); MI->getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) return nullptr; SmallVector MOs; MOs.push_back(MachineOperand::CreateFI(FrameIndex)); return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment, /*AllowCommute=*/true); } static bool isPartialRegisterLoad(const MachineInstr &LoadMI, const MachineFunction &MF) { unsigned Opc = LoadMI.getOpcode(); unsigned RegSize = MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize(); if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm) && RegSize > 4) // These instructions only load 32 bits, we can't fold them if the // destination register is wider than 32 bits (4 bytes). return true; if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm) && RegSize > 8) // These instructions only load 64 bits, we can't fold them if the // destination register is wider than 64 bits (8 bytes). return true; return false; } MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI, const SmallVectorImpl &Ops, MachineInstr *LoadMI) const { // If loading from a FrameIndex, fold directly from the FrameIndex. unsigned NumOps = LoadMI->getDesc().getNumOperands(); int FrameIndex; if (isLoadFromStackSlot(LoadMI, FrameIndex)) { if (isPartialRegisterLoad(*LoadMI, MF)) return nullptr; return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex); } // Check switch flag if (NoFusing) return nullptr; // Unless optimizing for size, don't fold to avoid partial // register update stalls if (!MF.getFunction()->getAttributes(). hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) && hasPartialRegUpdate(MI->getOpcode())) return nullptr; // Determine the alignment of the load. 
// Loads with exactly one memory operand use that operand's alignment. The
// constant-materializing pseudos listed below use fixed values (32, 16, 8 or
// 4); any other load without a single memory operand is not folded.
unsigned Alignment = 0; if (LoadMI->hasOneMemOperand()) Alignment = (*LoadMI->memoperands_begin())->getAlignment(); else switch (LoadMI->getOpcode()) { case X86::AVX2_SETALLONES: case X86::AVX_SET0: Alignment = 32; break; case X86::V_SET0: case X86::V_SETALLONES: Alignment = 16; break; case X86::FsFLD0SD: Alignment = 8; break; case X86::FsFLD0SS: Alignment = 4; break; default: return nullptr; } if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; switch (MI->getOpcode()) { default: return nullptr; case X86::TEST8rr: NewOpc = X86::CMP8ri; break; case X86::TEST16rr: NewOpc = X86::CMP16ri8; break; case X86::TEST32rr: NewOpc = X86::CMP32ri8; break; case X86::TEST64rr: NewOpc = X86::CMP64ri8; break; } // Change to CMPXXri r, 0 first. MI->setDesc(get(NewOpc)); MI->getOperand(1).ChangeToImmediate(0); } else if (Ops.size() != 1) return nullptr; // Make sure the subregisters match. // Otherwise we risk changing the size of the load. if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg()) return nullptr; SmallVector MOs; switch (LoadMI->getOpcode()) { case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX2_SETALLONES: case X86::AVX_SET0: case X86::FsFLD0SD: case X86::FsFLD0SS: { // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure. // Create a constant-pool entry and operands to load from it. // Medium and large mode can't fold loads this way. if (MF.getTarget().getCodeModel() != CodeModel::Small && MF.getTarget().getCodeModel() != CodeModel::Kernel) return nullptr; // x86-32 PIC requires a PIC base register for constant pools. unsigned PICBase = 0; if (MF.getTarget().getRelocationModel() == Reloc::PIC_) { if (Subtarget.is64Bit()) PICBase = X86::RIP; else // FIXME: PICBase = getGlobalBaseReg(&MF); // This doesn't work for several reasons. // 1. GlobalBaseReg may have been spilled. // 2. It may not be live at MI. return nullptr; } // Create a constant-pool entry. 
// The constant's type follows the pseudo: float for FsFLD0SS, double for
// FsFLD0SD, an 8 x i32 vector for AVX2_SETALLONES / AVX_SET0 and a 4 x i32
// vector otherwise. Its value is all-ones for the two SETALLONES pseudos and
// zero for the rest.
MachineConstantPool &MCP = *MF.getConstantPool(); Type *Ty; unsigned Opc = LoadMI->getOpcode(); if (Opc == X86::FsFLD0SS) Ty = Type::getFloatTy(MF.getFunction()->getContext()); else if (Opc == X86::FsFLD0SD) Ty = Type::getDoubleTy(MF.getFunction()->getContext()); else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0) Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); // Create operands to load from the constant pool entry. MOs.push_back(MachineOperand::CreateReg(PICBase, false)); MOs.push_back(MachineOperand::CreateImm(1)); MOs.push_back(MachineOperand::CreateReg(0, false)); MOs.push_back(MachineOperand::CreateCPI(CPI, 0)); MOs.push_back(MachineOperand::CreateReg(0, false)); break; } default: { if (isPartialRegisterLoad(*LoadMI, MF)) return nullptr; // Folding a normal load. Just copy the load's address operands. for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i) MOs.push_back(LoadMI->getOperand(i)); break; } } return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, /*Size=*/0, Alignment, /*AllowCommute=*/true); } bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI, const SmallVectorImpl &Ops) const { // Check switch flag if (NoFusing) return 0; if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { switch (MI->getOpcode()) { default: return false; case X86::TEST8rr: case X86::TEST16rr: case X86::TEST32rr: case X86::TEST64rr: return true; case X86::ADD32ri: // FIXME: AsmPrinter doesn't know how to handle // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding. 
if (MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS) return false; break; } } if (Ops.size() != 1) return false; unsigned OpNum = Ops[0]; unsigned Opc = MI->getOpcode(); unsigned NumOps = MI->getDesc().getNumOperands(); bool isTwoAddr = NumOps > 1 && MI->getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; // Folding a memory location into the two-address part of a two-address // instruction is different than folding it other places. It requires // replacing the *two* registers with the memory location. const DenseMap > *OpcodeTablePtr = nullptr; if (isTwoAddr && NumOps >= 2 && OpNum < 2) { OpcodeTablePtr = &RegOp2MemOpTable2Addr; } else if (OpNum == 0) { // If operand 0 if (Opc == X86::MOV32r0) return true; OpcodeTablePtr = &RegOp2MemOpTable0; } else if (OpNum == 1) { OpcodeTablePtr = &RegOp2MemOpTable1; } else if (OpNum == 2) { OpcodeTablePtr = &RegOp2MemOpTable2; } else if (OpNum == 3) { OpcodeTablePtr = &RegOp2MemOpTable3; } if (OpcodeTablePtr && OpcodeTablePtr->count(Opc)) return true; return TargetInstrInfo::canFoldMemoryOperand(MI, Ops); } bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl &NewMIs) const { DenseMap >::const_iterator I = MemOp2RegOpTable.find(MI->getOpcode()); if (I == MemOp2RegOpTable.end()) return false; unsigned Opc = I->second.first; unsigned Index = I->second.second & TB_INDEX_MASK; bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; bool FoldedStore = I->second.second & TB_FOLDED_STORE; if (UnfoldLoad && !FoldedLoad) return false; UnfoldLoad &= FoldedLoad; if (UnfoldStore && !FoldedStore) return false; UnfoldStore &= FoldedStore; const MCInstrDesc &MCID = get(Opc); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); if (!MI->hasOneMemOperand() && RC == &X86::VR128RegClass && !Subtarget.isUnalignedMemAccessFast()) // Without memoperands, loadRegFromAddr and storeRegToStackSlot will // conservatively assume 
the address is unaligned. That's bad for // performance. return false; SmallVector AddrOps; SmallVector BeforeOps; SmallVector AfterOps; SmallVector ImpOps; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &Op = MI->getOperand(i); if (i >= Index && i < Index + X86::AddrNumOperands) AddrOps.push_back(Op); else if (Op.isReg() && Op.isImplicit()) ImpOps.push_back(Op); else if (i < Index) BeforeOps.push_back(Op); else if (i > Index) AfterOps.push_back(Op); } // Emit the load instruction. if (UnfoldLoad) { std::pair MMOs = MF.extractLoadMemRefs(MI->memoperands_begin(), MI->memoperands_end()); loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs); if (UnfoldStore) { // Address operands cannot be marked isKill. for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) { MachineOperand &MO = NewMIs[0]->getOperand(i); if (MO.isReg()) MO.setIsKill(false); } } } // Emit the data processing instruction. MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true); MachineInstrBuilder MIB(MF, DataMI); if (FoldedStore) MIB.addReg(Reg, RegState::Define); for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i) MIB.addOperand(BeforeOps[i]); if (FoldedLoad) MIB.addReg(Reg); for (unsigned i = 0, e = AfterOps.size(); i != e; ++i) MIB.addOperand(AfterOps[i]); for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) { MachineOperand &MO = ImpOps[i]; MIB.addReg(MO.getReg(), getDefRegState(MO.isDef()) | RegState::Implicit | getKillRegState(MO.isKill()) | getDeadRegState(MO.isDead()) | getUndefRegState(MO.isUndef())); } // Change CMP32ri r, 0 back to TEST32rr r, r, etc. 
switch (DataMI->getOpcode()) { default: break; case X86::CMP64ri32: case X86::CMP64ri8: case X86::CMP32ri: case X86::CMP32ri8: case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP8ri: { MachineOperand &MO0 = DataMI->getOperand(0); MachineOperand &MO1 = DataMI->getOperand(1); if (MO1.getImm() == 0) { unsigned NewOpc; switch (DataMI->getOpcode()) { default: llvm_unreachable("Unreachable!"); case X86::CMP64ri8: case X86::CMP64ri32: NewOpc = X86::TEST64rr; break; case X86::CMP32ri8: case X86::CMP32ri: NewOpc = X86::TEST32rr; break; case X86::CMP16ri8: case X86::CMP16ri: NewOpc = X86::TEST16rr; break; case X86::CMP8ri: NewOpc = X86::TEST8rr; break; } DataMI->setDesc(get(NewOpc)); MO1.ChangeToRegister(MO0.getReg(), false); } } } NewMIs.push_back(DataMI); // Emit the store instruction. if (UnfoldStore) { const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF); std::pair MMOs = MF.extractStoreMemRefs(MI->memoperands_begin(), MI->memoperands_end()); storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs); } return true; } bool X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, SmallVectorImpl &NewNodes) const { if (!N->isMachineOpcode()) return false; DenseMap >::const_iterator I = MemOp2RegOpTable.find(N->getMachineOpcode()); if (I == MemOp2RegOpTable.end()) return false; unsigned Opc = I->second.first; unsigned Index = I->second.second & TB_INDEX_MASK; bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; bool FoldedStore = I->second.second & TB_FOLDED_STORE; const MCInstrDesc &MCID = get(Opc); MachineFunction &MF = DAG.getMachineFunction(); const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF); unsigned NumDefs = MCID.NumDefs; std::vector AddrOps; std::vector BeforeOps; std::vector AfterOps; SDLoc dl(N); unsigned NumOps = N->getNumOperands(); for (unsigned i = 0; i != NumOps-1; ++i) { SDValue Op = N->getOperand(i); if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands) AddrOps.push_back(Op); else if (i 
< Index-NumDefs) BeforeOps.push_back(Op); else if (i > Index-NumDefs) AfterOps.push_back(Op); } SDValue Chain = N->getOperand(NumOps-1); AddrOps.push_back(Chain); // Emit the load instruction. SDNode *Load = nullptr; if (FoldedLoad) { EVT VT = *RC->vt_begin(); std::pair MMOs = MF.extractLoadMemRefs(cast(N)->memoperands_begin(), cast(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && !Subtarget.isUnalignedMemAccessFast()) // Do not introduce a slow unaligned load. return false; unsigned Alignment = RC->getSize() == 32 ? 32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl, VT, MVT::Other, AddrOps); NewNodes.push_back(Load); // Preserve memory reference information. cast(Load)->setMemRefs(MMOs.first, MMOs.second); } // Emit the data processing instruction. std::vector VTs; const TargetRegisterClass *DstRC = nullptr; if (MCID.getNumDefs() > 0) { DstRC = getRegClass(MCID, 0, &RI, MF); VTs.push_back(*DstRC->vt_begin()); } for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) { EVT VT = N->getValueType(i); if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs()) VTs.push_back(VT); } if (Load) BeforeOps.push_back(SDValue(Load, 0)); std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps)); SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps); NewNodes.push_back(NewNode); // Emit the store instruction. if (FoldedStore) { AddrOps.pop_back(); AddrOps.push_back(SDValue(NewNode, 0)); AddrOps.push_back(Chain); std::pair MMOs = MF.extractStoreMemRefs(cast(N)->memoperands_begin(), cast(N)->memoperands_end()); if (!(*MMOs.first) && RC == &X86::VR128RegClass && !Subtarget.isUnalignedMemAccessFast()) // Do not introduce a slow unaligned store. return false; unsigned Alignment = RC->getSize() == 32 ? 
32 : 16; bool isAligned = (*MMOs.first) && (*MMOs.first)->getAlignment() >= Alignment; SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget), dl, MVT::Other, AddrOps); NewNodes.push_back(Store); // Preserve memory reference information. cast(Load)->setMemRefs(MMOs.first, MMOs.second); } return true; } unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex) const { DenseMap >::const_iterator I = MemOp2RegOpTable.find(Opc); if (I == MemOp2RegOpTable.end()) return 0; bool FoldedLoad = I->second.second & TB_FOLDED_LOAD; bool FoldedStore = I->second.second & TB_FOLDED_STORE; if (UnfoldLoad && !FoldedLoad) return 0; if (UnfoldStore && !FoldedStore) return 0; if (LoadRegIndex) *LoadRegIndex = I->second.second & TB_INDEX_MASK; return I->second.first; } bool X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const { if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) return false; unsigned Opc1 = Load1->getMachineOpcode(); unsigned Opc2 = Load2->getMachineOpcode(); switch (Opc1) { default: return false; case X86::MOV8rm: case X86::MOV16rm: case X86::MOV32rm: case X86::MOV64rm: case X86::LD_Fp32m: case X86::LD_Fp64m: case X86::LD_Fp80m: case X86::MOVSSrm: case X86::MOVSDrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::FsMOVAPSrm: case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: // AVX load instructions case X86::VMOVSSrm: case X86::VMOVSDrm: case X86::FsVMOVAPSrm: case X86::FsVMOVAPDrm: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: case X86::VMOVDQArm: case X86::VMOVDQUrm: case X86::VMOVAPSYrm: case X86::VMOVUPSYrm: case X86::VMOVAPDYrm: case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: break; } switch (Opc2) { default: return false; case X86::MOV8rm: case X86::MOV16rm: case X86::MOV32rm: case X86::MOV64rm: case 
X86::LD_Fp32m: case X86::LD_Fp64m: case X86::LD_Fp80m: case X86::MOVSSrm: case X86::MOVSDrm: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: case X86::FsMOVAPSrm: case X86::FsMOVAPDrm: case X86::MOVAPSrm: case X86::MOVUPSrm: case X86::MOVAPDrm: case X86::MOVDQArm: case X86::MOVDQUrm: // AVX load instructions case X86::VMOVSSrm: case X86::VMOVSDrm: case X86::FsVMOVAPSrm: case X86::FsVMOVAPDrm: case X86::VMOVAPSrm: case X86::VMOVUPSrm: case X86::VMOVAPDrm: case X86::VMOVDQArm: case X86::VMOVDQUrm: case X86::VMOVAPSYrm: case X86::VMOVUPSYrm: case X86::VMOVAPDYrm: case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: break; } // Check if chain operands and base addresses match. if (Load1->getOperand(0) != Load2->getOperand(0) || Load1->getOperand(5) != Load2->getOperand(5)) return false; // Segment operands should match as well. if (Load1->getOperand(4) != Load2->getOperand(4)) return false; // Scale should be 1, Index should be Reg0. if (Load1->getOperand(1) == Load2->getOperand(1) && Load1->getOperand(2) == Load2->getOperand(2)) { if (cast(Load1->getOperand(1))->getZExtValue() != 1) return false; // Now let's examine the displacements. if (isa(Load1->getOperand(3)) && isa(Load2->getOperand(3))) { Offset1 = cast(Load1->getOperand(3))->getSExtValue(); Offset2 = cast(Load2->getOperand(3))->getSExtValue(); return true; } } return false; } bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const { assert(Offset2 > Offset1); if ((Offset2 - Offset1) / 8 > 64) return false; unsigned Opc1 = Load1->getMachineOpcode(); unsigned Opc2 = Load2->getMachineOpcode(); if (Opc1 != Opc2) return false; // FIXME: overly conservative? switch (Opc1) { default: break; case X86::LD_Fp32m: case X86::LD_Fp64m: case X86::LD_Fp80m: case X86::MMX_MOVD64rm: case X86::MMX_MOVQ64rm: return false; } EVT VT = Load1->getValueType(0); switch (VT.getSimpleVT().SimpleTy) { default: // XMM registers. 
In 64-bit mode we can be a bit more aggressive since we // have 16 of them to play with. if (Subtarget.is64Bit()) { if (NumLoads >= 3) return false; } else if (NumLoads) { return false; } break; case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: case MVT::f32: case MVT::f64: if (NumLoads) return false; break; } return true; } bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First, MachineInstr *Second) const { // Check if this processor supports macro-fusion. Since this is a minor // heuristic, we haven't specifically reserved a feature. hasAVX is a decent // proxy for SandyBridge+. if (!Subtarget.hasAVX()) return false; enum { FuseTest, FuseCmp, FuseInc } FuseKind; switch(Second->getOpcode()) { default: return false; case X86::JE_1: case X86::JNE_1: case X86::JL_1: case X86::JLE_1: case X86::JG_1: case X86::JGE_1: FuseKind = FuseInc; break; case X86::JB_1: case X86::JBE_1: case X86::JA_1: case X86::JAE_1: FuseKind = FuseCmp; break; case X86::JS_1: case X86::JNS_1: case X86::JP_1: case X86::JNP_1: case X86::JO_1: case X86::JNO_1: FuseKind = FuseTest; break; } switch (First->getOpcode()) { default: return false; case X86::TEST8rr: case X86::TEST16rr: case X86::TEST32rr: case X86::TEST64rr: case X86::TEST8ri: case X86::TEST16ri: case X86::TEST32ri: case X86::TEST32i32: case X86::TEST64i32: case X86::TEST64ri32: case X86::TEST8rm: case X86::TEST16rm: case X86::TEST32rm: case X86::TEST64rm: case X86::TEST8ri_NOREX: case X86::AND16i16: case X86::AND16ri: case X86::AND16ri8: case X86::AND16rm: case X86::AND16rr: case X86::AND32i32: case X86::AND32ri: case X86::AND32ri8: case X86::AND32rm: case X86::AND32rr: case X86::AND64i32: case X86::AND64ri32: case X86::AND64ri8: case X86::AND64rm: case X86::AND64rr: case X86::AND8i8: case X86::AND8ri: case X86::AND8rm: case X86::AND8rr: return true; case X86::CMP16i16: case X86::CMP16ri: case X86::CMP16ri8: case X86::CMP16rm: case X86::CMP16rr: case X86::CMP32i32: case X86::CMP32ri: case X86::CMP32ri8: case 
X86::CMP32rm: case X86::CMP32rr: case X86::CMP64i32: case X86::CMP64ri32: case X86::CMP64ri8: case X86::CMP64rm: case X86::CMP64rr: case X86::CMP8i8: case X86::CMP8ri: case X86::CMP8rm: case X86::CMP8rr: case X86::ADD16i16: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD16ri8_DB: case X86::ADD16ri_DB: case X86::ADD16rm: case X86::ADD16rr: case X86::ADD16rr_DB: case X86::ADD32i32: case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD32ri8_DB: case X86::ADD32ri_DB: case X86::ADD32rm: case X86::ADD32rr: case X86::ADD32rr_DB: case X86::ADD64i32: case X86::ADD64ri32: case X86::ADD64ri32_DB: case X86::ADD64ri8: case X86::ADD64ri8_DB: case X86::ADD64rm: case X86::ADD64rr: case X86::ADD64rr_DB: case X86::ADD8i8: case X86::ADD8mi: case X86::ADD8mr: case X86::ADD8ri: case X86::ADD8rm: case X86::ADD8rr: case X86::SUB16i16: case X86::SUB16ri: case X86::SUB16ri8: case X86::SUB16rm: case X86::SUB16rr: case X86::SUB32i32: case X86::SUB32ri: case X86::SUB32ri8: case X86::SUB32rm: case X86::SUB32rr: case X86::SUB64i32: case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB64rm: case X86::SUB64rr: case X86::SUB8i8: case X86::SUB8ri: case X86::SUB8rm: case X86::SUB8rr: return FuseKind == FuseCmp || FuseKind == FuseInc; case X86::INC16r: case X86::INC32r: case X86::INC64r: case X86::INC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r: case X86::DEC8r: return FuseKind == FuseInc; } } bool X86InstrInfo:: ReverseBranchCondition(SmallVectorImpl &Cond) const { assert(Cond.size() == 1 && "Invalid X86 branch condition!"); X86::CondCode CC = static_cast(Cond[0].getImm()); if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E) return true; Cond[0].setImm(GetOppositeBranchCondition(CC)); return false; } bool X86InstrInfo:: isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { // FIXME: Return false for x87 stack register classes for now. We can't // allow any loads of these registers before FpGet_ST0_80. 
return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass); } /// getGlobalBaseReg - Return a virtual register initialized with the /// the global base register value. Output instructions required to /// initialize the register in the function entry block, if necessary. /// /// TODO: Eliminate this and move the code to X86MachineFunctionInfo. /// unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const { assert(!Subtarget.is64Bit() && "X86-64 PIC uses RIP relative addressing"); X86MachineFunctionInfo *X86FI = MF->getInfo(); unsigned GlobalBaseReg = X86FI->getGlobalBaseReg(); if (GlobalBaseReg != 0) return GlobalBaseReg; // Create the register. The code to initialize it is inserted // later, by the CGBR pass (below). MachineRegisterInfo &RegInfo = MF->getRegInfo(); GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass); X86FI->setGlobalBaseReg(GlobalBaseReg); return GlobalBaseReg; } // These are the replaceable SSE instructions. Some of these have Int variants // that we don't include here. We don't want to replace instructions selected // by intrinsics. 
// Opcode-equivalence tables used by lookup()/lookupAVX2(). Each row lists one
// operation in three columns, in the order given by the column-header comment:
// PackedSingle, PackedDouble, PackedInt. ReplaceableInstrs holds the SSE,
// AVX 128-bit and AVX 256-bit move/logic rows; ReplaceableInstrsAVX2 holds the
// rows whose PackedInt column is a 256-bit integer (or I128 / VPBROADCAST)
// opcode. In several AVX2 rows the first two columns are the same opcode.
// NOTE(review): this text has many source lines collapsed onto one physical
// line, so the `//` comments below swallow the code that follows them; the
// code is left byte-identical here.
static const uint16_t ReplaceableInstrs[][3] = { //PackedSingle PackedDouble PackedInt { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr }, { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm }, { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr }, { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr }, { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm }, { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr }, { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm }, { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr }, { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm }, { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr }, { X86::ORPSrm, X86::ORPDrm, X86::PORrm }, { X86::ORPSrr, X86::ORPDrr, X86::PORrr }, { X86::XORPSrm, X86::XORPDrm, X86::PXORrm }, { X86::XORPSrr, X86::XORPDrr, X86::PXORrr }, // AVX 128-bit support { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr }, { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm }, { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr }, { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr }, { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm }, { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr }, { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm }, { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr }, { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm }, { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr }, { X86::VORPSrm, X86::VORPDrm, X86::VPORrm }, { X86::VORPSrr, X86::VORPDrr, X86::VPORrr }, { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm }, { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr }, // AVX 256-bit support { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr }, { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm }, { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr }, { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr }, { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm }, { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr } }; static const uint16_t ReplaceableInstrsAVX2[][3] = { //PackedSingle PackedDouble PackedInt { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm }, { X86::VANDNPSYrr, 
X86::VANDNPDYrr, X86::VPANDNYrr }, { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm }, { X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDYrr }, { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm }, { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr }, { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm }, { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr }, { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr }, { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr }, { X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm }, { X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr }, { X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm }, { X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr }, { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm}, { X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr}, { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr}, { X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm}, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr}, { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm} }; // FIXME: Some shuffle and unpack instructions have equivalents in different // domains, but they require a bit more work than just switching opcodes. 
// Execution-domain helpers.
//  - lookup / lookupAVX2: linearly scan ReplaceableInstrs /
//    ReplaceableInstrsAVX2 for a row whose column (domain-1) equals `opcode`;
//    return that row, or nullptr if none matches. `domain` must be non-zero,
//    since it is used as `domain-1` to index the row.
//  - getExecutionDomain: extracts the 2-bit domain from the instruction's
//    TSFlags (shifted by X86II::SSEDomainShift) and returns it paired with a
//    mask of valid domains: 0xe when the opcode is in ReplaceableInstrs; for
//    an opcode found only in the AVX2 table, 0xe with AVX2 and 0x6 without;
//    0 otherwise.
//  - setExecutionDomain: asserts Domain is 1..3 and that MI has a non-zero
//    domain, then rewrites MI's descriptor to the opcode in column (Domain-1)
//    of the matching row, trying the AVX2 table if the first lookup fails.
//  - getNoopForMachoTarget: sets NopInst's opcode to X86::NOOP.
// NOTE(review): getExecutionDomain's return type reads `std::pair` with no
// template arguments here -- they appear to have been stripped from this
// text; confirm against the upstream file.
static const uint16_t *lookup(unsigned opcode, unsigned domain) { for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i) if (ReplaceableInstrs[i][domain-1] == opcode) return ReplaceableInstrs[i]; return nullptr; } static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) { for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) if (ReplaceableInstrsAVX2[i][domain-1] == opcode) return ReplaceableInstrsAVX2[i]; return nullptr; } std::pair X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const { uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; bool hasAVX2 = Subtarget.hasAVX2(); uint16_t validDomains = 0; if (domain && lookup(MI->getOpcode(), domain)) validDomains = 0xe; else if (domain && lookupAVX2(MI->getOpcode(), domain)) validDomains = hasAVX2 ? 0xe : 0x6; return std::make_pair(domain, validDomains); } void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { assert(Domain>0 && Domain<4 && "Invalid execution domain"); uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; assert(dom && "Not an SSE instruction"); const uint16_t *table = lookup(MI->getOpcode(), dom); if (!table) { // try the other table assert((Subtarget.hasAVX2() || Domain < 3) && "256-bit vector operations only available in AVX2"); table = lookupAVX2(MI->getOpcode(), dom); } assert(table && "Cannot change domain"); MI->setDesc(get(table[Domain-1])); } /// getNoopForMachoTarget - Return the noop instruction to use for a noop. void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const { NopInst.setOpcode(X86::NOOP); } // This code must remain in sync with getJumpInstrTableEntryBound in this class! // In particular, getJumpInstrTableEntryBound must always return an upper bound // on the encoding lengths of the instructions generated by // getUnconditionalBranch and getTrap. 
// Contents of this span, in order:
//  - getUnconditionalBranch: sets Branch to X86::JMP_1 with BranchTarget as
//    its expression operand.
//  - getTrap: sets MI's opcode to X86::TRAP.
//  - getJumpInstrTableEntryBound: returns 5.
//    NOTE(review): its comment justifies the bound with JMP_4, while
//    getUnconditionalBranch emits X86::JMP_1 -- confirm the comment and the
//    bound still describe the emitted instruction.
//  - isHighLatencyDef: true for the listed DIV/SQRT (SSE, AVX, AVX-512) and
//    AVX-512 gather/scatter opcodes, false otherwise.
//  - hasHighOperandLatency: returns isHighLatencyDef(DefMI's opcode); the
//    other parameters are not read.
//  - CGBR pass: on non-64-bit PIC functions that requested a global base
//    register, inserts MOVPC32r at the start of the first block and, for
//    GOT-style PIC, an ADD32ri of _GLOBAL_OFFSET_TABLE_ into GlobalBaseReg.
//  - createX86GlobalBaseRegPass: returns a new CGBR.
//  - LDTLSCleanup pass: when the function has at least two local-dynamic TLS
//    accesses, walks the dominator tree in pre-order; the first
//    TLS_base_addr32/64 seen has its RAX/EAX result copied into a new virtual
//    register, and later ones in dominated blocks are replaced by a copy from
//    that register.
//  - createCleanupLocalDynamicTLSPass: returns a new LDTLSCleanup.
//  - The text after createCleanupLocalDynamicTLSPass is the diff header for
//    X86InstrInfo.h and is not part of this translation unit.
// NOTE(review): template arguments (static_cast, getSubtarget, getInfo,
// getAnalysis, addRequired) appear stripped from this text; code is left
// byte-identical.
void X86InstrInfo::getUnconditionalBranch( MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const { Branch.setOpcode(X86::JMP_1); Branch.addOperand(MCOperand::CreateExpr(BranchTarget)); } // This code must remain in sync with getJumpInstrTableEntryBound in this class! // In particular, getJumpInstrTableEntryBound must always return an upper bound // on the encoding lengths of the instructions generated by // getUnconditionalBranch and getTrap. void X86InstrInfo::getTrap(MCInst &MI) const { MI.setOpcode(X86::TRAP); } // See getTrap and getUnconditionalBranch for conditions on the value returned // by this function. unsigned X86InstrInfo::getJumpInstrTableEntryBound() const { // 5 bytes suffice: JMP_4 Symbol@PLT uses 1 byte (E9) for the JMP_4 and 4 // bytes for the symbol offset. And TRAP is ud2, which is two bytes (0F 0B). return 5; } bool X86InstrInfo::isHighLatencyDef(int opc) const { switch (opc) { default: return false; case X86::DIVSDrm: case X86::DIVSDrm_Int: case X86::DIVSDrr: case X86::DIVSDrr_Int: case X86::DIVSSrm: case X86::DIVSSrm_Int: case X86::DIVSSrr: case X86::DIVSSrr_Int: case X86::SQRTPDm: case X86::SQRTPDr: case X86::SQRTPSm: case X86::SQRTPSr: case X86::SQRTSDm: case X86::SQRTSDm_Int: case X86::SQRTSDr: case X86::SQRTSDr_Int: case X86::SQRTSSm: case X86::SQRTSSm_Int: case X86::SQRTSSr: case X86::SQRTSSr_Int: // AVX instructions with high latency case X86::VDIVSDrm: case X86::VDIVSDrm_Int: case X86::VDIVSDrr: case X86::VDIVSDrr_Int: case X86::VDIVSSrm: case X86::VDIVSSrm_Int: case X86::VDIVSSrr: case X86::VDIVSSrr_Int: case X86::VSQRTPDm: case X86::VSQRTPDr: case X86::VSQRTPSm: case X86::VSQRTPSr: case X86::VSQRTSDm: case X86::VSQRTSDm_Int: case X86::VSQRTSDr: case X86::VSQRTSSm: case X86::VSQRTSSm_Int: case X86::VSQRTSSr: case X86::VSQRTPDZm: case X86::VSQRTPDZr: case X86::VSQRTPSZm: case X86::VSQRTPSZr: case X86::VSQRTSDZm: case X86::VSQRTSDZm_Int: case X86::VSQRTSDZr: case X86::VSQRTSSZm_Int: case X86::VSQRTSSZr: case X86::VSQRTSSZm: 
case X86::VDIVSDZrm: case X86::VDIVSDZrr: case X86::VDIVSSZrm: case X86::VDIVSSZrr: case X86::VGATHERQPSZrm: case X86::VGATHERQPDZrm: case X86::VGATHERDPDZrm: case X86::VGATHERDPSZrm: case X86::VPGATHERQDZrm: case X86::VPGATHERQQZrm: case X86::VPGATHERDDZrm: case X86::VPGATHERDQZrm: case X86::VSCATTERQPDZmr: case X86::VSCATTERQPSZmr: case X86::VSCATTERDPDZmr: case X86::VSCATTERDPSZmr: case X86::VPSCATTERQDZmr: case X86::VPSCATTERQQZmr: case X86::VPSCATTERDDZmr: case X86::VPSCATTERDQZmr: return true; } } bool X86InstrInfo:: hasHighOperandLatency(const InstrItineraryData *ItinData, const MachineRegisterInfo *MRI, const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const { return isHighLatencyDef(DefMI->getOpcode()); } namespace { /// CGBR - Create Global Base Reg pass. This initializes the PIC /// global base register for x86-32. struct CGBR : public MachineFunctionPass { static char ID; CGBR() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { const X86TargetMachine *TM = static_cast(&MF.getTarget()); // Don't do anything if this is 64-bit as 64-bit PIC // uses RIP relative addressing. if (TM->getSubtarget().is64Bit()) return false; // Only emit a global base reg in PIC mode. if (TM->getRelocationModel() != Reloc::PIC_) return false; X86MachineFunctionInfo *X86FI = MF.getInfo(); unsigned GlobalBaseReg = X86FI->getGlobalBaseReg(); // If we didn't need a GlobalBaseReg, don't insert code. 
if (GlobalBaseReg == 0) return false; // Insert the set of GlobalBaseReg into the first MBB of the function MachineBasicBlock &FirstMBB = MF.front(); MachineBasicBlock::iterator MBBI = FirstMBB.begin(); DebugLoc DL = FirstMBB.findDebugLoc(MBBI); MachineRegisterInfo &RegInfo = MF.getRegInfo(); const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); unsigned PC; if (TM->getSubtarget().isPICStyleGOT()) PC = RegInfo.createVirtualRegister(&X86::GR32RegClass); else PC = GlobalBaseReg; // Operand of MovePCtoStack is completely ignored by asm printer. It's // only used in JIT code emission as displacement to pc. BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0); // If we're using vanilla 'GOT' PIC style, we should use relative addressing // not to pc, but to _GLOBAL_OFFSET_TABLE_ external. if (TM->getSubtarget().isPICStyleGOT()) { // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg) .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_", X86II::MO_GOT_ABSOLUTE_ADDRESS); } return true; } const char *getPassName() const override { return "X86 PIC Global Base Reg Initialization"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); MachineFunctionPass::getAnalysisUsage(AU); } }; } char CGBR::ID = 0; FunctionPass* llvm::createX86GlobalBaseRegPass() { return new CGBR(); } namespace { struct LDTLSCleanup : public MachineFunctionPass { static char ID; LDTLSCleanup() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { X86MachineFunctionInfo* MFI = MF.getInfo(); if (MFI->getNumLocalDynamicTLSAccesses() < 2) { // No point folding accesses if there isn't at least two. return false; } MachineDominatorTree *DT = &getAnalysis(); return VisitNode(DT->getRootNode(), 0); } // Visit the dominator subtree rooted at Node in pre-order. 
// If TLSBaseAddrReg is non-null, then use that to replace any // TLS_base_addr instructions. Otherwise, create the register // when the first such instruction is seen, and then use it // as we encounter more instructions. bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) { MachineBasicBlock *BB = Node->getBlock(); bool Changed = false; // Traverse the current block. for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { switch (I->getOpcode()) { case X86::TLS_base_addr32: case X86::TLS_base_addr64: if (TLSBaseAddrReg) I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg); else I = SetRegister(I, &TLSBaseAddrReg); Changed = true; break; default: break; } } // Visit the children of this block in the dominator tree. for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end(); I != E; ++I) { Changed |= VisitNode(*I, TLSBaseAddrReg); } return Changed; } // Replace the TLS_base_addr instruction I with a copy from // TLSBaseAddrReg, returning the new instruction. MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I, unsigned TLSBaseAddrReg) { MachineFunction *MF = I->getParent()->getParent(); const X86TargetMachine *TM = static_cast(&MF->getTarget()); const bool is64Bit = TM->getSubtarget().is64Bit(); const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); // Insert a Copy from TLSBaseAddrReg to RAX/EAX. MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX) .addReg(TLSBaseAddrReg); // Erase the TLS_base_addr instruction. I->eraseFromParent(); return Copy; } // Create a virtual register in *TLSBaseAddrReg, and populate it by // inserting a copy instruction after I. Returns the new instruction. 
MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { MachineFunction *MF = I->getParent()->getParent(); const X86TargetMachine *TM = static_cast(&MF->getTarget()); const bool is64Bit = TM->getSubtarget().is64Bit(); const X86InstrInfo *TII = TM->getSubtargetImpl()->getInstrInfo(); // Create a virtual register for the TLS base address. MachineRegisterInfo &RegInfo = MF->getRegInfo(); *TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit ? &X86::GR64RegClass : &X86::GR32RegClass); // Insert a copy from RAX/EAX to TLSBaseAddrReg. MachineInstr *Next = I->getNextNode(); MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(), TII->get(TargetOpcode::COPY), *TLSBaseAddrReg) .addReg(is64Bit ? X86::RAX : X86::EAX); return Copy; } const char *getPassName() const override { return "Local Dynamic TLS Access Clean-up"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } }; } char LDTLSCleanup::ID = 0; FunctionPass* llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); } diff --git a/contrib/llvm/lib/Target/X86/X86InstrInfo.h b/contrib/llvm/lib/Target/X86/X86InstrInfo.h index 5662e86932c2..4d15467f0ca3 100644 --- a/contrib/llvm/lib/Target/X86/X86InstrInfo.h +++ b/contrib/llvm/lib/Target/X86/X86InstrInfo.h @@ -1,468 +1,473 @@ //===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains the X86 implementation of the TargetInstrInfo class. 
// //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H #define LLVM_LIB_TARGET_X86_X86INSTRINFO_H #include "MCTargetDesc/X86BaseInfo.h" #include "X86RegisterInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/Target/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "X86GenInstrInfo.inc" namespace llvm { class X86RegisterInfo; class X86Subtarget; namespace X86 { // X86 specific condition code. These correspond to X86_*_COND in // X86InstrInfo.td. They must be kept in synch. enum CondCode { COND_A = 0, COND_AE = 1, COND_B = 2, COND_BE = 3, COND_E = 4, COND_G = 5, COND_GE = 6, COND_L = 7, COND_LE = 8, COND_NE = 9, COND_NO = 10, COND_NP = 11, COND_NS = 12, COND_O = 13, COND_P = 14, COND_S = 15, LAST_VALID_COND = COND_S, // Artificial condition codes. These are used by AnalyzeBranch // to indicate a block terminated with two conditional branches to // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE, // which can't be represented on x86 with a single condition. These // are never used in MachineInstrs. COND_NE_OR_P, COND_NP_OR_E, COND_INVALID }; // Turn condition code into conditional branch opcode. unsigned GetCondBranchFromCond(CondCode CC); /// \brief Return a set opcode for the given condition and whether it has /// a memory operand. unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false); /// \brief Return a cmov opcode for the given condition, register size in /// bytes, and operand type. unsigned getCMovFromCond(CondCode CC, unsigned RegBytes, bool HasMemoryOperand = false); // Turn CMov opcode into condition code. CondCode getCondFromCMovOpc(unsigned Opc); /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. 
CondCode GetOppositeBranchCondition(CondCode CC); } // end namespace X86; /// isGlobalStubReference - Return true if the specified TargetFlag operand is /// a reference to a stub for a global, not the global itself. inline static bool isGlobalStubReference(unsigned char TargetFlag) { switch (TargetFlag) { case X86II::MO_DLLIMPORT: // dllimport stub. case X86II::MO_GOTPCREL: // rip-relative GOT reference. case X86II::MO_GOT: // normal GOT reference. case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref. case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref. case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref. return true; default: return false; } } /// isGlobalRelativeToPICBase - Return true if the specified global value /// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this /// is true, the addressing mode has the PIC base register added in (e.g. EBX). inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) { switch (TargetFlag) { case X86II::MO_GOTOFF: // isPICStyleGOT: local global. case X86II::MO_GOT: // isPICStyleGOT: other global. case X86II::MO_PIC_BASE_OFFSET: // Darwin local global. case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global. case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global. case X86II::MO_TLVP: // ??? Pretty sure.. 
return true; default: return false; } } inline static bool isScale(const MachineOperand &MO) { return MO.isImm() && (MO.getImm() == 1 || MO.getImm() == 2 || MO.getImm() == 4 || MO.getImm() == 8); } inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) { if (MI->getOperand(Op).isFI()) return true; return Op+X86::AddrSegmentReg <= MI->getNumOperands() && MI->getOperand(Op+X86::AddrBaseReg).isReg() && isScale(MI->getOperand(Op+X86::AddrScaleAmt)) && MI->getOperand(Op+X86::AddrIndexReg).isReg() && (MI->getOperand(Op+X86::AddrDisp).isImm() || MI->getOperand(Op+X86::AddrDisp).isGlobal() || MI->getOperand(Op+X86::AddrDisp).isCPI() || MI->getOperand(Op+X86::AddrDisp).isJTI()); } inline static bool isMem(const MachineInstr *MI, unsigned Op) { if (MI->getOperand(Op).isFI()) return true; return Op+X86::AddrNumOperands <= MI->getNumOperands() && MI->getOperand(Op+X86::AddrSegmentReg).isReg() && isLeaMem(MI, Op); } class X86InstrInfo final : public X86GenInstrInfo { X86Subtarget &Subtarget; const X86RegisterInfo RI; /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1, /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps. /// typedef DenseMap > RegOp2MemOpTableType; RegOp2MemOpTableType RegOp2MemOpTable2Addr; RegOp2MemOpTableType RegOp2MemOpTable0; RegOp2MemOpTableType RegOp2MemOpTable1; RegOp2MemOpTableType RegOp2MemOpTable2; RegOp2MemOpTableType RegOp2MemOpTable3; RegOp2MemOpTableType RegOp2MemOpTable4; /// MemOp2RegOpTable - Load / store unfolding opcode map. /// typedef DenseMap > MemOp2RegOpTableType; MemOp2RegOpTableType MemOp2RegOpTable; static void AddTableEntry(RegOp2MemOpTableType &R2MTable, MemOp2RegOpTableType &M2RTable, unsigned RegOp, unsigned MemOp, unsigned Flags); virtual void anchor(); public: explicit X86InstrInfo(X86Subtarget &STI); /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. 
As /// such, whenever a client has an instance of instruction info, it should /// always be able to get register info as well (through this method). /// const X86RegisterInfo &getRegisterInfo() const { return RI; } + /// getSPAdjust - This returns the stack pointer adjustment made by + /// this instruction. For x86, we need to handle more complex call + /// sequences involving PUSHes. + int getSPAdjust(const MachineInstr *MI) const override; + /// isCoalescableExtInstr - Return true if the instruction is a "coalescable" /// extension instruction. That is, it's like a copy where it's legal for the /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns /// true, then it's expected the pre-extension value is available as a subreg /// of the result register. This also returns the sub-register index in /// SubIdx. bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg, unsigned &SubIdx) const override; unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const override; /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination /// stack locations as well. This uses a heuristic so it isn't /// reliable for correctness. unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, int &FrameIndex) const override; unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const override; /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination /// stack locations as well. This uses a heuristic so it isn't /// reliable for correctness. 
unsigned isStoreToStackSlotPostFE(const MachineInstr *MI, int &FrameIndex) const override; bool isReallyTriviallyReMaterializable(const MachineInstr *MI, AliasAnalysis *AA) const override; void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig, const TargetRegisterInfo &TRI) const override; /// Given an operand within a MachineInstr, insert preceding code to put it /// into the right format for a particular kind of LEA instruction. This may /// involve using an appropriate super-register instead (with an implicit use /// of the original) or creating a new virtual register and inserting COPY /// instructions to get the data into the right class. /// /// Reference parameters are set to indicate how caller should add this /// operand to the LEA instruction. bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc, bool &isKill, bool &isUndef, MachineOperand &ImplicitOp) const; /// convertToThreeAddress - This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target /// may be able to convert a two-address instruction into a true /// three-address instruction on demand. This allows the X86 target (for /// example) to convert ADD and SHL instructions into LEA instructions if they /// would require register copies due to two-addressness. /// /// This method returns a null pointer if the transformation cannot be /// performed, otherwise it returns the new instruction. /// MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const override; /// commuteInstruction - We have a few instructions that must be hacked on to /// commute them. 
/// MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override; bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2) const override; // Branch analysis. bool isUnpredicatedTerminator(const MachineInstr* MI) const override; bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, bool AllowModify) const override; unsigned RemoveBranch(MachineBasicBlock &MBB) const override; unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl &Cond, DebugLoc DL) const override; bool canInsertSelect(const MachineBasicBlock&, const SmallVectorImpl &Cond, unsigned, unsigned, int&, int&, int&) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, unsigned DstReg, const SmallVectorImpl &Cond, unsigned TrueReg, unsigned FalseReg) const override; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill, SmallVectorImpl &Addr, const TargetRegisterClass *RC, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl &NewMIs) const; void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, SmallVectorImpl &Addr, const TargetRegisterClass *RC, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, SmallVectorImpl &NewMIs) const; bool 
expandPostRAPseudo(MachineBasicBlock::iterator MI) const override; /// foldMemoryOperand - If this target supports it, fold a load or store of /// the specified stack slot into the specified machine instruction for the /// specified operand(s). If this is possible, the target should perform the /// folding and return true, otherwise it should return false. If it folds /// the instruction, it is likely that the MachineInstruction the iterator /// references has been changed. MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI, const SmallVectorImpl &Ops, int FrameIndex) const override; /// foldMemoryOperand - Same as the previous version except it allows folding /// of any load and store from / to any address, not just from a specific /// stack slot. MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI, const SmallVectorImpl &Ops, MachineInstr* LoadMI) const override; /// canFoldMemoryOperand - Returns true if the specified load / store is /// folding is possible. bool canFoldMemoryOperand(const MachineInstr*, const SmallVectorImpl &) const override; /// unfoldMemoryOperand - Separate a single instruction which folded a load or /// a store or a load and a store into two or more instruction. If this is /// possible, returns true as well as the new instructions by reference. bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, unsigned Reg, bool UnfoldLoad, bool UnfoldStore, SmallVectorImpl &NewMIs) const override; bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, SmallVectorImpl &NewNodes) const override; /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new /// instruction after load / store are unfolded from an instruction of the /// specified opcode. It returns zero if the specified unfolding is not /// possible. If LoadRegIndex is non-null, it is filled in with the operand /// index of the operand which will hold the register holding the loaded /// value. 
unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, unsigned *LoadRegIndex = nullptr) const override; /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler /// to determine if two loads are loading from the same base address. It /// should only return true if the base pointers are the same and the /// only differences between the two addresses are the offset. It also returns /// the offsets by reference. bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1, int64_t &Offset2) const override; /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should /// be scheduled togther. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together. This function takes two integers that represent the load offsets /// from the common base address. It returns true if it decides it's desirable /// to schedule the two loads together. "NumLoads" is the number of loads that /// have already been scheduled after Load1. bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, int64_t Offset2, unsigned NumLoads) const override; bool shouldScheduleAdjacent(MachineInstr* First, MachineInstr *Second) const override; void getNoopForMachoTarget(MCInst &NopInst) const override; bool ReverseBranchCondition(SmallVectorImpl &Cond) const override; /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine /// instruction that defines the specified register class. bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override; /// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction tha /// would clobber the EFLAGS condition register. Note the result may be /// conservative. 
If it cannot definitely determine the safety after visiting /// a few instructions in each direction it assumes it's not safe. bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const; static bool isX86_64ExtendedReg(const MachineOperand &MO) { if (!MO.isReg()) return false; return X86II::isX86_64ExtendedReg(MO.getReg()); } /// getGlobalBaseReg - Return a virtual register initialized with the /// the global base register value. Output instructions required to /// initialize the register in the function entry block, if necessary. /// unsigned getGlobalBaseReg(MachineFunction *MF) const; std::pair getExecutionDomain(const MachineInstr *MI) const override; void setExecutionDomain(MachineInstr *MI, unsigned Domain) const override; unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override; unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum, const TargetRegisterInfo *TRI) const override; void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum, const TargetRegisterInfo *TRI) const override; MachineInstr* foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI, unsigned OpNum, const SmallVectorImpl &MOs, unsigned Size, unsigned Alignment, bool AllowCommute) const; void getUnconditionalBranch(MCInst &Branch, const MCSymbolRefExpr *BranchTarget) const override; void getTrap(MCInst &MI) const override; unsigned getJumpInstrTableEntryBound() const override; bool isHighLatencyDef(int opc) const override; bool hasHighOperandLatency(const InstrItineraryData *ItinData, const MachineRegisterInfo *MRI, const MachineInstr *DefMI, unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const override; /// analyzeCompare - For a comparison instruction, return the source registers /// in SrcReg and SrcReg2 if having two register operands, and the value it /// compares against in CmpValue. 
Return true if the comparison instruction /// can be analyzed. bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg, unsigned &SrcReg2, int &CmpMask, int &CmpValue) const override; /// optimizeCompareInstr - Check if there exists an earlier instruction that /// operates on the same source operands and sets flags in the same way as /// Compare; remove Compare if possible. bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; /// optimizeLoadInstr - Try to remove the load by folding it to a register /// operand at the use. We fold the load instructions if and only if the /// def and use are in the same BB. We only look at one load and see /// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register /// defined by the load we are trying to fold. DefMI returns the machine /// instruction that defines FoldAsLoadDefReg, and the function returns /// the machine instruction generated due to folding. MachineInstr* optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI, unsigned &FoldAsLoadDefReg, MachineInstr *&DefMI) const override; private: MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc, MachineFunction::iterator &MFI, MachineBasicBlock::iterator &MBBI, LiveVariables *LV) const; /// isFrameOperand - Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. 
bool isFrameOperand(const MachineInstr *MI, unsigned int Op, int &FrameIndex) const; }; } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h index b23a744da686..9fd03a7059cf 100644 --- a/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/contrib/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -1,168 +1,176 @@ //===-- X86MachineFuctionInfo.h - X86 machine function info -----*- C++ -*-===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file declares X86-specific per-machine-function information. // //===----------------------------------------------------------------------===// #ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H #define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineValueType.h" #include namespace llvm { /// X86MachineFunctionInfo - This class is derived from MachineFunction and /// contains private X86 target-specific information for each MachineFunction. class X86MachineFunctionInfo : public MachineFunctionInfo { virtual void anchor(); /// ForceFramePointer - True if the function is required to use of frame /// pointer for reasons other than it containing dynamic allocation or /// that FP eliminatation is turned off. For example, Cygwin main function /// contains stack pointer re-alignment code which requires FP. bool ForceFramePointer; /// RestoreBasePointerOffset - Non-zero if the function has base pointer /// and makes call to llvm.eh.sjlj.setjmp. When non-zero, the value is a /// displacement from the frame pointer to a slot where the base pointer /// is stashed. 
signed char RestoreBasePointerOffset; /// CalleeSavedFrameSize - Size of the callee-saved register portion of the /// stack frame in bytes. unsigned CalleeSavedFrameSize; /// BytesToPopOnReturn - Number of bytes function pops on return (in addition /// to the space used by the return address). /// Used on windows platform for stdcall & fastcall name decoration unsigned BytesToPopOnReturn; /// ReturnAddrIndex - FrameIndex for return slot. int ReturnAddrIndex; /// TailCallReturnAddrDelta - The number of bytes by which return address /// stack slot is moved as the result of tail call optimization. int TailCallReturnAddrDelta; /// SRetReturnReg - Some subtargets require that sret lowering includes /// returning the value of the returned struct in a register. This field /// holds the virtual register into which the sret argument is passed. unsigned SRetReturnReg; /// GlobalBaseReg - keeps track of the virtual register initialized for /// use as the global base register. This is used for PIC in some PIC /// relocation models. unsigned GlobalBaseReg; /// VarArgsFrameIndex - FrameIndex for start of varargs area. int VarArgsFrameIndex; /// RegSaveFrameIndex - X86-64 vararg func register save area. int RegSaveFrameIndex; /// VarArgsGPOffset - X86-64 vararg func int reg offset. unsigned VarArgsGPOffset; /// VarArgsFPOffset - X86-64 vararg func fp reg offset. unsigned VarArgsFPOffset; /// ArgumentStackSize - The number of bytes on stack consumed by the arguments /// being passed on the stack. unsigned ArgumentStackSize; /// NumLocalDynamics - Number of local-dynamic TLS accesses. unsigned NumLocalDynamics; + /// HasPushSequences - Keeps track of whether this function uses sequences + /// of pushes to pass function parameters. + bool HasPushSequences; private: /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. 
SmallVector ForwardedMustTailRegParms; public: X86MachineFunctionInfo() : ForceFramePointer(false), RestoreBasePointerOffset(0), CalleeSavedFrameSize(0), BytesToPopOnReturn(0), ReturnAddrIndex(0), TailCallReturnAddrDelta(0), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0), RegSaveFrameIndex(0), VarArgsGPOffset(0), VarArgsFPOffset(0), ArgumentStackSize(0), - NumLocalDynamics(0) {} + NumLocalDynamics(0), + HasPushSequences(false) {} explicit X86MachineFunctionInfo(MachineFunction &MF) : ForceFramePointer(false), RestoreBasePointerOffset(0), CalleeSavedFrameSize(0), BytesToPopOnReturn(0), ReturnAddrIndex(0), TailCallReturnAddrDelta(0), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0), RegSaveFrameIndex(0), VarArgsGPOffset(0), VarArgsFPOffset(0), ArgumentStackSize(0), - NumLocalDynamics(0) {} + NumLocalDynamics(0), + HasPushSequences(false) {} bool getForceFramePointer() const { return ForceFramePointer;} void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; } + bool getHasPushSequences() const { return HasPushSequences; } + void setHasPushSequences(bool HasPush) { HasPushSequences = HasPush; } + bool getRestoreBasePointer() const { return RestoreBasePointerOffset!=0; } void setRestoreBasePointer(const MachineFunction *MF); int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; } unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; } void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; } unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; } void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;} int getRAIndex() const { return ReturnAddrIndex; } void setRAIndex(int Index) { ReturnAddrIndex = Index; } int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; } void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;} unsigned getSRetReturnReg() const { return SRetReturnReg; } void setSRetReturnReg(unsigned Reg) { 
SRetReturnReg = Reg; } unsigned getGlobalBaseReg() const { return GlobalBaseReg; } void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; } int getVarArgsFrameIndex() const { return VarArgsFrameIndex; } void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; } int getRegSaveFrameIndex() const { return RegSaveFrameIndex; } void setRegSaveFrameIndex(int Idx) { RegSaveFrameIndex = Idx; } unsigned getVarArgsGPOffset() const { return VarArgsGPOffset; } void setVarArgsGPOffset(unsigned Offset) { VarArgsGPOffset = Offset; } unsigned getVarArgsFPOffset() const { return VarArgsFPOffset; } void setVarArgsFPOffset(unsigned Offset) { VarArgsFPOffset = Offset; } unsigned getArgumentStackSize() const { return ArgumentStackSize; } void setArgumentStackSize(unsigned size) { ArgumentStackSize = size; } unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; } SmallVectorImpl &getForwardedMustTailRegParms() { return ForwardedMustTailRegParms; } }; } // End llvm namespace #endif diff --git a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp index 09e651cebfb9..0fa38f453706 100644 --- a/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/contrib/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1,730 +1,731 @@ //===-- X86RegisterInfo.cpp - X86 Register Information --------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file contains the X86 implementation of the TargetRegisterInfo class. // This file is responsible for the frame pointer elimination optimization // on X86. 
// //===----------------------------------------------------------------------===// #include "X86RegisterInfo.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineValueType.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "X86GenRegisterInfo.inc" cl::opt ForceStackAlign("force-align-stack", cl::desc("Force align the stack to the minimum alignment" " needed for the function."), cl::init(false), cl::Hidden); static cl::opt EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); X86RegisterInfo::X86RegisterInfo(const X86Subtarget &STI) : X86GenRegisterInfo( (STI.is64Bit() ? X86::RIP : X86::EIP), X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), false), X86_MC::getDwarfRegFlavour(STI.getTargetTriple(), true), (STI.is64Bit() ? X86::RIP : X86::EIP)), Subtarget(STI) { X86_MC::InitLLVM2SEHRegisterMapping(this); // Cache some information. Is64Bit = Subtarget.is64Bit(); IsWin64 = Subtarget.isTargetWin64(); if (Is64Bit) { SlotSize = 8; StackPtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ? 
X86::RSP : X86::ESP; FramePtr = (Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64()) ? X86::RBP : X86::EBP; } else { SlotSize = 4; StackPtr = X86::ESP; FramePtr = X86::EBP; } // Use a callee-saved register as the base pointer. These registers must // not conflict with any ABI requirements. For example, in 32-bit mode PIC // requires GOT in the EBX register before function calls via PLT GOT pointer. BasePtr = Is64Bit ? X86::RBX : X86::ESI; } bool X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const { // ExeDepsFixer and PostRAScheduler require liveness. return true; } int X86RegisterInfo::getSEHRegNum(unsigned i) const { return getEncodingValue(i); } const TargetRegisterClass * X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const { // The sub_8bit sub-register index is more constrained in 32-bit mode. // It behaves just like the sub_8bit_hi index. if (!Is64Bit && Idx == X86::sub_8bit) Idx = X86::sub_8bit_hi; // Forward to TableGen's default version. return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx); } const TargetRegisterClass * X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned SubIdx) const { // The sub_8bit sub-register index is more constrained in 32-bit mode. if (!Is64Bit && SubIdx == X86::sub_8bit) { A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi); if (!A) return nullptr; } return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx); } const TargetRegisterClass* X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{ // Don't allow super-classes of GR8_NOREX. This class is only used after // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied // to the full GR8 register class in 64-bit mode, so we cannot allow the // reigster class inflation. 
// // The GR8_NOREX class is always used in a way that won't be constrained to a // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the // full GR8 class. if (RC == &X86::GR8_NOREXRegClass) return RC; const TargetRegisterClass *Super = RC; TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); do { switch (Super->getID()) { case X86::GR8RegClassID: case X86::GR16RegClassID: case X86::GR32RegClassID: case X86::GR64RegClassID: case X86::FR32RegClassID: case X86::FR64RegClassID: case X86::RFP32RegClassID: case X86::RFP64RegClassID: case X86::RFP80RegClassID: case X86::VR128RegClassID: case X86::VR256RegClassID: // Don't return a super-class that would shrink the spill size. // That can happen with the vector and float classes. if (Super->getSize() == RC->getSize()) return Super; } Super = *I++; } while (Super); return RC; } const TargetRegisterClass * X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. if (Subtarget.isTarget64BitLP64()) return &X86::GR64RegClass; return &X86::GR32RegClass; case 1: // Normal GPRs except the stack pointer (for encoding reasons). if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOSPRegClass; return &X86::GR32_NOSPRegClass; case 2: // Available for tailcall (not callee-saved GPRs). if (Subtarget.isTargetWin64()) return &X86::GR64_TCW64RegClass; else if (Subtarget.is64Bit()) return &X86::GR64_TCRegClass; const Function *F = MF.getFunction(); bool hasHipeCC = (F ? 
F->getCallingConv() == CallingConv::HiPE : false); if (hasHipeCC) return &X86::GR32RegClass; return &X86::GR32_TCRegClass; } } const TargetRegisterClass * X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { if (RC == &X86::CCRRegClass) { if (Is64Bit) return &X86::GR64RegClass; else return &X86::GR32RegClass; } return RC; } unsigned X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; switch (RC->getID()) { default: return 0; case X86::GR32RegClassID: return 4 - FPDiff; case X86::GR64RegClassID: return 12 - FPDiff; case X86::VR128RegClassID: return Subtarget.is64Bit() ? 10 : 4; case X86::VR64RegClassID: return 4; } } const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); assert(MF && "MachineFunction required"); switch (MF->getFunction()->getCallingConv()) { case CallingConv::GHC: case CallingConv::HiPE: return CSR_NoRegs_SaveList; case CallingConv::AnyReg: if (HasAVX) return CSR_64_AllRegs_AVX_SaveList; return CSR_64_AllRegs_SaveList; case CallingConv::PreserveMost: return CSR_64_RT_MostRegs_SaveList; case CallingConv::PreserveAll: if (HasAVX) return CSR_64_RT_AllRegs_AVX_SaveList; return CSR_64_RT_AllRegs_SaveList; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; if (HasAVX512 && Is64Bit) return CSR_64_Intel_OCL_BI_AVX512_SaveList; if (HasAVX && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX_SaveList; if (HasAVX && Is64Bit) return CSR_64_Intel_OCL_BI_AVX_SaveList; if (!HasAVX && !IsWin64 && Is64Bit) return CSR_64_Intel_OCL_BI_SaveList; break; } case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_SaveList; break; default: break; } bool CallsEHReturn = MF->getMMI().callsEHReturn(); if (Is64Bit) { if (IsWin64) return 
CSR_Win64_SaveList; if (CallsEHReturn) return CSR_64EHRet_SaveList; return CSR_64_SaveList; } if (CallsEHReturn) return CSR_32EHRet_SaveList; return CSR_32_SaveList; } const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); switch (CC) { case CallingConv::GHC: case CallingConv::HiPE: return CSR_NoRegs_RegMask; case CallingConv::AnyReg: if (HasAVX) return CSR_64_AllRegs_AVX_RegMask; return CSR_64_AllRegs_RegMask; case CallingConv::PreserveMost: return CSR_64_RT_MostRegs_RegMask; case CallingConv::PreserveAll: if (HasAVX) return CSR_64_RT_AllRegs_AVX_RegMask; return CSR_64_RT_AllRegs_RegMask; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; if (HasAVX512 && Is64Bit) return CSR_64_Intel_OCL_BI_AVX512_RegMask; if (HasAVX && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX_RegMask; if (HasAVX && Is64Bit) return CSR_64_Intel_OCL_BI_AVX_RegMask; if (!HasAVX && !IsWin64 && Is64Bit) return CSR_64_Intel_OCL_BI_RegMask; break; } case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_RegMask; break; default: break; } // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check // callsEHReturn(). if (Is64Bit) { if (IsWin64) return CSR_Win64_RegMask; return CSR_64_RegMask; } return CSR_32_RegMask; } const uint32_t* X86RegisterInfo::getNoPreservedMask() const { return CSR_NoRegs_RegMask; } BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); // Set the stack-pointer register and its aliases as reserved. for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); // Set the instruction pointer register and its aliases as reserved. 
for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); // Set the frame-pointer register and its aliases as reserved if needed. if (TFI->hasFP(MF)) { for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); } // Set the base-pointer register and its aliases as reserved if needed. if (hasBasePointer(MF)) { CallingConv::ID CC = MF.getFunction()->getCallingConv(); const uint32_t* RegMask = getCallPreservedMask(CC); if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister())) report_fatal_error( "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); for (MCSubRegIterator I(getBaseRegister(), this, /*IncludeSelf=*/true); I.isValid(); ++I) Reserved.set(*I); } // Mark the segment registers as reserved. Reserved.set(X86::CS); Reserved.set(X86::SS); Reserved.set(X86::DS); Reserved.set(X86::ES); Reserved.set(X86::FS); Reserved.set(X86::GS); // Mark the floating point stack registers as reserved. for (unsigned n = 0; n != 8; ++n) Reserved.set(X86::ST0 + n); // Reserve the registers that only exist in 64-bit mode. if (!Is64Bit) { // These 8-bit registers are part of the x86-64 extension even though their // super-registers are old 32-bits. Reserved.set(X86::SIL); Reserved.set(X86::DIL); Reserved.set(X86::BPL); Reserved.set(X86::SPL); for (unsigned n = 0; n != 8; ++n) { // R8, R9, ... for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); // XMM8, XMM9, ... 
for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); } } if (!Is64Bit || !Subtarget.hasAVX512()) { for (unsigned n = 16; n != 32; ++n) { for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); } } return Reserved; } //===----------------------------------------------------------------------===// // Stack Frame Processing methods //===----------------------------------------------------------------------===// bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); if (!EnableBasePointer) return false; // When we need stack realignment, we can't address the stack from the frame // pointer. When we have dynamic allocas or stack-adjusting inline asm, we // can't address variables from the stack pointer. MS inline asm can // reference locals while also adjusting the stack pointer. When we can't // use both the SP and the FP, we need a separate base pointer register. bool CantUseFP = needsStackRealignment(MF); bool CantUseSP = MFI->hasVarSizedObjects() || MFI->hasInlineAsmWithSPAdjust(); return CantUseFP && CantUseSP; } bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { if (MF.getFunction()->hasFnAttribute("no-realign-stack")) return false; const MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); // Stack realignment requires a frame pointer. If we already started // register allocation with frame pointer elimination, it is too late now. if (!MRI->canReserveReg(FramePtr)) return false; // If a base pointer is necessary. Check that it isn't too late to reserve // it. 
if (MFI->hasVarSizedObjects()) return MRI->canReserveReg(BasePtr); return true; } bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const { const MachineFrameInfo *MFI = MF.getFrameInfo(); const Function *F = MF.getFunction(); unsigned StackAlign = MF.getSubtarget().getFrameLowering()->getStackAlignment(); bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) || F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::StackAlignment)); // If we've requested that we force align the stack do so now. if (ForceStackAlign) return canRealignStack(MF); return requiresRealignment && canRealignStack(MF); } bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg, int &FrameIdx) const { // Since X86 defines assignCalleeSavedSpillSlots which always return true // this function neither used nor tested. llvm_unreachable("Unused function on X86. Otherwise need a test case."); } void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { - assert(SPAdj == 0 && "Unexpected"); - MachineInstr &MI = *II; MachineFunction &MF = *MI.getParent()->getParent(); const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); unsigned BasePtr; unsigned Opc = MI.getOpcode(); bool AfterFPPop = Opc == X86::TAILJMPm64 || Opc == X86::TAILJMPm; if (hasBasePointer(MF)) BasePtr = (FrameIndex < 0 ? FramePtr : getBaseRegister()); else if (needsStackRealignment(MF)) BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr); else if (AfterFPPop) BasePtr = StackPtr; else BasePtr = (TFI->hasFP(MF) ? FramePtr : StackPtr); // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. 
if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) BasePtr = getX86SubSuperRegister(BasePtr, MVT::i64, false); // This must be part of a four operand memory reference. Replace the // FrameIndex with base register with EBP. Add an offset to the offset. MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false); // Now add the frame object offset to the offset from EBP. int FIOffset; if (AfterFPPop) { // Tail call jmp happens after FP is popped. const MachineFrameInfo *MFI = MF.getFrameInfo(); FIOffset = MFI->getObjectOffset(FrameIndex) - TFI->getOffsetOfLocalArea(); } else FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex); + if (BasePtr == StackPtr) + FIOffset += SPAdj; + // The frame index format for stackmaps and patchpoints is different from the // X86 format. It only has a FI and an offset. if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) { assert(BasePtr == FramePtr && "Expected the FP as base register"); int64_t Offset = MI.getOperand(FIOperandNum + 1).getImm() + FIOffset; MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); return; } if (MI.getOperand(FIOperandNum+3).isImm()) { // Offset is a 32-bit integer. int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm()); int Offset = FIOffset + Imm; assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && "Requesting 64-bit offset in 32-bit immediate!"); MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); } else { // Offset is symbolic. This is extremely rare. uint64_t Offset = FIOffset + (uint64_t)MI.getOperand(FIOperandNum+3).getOffset(); MI.getOperand(FIOperandNum + 3).setOffset(Offset); } } unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); return TFI->hasFP(MF) ? 
FramePtr : StackPtr; } unsigned X86RegisterInfo::getPtrSizedFrameRegister( const MachineFunction &MF) const { unsigned FrameReg = getFrameRegister(MF); if (Subtarget.isTarget64BitILP32()) FrameReg = getX86SubSuperRegister(FrameReg, MVT::i32, false); return FrameReg; } namespace llvm { unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, bool High) { switch (VT) { default: llvm_unreachable("Unexpected VT"); case MVT::i8: if (High) { switch (Reg) { default: return getX86SubSuperRegister(Reg, MVT::i64); case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: return X86::SI; case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: return X86::DI; case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: return X86::BP; case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: return X86::SP; case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AH; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: return X86::DH; case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: return X86::CH; case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: return X86::BH; } } else { switch (Reg) { default: llvm_unreachable("Unexpected register"); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AL; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: return X86::DL; case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: return X86::CL; case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: return X86::BL; case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: return X86::SIL; case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: return X86::DIL; case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: return X86::BPL; case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: return X86::SPL; case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: return X86::R8B; case X86::R9B: 
case X86::R9W: case X86::R9D: case X86::R9: return X86::R9B; case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: return X86::R10B; case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: return X86::R11B; case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: return X86::R12B; case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: return X86::R13B; case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: return X86::R14B; case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: return X86::R15B; } } case MVT::i16: switch (Reg) { default: llvm_unreachable("Unexpected register"); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::AX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: return X86::DX; case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: return X86::CX; case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: return X86::BX; case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: return X86::SI; case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: return X86::DI; case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: return X86::BP; case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: return X86::SP; case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: return X86::R8W; case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: return X86::R9W; case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: return X86::R10W; case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: return X86::R11W; case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: return X86::R12W; case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: return X86::R13W; case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: return X86::R14W; case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: return X86::R15W; } case MVT::i32: switch (Reg) { default: llvm_unreachable("Unexpected 
register"); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::EAX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: return X86::EDX; case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: return X86::ECX; case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: return X86::EBX; case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: return X86::ESI; case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: return X86::EDI; case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: return X86::EBP; case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: return X86::ESP; case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: return X86::R8D; case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: return X86::R9D; case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: return X86::R10D; case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: return X86::R11D; case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: return X86::R12D; case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: return X86::R13D; case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: return X86::R14D; case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: return X86::R15D; } case MVT::i64: switch (Reg) { default: llvm_unreachable("Unexpected register"); case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: return X86::RAX; case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: return X86::RDX; case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: return X86::RCX; case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: return X86::RBX; case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: return X86::RSI; case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: return X86::RDI; case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: return X86::RBP; case X86::SPL: case 
X86::SP: case X86::ESP: case X86::RSP: return X86::RSP; case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: return X86::R8; case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: return X86::R9; case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: return X86::R10; case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: return X86::R11; case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: return X86::R12; case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: return X86::R13; case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: return X86::R14; case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: return X86::R15; } } } unsigned get512BitSuperRegister(unsigned Reg) { if (Reg >= X86::XMM0 && Reg <= X86::XMM31) return X86::ZMM0 + (Reg - X86::XMM0); if (Reg >= X86::YMM0 && Reg <= X86::YMM31) return X86::ZMM0 + (Reg - X86::YMM0); if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31) return Reg; llvm_unreachable("Unexpected SIMD register"); } } diff --git a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp index 5e6aa7d3dbf4..1fc6b20eab09 100644 --- a/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/contrib/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -1,205 +1,210 @@ //===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the X86 specific subclass of TargetMachine. 
// //===----------------------------------------------------------------------===// #include "X86TargetMachine.h" #include "X86.h" #include "X86TargetObjectFile.h" #include "llvm/CodeGen/Passes.h" #include "llvm/IR/Function.h" #include "llvm/PassManager.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; extern "C" void LLVMInitializeX86Target() { // Register the target. RegisterTargetMachine X(TheX86_32Target); RegisterTargetMachine Y(TheX86_64Target); } static std::unique_ptr createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) { if (TT.getArch() == Triple::x86_64) return make_unique(); return make_unique(); } if (TT.isOSLinux()) return make_unique(); if (TT.isOSBinFormatELF()) return make_unique(); if (TT.isKnownWindowsMSVCEnvironment()) return make_unique(); if (TT.isOSBinFormatCOFF()) return make_unique(); llvm_unreachable("unknown subtarget type"); } /// X86TargetMachine ctor - Create an X86 target. /// X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL), TLOF(createTLOF(Triple(getTargetTriple()))), Subtarget(TT, CPU, FS, *this, Options.StackAlignmentOverride) { // default to hard float ABI if (Options.FloatABIType == FloatABI::Default) this->Options.FloatABIType = FloatABI::Hard; // Windows stack unwinder gets confused when execution flow "falls through" // after a call to 'noreturn' function. // To prevent that, we emit a trap for 'unreachable' IR instructions. 
// (which on X86, happens to be the 'ud2' instruction) if (Subtarget.isTargetWin64()) this->Options.TrapUnreachable = true; initAsmInfo(); } X86TargetMachine::~X86TargetMachine() {} const X86Subtarget * X86TargetMachine::getSubtargetImpl(const Function &F) const { AttributeSet FnAttrs = F.getAttributes(); Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu"); Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features"); std::string CPU = !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString().str() : TargetCPU; std::string FS = !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString().str() : TargetFS; // FIXME: This is related to the code below to reset the target options, // we need to know whether or not the soft float flag is set on the // function before we can generate a subtarget. We also need to use // it as a key for the subtarget since that can be the only difference // between two functions. Attribute SFAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex, "use-soft-float"); bool SoftFloat = !SFAttr.hasAttribute(Attribute::None) ? SFAttr.getValueAsString() == "true" : Options.UseSoftFloat; auto &I = SubtargetMap[CPU + FS + (SoftFloat ? "use-soft-float=true" : "use-soft-float=false")]; if (!I) { // This needs to be done before we create a new subtarget since any // creation will depend on the TM and the code generation flags on the // function that reside in TargetOptions. 
resetTargetOptions(F); I = llvm::make_unique(TargetTriple, CPU, FS, *this, Options.StackAlignmentOverride); } return I.get(); } //===----------------------------------------------------------------------===// // Command line options for x86 //===----------------------------------------------------------------------===// static cl::opt UseVZeroUpper("x86-use-vzeroupper", cl::Hidden, cl::desc("Minimize AVX to SSE transition penalty"), cl::init(true)); //===----------------------------------------------------------------------===// // X86 Analysis Pass Setup //===----------------------------------------------------------------------===// void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) { // Add first the target-independent BasicTTI pass, then our X86 pass. This // allows the X86 pass to delegate to the target independent layer when // appropriate. PM.add(createBasicTargetTransformInfoPass(this)); PM.add(createX86TargetTransformInfoPass(this)); } //===----------------------------------------------------------------------===// // Pass Pipeline Configuration //===----------------------------------------------------------------------===// namespace { /// X86 Code Generator Pass Configuration Options. 
class X86PassConfig : public TargetPassConfig { public: X86PassConfig(X86TargetMachine *TM, PassManagerBase &PM) : TargetPassConfig(TM, PM) {} X86TargetMachine &getX86TargetMachine() const { return getTM(); } const X86Subtarget &getX86Subtarget() const { return *getX86TargetMachine().getSubtargetImpl(); } void addIRPasses() override; bool addInstSelector() override; bool addILPOpts() override; + void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreEmitPass() override; }; } // namespace TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) { return new X86PassConfig(this, PM); } void X86PassConfig::addIRPasses() { addPass(createAtomicExpandPass(&getX86TargetMachine())); TargetPassConfig::addIRPasses(); } bool X86PassConfig::addInstSelector() { // Install an instruction selector. addPass(createX86ISelDag(getX86TargetMachine(), getOptLevel())); // For ELF, cleanup any local-dynamic TLS accesses. if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None) addPass(createCleanupLocalDynamicTLSPass()); addPass(createX86GlobalBaseRegPass()); return false; } bool X86PassConfig::addILPOpts() { addPass(&EarlyIfConverterID); return true; } +void X86PassConfig::addPreRegAlloc() { + addPass(createX86CallFrameOptimization()); +} + void X86PassConfig::addPostRegAlloc() { addPass(createX86FloatingPointStackifierPass()); } void X86PassConfig::addPreEmitPass() { if (getOptLevel() != CodeGenOpt::None && getX86Subtarget().hasSSE2()) addPass(createExecutionDependencyFixPass(&X86::VR128RegClass)); if (UseVZeroUpper) addPass(createX86IssueVZeroUpperPass()); if (getOptLevel() != CodeGenOpt::None) { addPass(createX86PadShortFunctions()); addPass(createX86FixupLEAs()); } } diff --git a/lib/clang/libllvmx86codegen/Makefile b/lib/clang/libllvmx86codegen/Makefile index 58278521a7a2..b030b1fc2e98 100644 --- a/lib/clang/libllvmx86codegen/Makefile +++ b/lib/clang/libllvmx86codegen/Makefile @@ -1,35 +1,36 @@ # $FreeBSD$ .include LIB= 
llvmx86codegen SRCDIR= lib/Target/X86 SRCS= X86AsmPrinter.cpp \ + X86CallFrameOptimization.cpp \ X86FastISel.cpp \ X86FixupLEAs.cpp \ X86FloatingPoint.cpp \ X86FrameLowering.cpp \ X86ISelDAGToDAG.cpp \ X86ISelLowering.cpp \ X86InstrInfo.cpp \ X86MCInstLower.cpp \ X86MachineFunctionInfo.cpp \ X86PadShortFunction.cpp \ X86RegisterInfo.cpp \ X86SelectionDAGInfo.cpp \ X86Subtarget.cpp \ X86TargetMachine.cpp \ X86TargetObjectFile.cpp \ X86TargetTransformInfo.cpp \ X86VZeroUpper.cpp TGHDRS= Intrinsics \ X86GenCallingConv \ X86GenDAGISel \ X86GenFastISel \ X86GenInstrInfo \ X86GenRegisterInfo \ X86GenSubtargetInfo .include "../clang.lib.mk"